# Processamento individual - Random Forest

In [2]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import (
        mean_absolute_error,
        mean_squared_error,
        mean_squared_log_error,
        r2_score,
)
%matplotlib inline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the occupancy dataset (the file name suggests outliers were already
# cleaned upstream -- TODO confirm against the preprocessing notebook).
DATA_PATH = "../input/Occupancy_Estimation_cleaned_outliers.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y_true``.

    Implemented as ``sqrt(MSE)`` so it does not rely on the ``squared``
    keyword of ``mean_squared_error``, which newer scikit-learn releases
    deprecate and then remove.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Implemented as ``sqrt(MSLE)`` for the same reason as ``_rmse``.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


# Scorers reported by the cross-validation runs in this notebook.
# They are built with make_scorer's default sign, so error metrics are
# reported as positive numbers (lower is better) rather than negated.
METRICS = {
        "MAE": make_scorer(mean_absolute_error),
        "MSE": make_scorer(mean_squared_error),
        "RMSE": make_scorer(_rmse),
        "r2_Score": make_scorer(r2_score),
        "RMSLE": make_scorer(_rmsle),
}

# Features are every column except the target.
X, y = df.drop("Room_Occupancy_Count", axis=1), df["Room_Occupancy_Count"]

# 10-fold stratified splitter; shuffling with a fixed random_state keeps the
# folds (and therefore the results) identical across runs.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Baseline: shallow random forest evaluated on the unscaled features.
rf = RandomForestRegressor(max_depth=3, random_state=1234)
scores = cross_validate(rf, X, y, cv=splitter, scoring=METRICS)
rf_scores = pd.DataFrame(scores)
# Mean of every timing/metric column across the folds, shown as a single row.
pd.DataFrame(rf_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_r2_Score,test_RMSLE
0,0.353381,0.009281,0.050714,0.044551,0.209968,0.9372,0.089815


# Cenário 3 - Normalização - (0,1)

In [5]:
# Work on an independent copy so that scaling never touches the original frame.
df_norm = df.copy(deep=True)

In [6]:
# Maps each feature name to the MinMaxScaler fitted on that feature, so the
# transformation can later be reproduced or inverted column by column.
dict_scalers = {}

# Every column except the target gets scaled.
columns = list(df_norm.columns)
columns.remove('Room_Occupancy_Count')

for col in columns:
    # A fresh scaler per column: MinMaxScaler.fit() returns the estimator
    # itself, so reusing one instance would make every dictionary entry point
    # at the same object, holding only the last column's parameters.
    scaler_col = MinMaxScaler(feature_range=(0, 1))
    # Fit on the column and replace it with its scaled values.
    df_norm[[col]] = scaler_col.fit_transform(df_norm[[col]])
    # Keep the fitted scaler for this column.
    dict_scalers[col] = scaler_col

# NOTE(review): the scalers are fitted on the full dataset before the
# cross-validation below, so fold statistics are not isolated -- consider a
# Pipeline if this scaling is reused with a scale-sensitive model.
df_norm.head()

Unnamed: 0,S1_Temp,S1_Light,S1_Sound,S5_CO2,S6_PIR,Room_Occupancy_Count,day_of_month,week_day,month,year,hour,minute
0,0.0,0.733333,0.012942,0.0,0.0,1,0.75,0.666667,1.0,0.0,0.434783,0.830508
1,0.0,0.733333,0.219839,0.0,0.0,1,0.75,0.666667,1.0,0.0,0.434783,0.847458
2,0.045802,0.733333,0.085791,0.0,0.0,1,0.75,0.666667,1.0,0.0,0.434783,0.847458
3,0.045802,0.733333,0.080429,0.0,0.0,1,0.75,0.666667,1.0,0.0,0.434783,0.864407
4,0.045802,0.733333,0.018767,0.0,0.0,1,0.75,0.666667,1.0,0.0,0.434783,0.864407


In [7]:
# Scaled feature matrix (everything but the target) and the untouched target.
Xn = df_norm.drop(columns="Room_Occupancy_Count")
yn = df_norm["Room_Occupancy_Count"]

In [8]:
# 80/20 hold-out split of the scaled data.
# NOTE(review): none of the cells visible here use these four variables; the
# evaluation below cross-validates on the full Xn / yn instead.
X_train, X_test, y_train, y_test = train_test_split(
    Xn,
    yn,
    test_size=0.2,
    random_state=1234,
)

# Same scorers and same model settings as the baseline run.
scoring = METRICS
rf_norm = RandomForestRegressor(max_depth=3, random_state=1234)

# Cross-validate on the min-max scaled features using the shared splitter.
scores = cross_validate(
    estimator=rf_norm,
    X=Xn,
    y=yn,
    scoring=scoring,
    cv=splitter,
)

rf_scores = pd.DataFrame(scores)
# Fold-averaged timings and metrics as a single row.
rf_scores.mean().to_frame().T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_r2_Score,test_RMSLE
0,0.346922,0.009467,0.050714,0.044551,0.209968,0.9372,0.089815


## Feature Importance

O objetivo é comprovar que a nossa seleção estava correta.

Para isso, vamos focar-nos nas features de condições ambientais e verificar a sua importância.

In [9]:
# Drop the calendar/time columns listed below; `drop` returns a new frame, so
# the original `df` is left untouched.
df_final = df.drop(columns=['day_of_month', 'year', 'month', 'week_day', 'hour'])

Xf = df_final.drop(columns="Room_Occupancy_Count")
yf = df_final["Room_Occupancy_Count"]

# Same scorers and same model settings as the previous scenarios.
scoring = METRICS
rf_final = RandomForestRegressor(max_depth=3, random_state=1234)

# Cross-validate on the reduced (unscaled) feature set with the shared splitter.
scores = cross_validate(
    estimator=rf_final,
    X=Xf,
    y=yf,
    scoring=scoring,
    cv=splitter,
)

rf_scores = pd.DataFrame(scores)
# Fold-averaged timings and metrics as a single row.
rf_scores.mean().to_frame().T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_r2_Score,test_RMSLE
0,0.262521,0.009695,0.050766,0.044627,0.210119,0.937092,0.089853


In [10]:
# cross_validate fits clones of the estimator, so fit rf_final itself on the
# full reduced feature set before reading its importances.
rf_final.fit(Xf, yf)

# One importance value per feature, in the same order as the columns of Xf.
column_names = Xf.columns
importances = rf_final.feature_importances_

# Single-column table indexed by feature name.
feature_importances = pd.DataFrame(
    {'importance': importances},
    index=column_names,
)

# Show the ten most important features, highest first.
feature_importances.sort_values('importance', ascending=False).iloc[:10]

Unnamed: 0,importance
S1_Light,0.703806
S1_Sound,0.205596
S5_CO2,0.090598
S1_Temp,0.0
S6_PIR,0.0
minute,0.0


Assim sendo, observa-se que as features mais importantes foram as selecionadas.