In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [3]:
df_cnt = pd.read_pickle('df_cnt_prep.pkl')

In [4]:
df_cnt.head()

Unnamed: 0,month,year,weekday,holiday,weathersit,cnt,temp,hum,windspeed,workingday_0,workingday_1
0,5.69,4,13.82,3,3,985,-0.827613,1.252343,-0.387833,1,0
1,5.95,4,14.54,4,3,801,-0.722069,0.480996,0.748899,0,1
2,8.13,4,14.54,4,4,1349,-1.635432,-1.338073,0.745931,0,1
3,8.47,4,14.37,4,4,1562,-1.61556,-0.261577,-0.389769,1,0
4,9.54,4,13.98,4,4,1600,-1.468226,-1.340294,-0.046477,0,1


### DECISIÓN TREE

In [5]:
X = df_cnt.drop("cnt", axis = 1)
y = df_cnt["cnt"]

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [8]:
arbol = DecisionTreeRegressor(random_state = 33)

arbol.fit(x_train, y_train)

In [9]:
max_features = np.sqrt(len(x_train.columns))
max_features

3.1622776601683795

In [10]:
print(arbol.tree_.max_depth)

21


In [11]:
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [12]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):  
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [13]:
dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree cnt I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,707.075342,999367.691781,999.683796,0.710931,test,Decission Tree cnt I
1,0.0,0.0,0.0,1.0,train,Decission Tree cnt I


1. Ajustamos parámetros:

In [50]:
param = {"max_depth": [10, 11, 12, 13, 14],
        "max_features": [2,3,4],
        "min_samples_split": [ 10, 15, 20],
        "min_samples_leaf": [10, 15, 20]} 

In [51]:
gs = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error")

In [52]:
gs.fit(x_train, y_train)

In [53]:
mejor_cnt1 = gs.best_estimator_
mejor_cnt1

In [54]:
y_pred_test_dt2 = mejor_cnt1.predict(x_test)
y_pred_train_dt2 = mejor_cnt1.predict(x_train)

In [55]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision Tree cnt II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,784.67071,1055101.0,1027.181181,0.69481,test,Decision Tree cnt II
1,676.617725,827314.4,909.568267,0.78304,train,Decision Tree cnt II


Menos overfitting, pero aún malas métricas.

In [62]:
param2 = {"max_depth": [10, 11, 12],
        "max_features": [3, 4],
        "min_samples_split": [10, 15, 20, 25, 30],
        "min_samples_leaf": [10, 15, 20]} 

In [63]:
gs2 = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param2,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error")

In [64]:
gs2.fit(x_train, y_train)

In [65]:
mejor_cnt2 = gs2.best_estimator_
mejor_cnt2

In [66]:
y_pred_test_dt3 = mejor_cnt2.predict(x_test)
y_pred_train_dt3 = mejor_cnt2.predict(x_train)

In [67]:
dt_results3 = metricas(y_test, y_train, y_pred_test_dt3, y_pred_train_dt3, "Decision tree cnt III")
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,754.941806,970661.433693,985.221515,0.719234,test,Decision tree cnt III
1,581.109847,616534.131659,785.196874,0.838317,train,Decision tree cnt III


In [68]:
param3 = {"max_depth": [11, 12],
        "max_features": [3,4],
        "min_samples_split": [15, 20, 15],
        "min_samples_leaf": [10, 15, 20]} 

In [69]:
gs3 = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param3,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error")

In [70]:
gs3.fit(x_train, y_train)

In [71]:
mejor_cnt3 = gs3.best_estimator_
mejor_cnt3

In [72]:
y_pred_test_dt4 = mejor_cnt3.predict(x_test)
y_pred_train_dt4 = mejor_cnt3.predict(x_train)

In [73]:
dt_results4 = metricas(y_test, y_train, y_pred_test_dt4, y_pred_train_dt4, "Decision tree cnt IV")
dt_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,825.888987,1247907.0,1117.097536,0.63904,test,Decision tree cnt IV
1,687.974746,880910.9,938.568563,0.768985,train,Decision tree cnt IV


In [98]:
dt_cnt = pd.concat([dt_results1,dt_results2,dt_results3,dt_results4], axis=0)

In [99]:
dt_cnt

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,707.075342,999367.7,999.683796,0.710931,test,Decission Tree cnt I
1,0.0,0.0,0.0,1.0,train,Decission Tree cnt I
0,784.67071,1055101.0,1027.181181,0.69481,test,Decision Tree cnt II
1,676.617725,827314.4,909.568267,0.78304,train,Decision Tree cnt II
0,754.941806,970661.4,985.221515,0.719234,test,Decision tree cnt III
1,581.109847,616534.1,785.196874,0.838317,train,Decision tree cnt III
0,825.888987,1247907.0,1117.097536,0.63904,test,Decision tree cnt IV
1,687.974746,880910.9,938.568563,0.768985,train,Decision tree cnt IV


### RANDOM FOREST

In [76]:
param4 = {"max_depth": [10, 11, 12],
        "max_features": [3,4],
        "min_samples_split": [15, 20, 25],
        "min_samples_leaf": [10, 15, 20]} 

In [77]:
gs_rf1 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param4, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [78]:
gs_rf1.fit(x_train, y_train)

In [79]:
bosque = gs_rf1.best_estimator_
bosque

In [80]:
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [81]:
rf_results1 = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest cnt I")
rf_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,645.709196,704344.855958,839.252558,0.796267,test,Random Forest cnt I
1,586.563458,559854.432543,748.23421,0.853181,train,Random Forest cnt I


In [82]:
param5 = {"max_depth": [11, 12, 13],
        "max_features": [3,4,5],
        "min_samples_split": [10, 15, 20],
        "min_samples_leaf": [10, 12, 14, 16]} 

In [83]:
gs_rf2 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param5, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [84]:
gs_rf2.fit(x_train, y_train)

In [85]:
bosque2 = gs_rf2.best_estimator_
bosque2

In [86]:
y_pred_test_rf2 = bosque2.predict(x_test)
y_pred_train_rf2 = bosque2.predict(x_train)

In [87]:
rf_results2 = metricas(y_test, y_train, y_pred_test_rf2, y_pred_train_rf2, "Random Forest cnt II")
rf_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,605.338789,621238.10102,788.18659,0.820305,test,Random Forest cnt II
1,559.002969,531379.430329,728.95777,0.860648,train,Random Forest cnt II


In [88]:
param6 = {"max_depth": [11, 12, 13],
        "max_features": [3,4,5,6],
        "min_samples_split": [6, 8, 10],
        "min_samples_leaf": [6, 8, 10]} 

In [89]:
gs_rf3 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param6, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [90]:
gs_rf3.fit(x_train, y_train)

In [91]:
bosque3 = gs_rf3.best_estimator_
bosque3

In [92]:
y_pred_test_rf3 = bosque3.predict(x_test)
y_pred_train_rf3 = bosque3.predict(x_train)

In [93]:
rf_results3 = metricas(y_test, y_train, y_pred_test_rf3, y_pred_train_rf3, "Random Forest cnt III")
rf_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,569.547228,564993.77474,751.660678,0.836574,test,Random Forest cnt III
1,485.438895,411812.929389,641.726522,0.892004,train,Random Forest cnt III


In [94]:
param7 = {"max_depth": [11, 12],
        "max_features": [4,5, 6],
        "min_samples_split": [10, 12, 15],
        "min_samples_leaf": [6, 8, 10]} 

In [95]:
gs_rf4 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param7, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [96]:
gs_rf4.fit(x_train, y_train)

In [100]:
bosque4 = gs_rf4.best_estimator_
bosque4

In [101]:
y_pred_test_rf4 = bosque4.predict(x_test)
y_pred_train_rf4 = bosque4.predict(x_train)

In [102]:
rf_results4 = metricas(y_test, y_train, y_pred_test_rf4, y_pred_train_rf4, "Random Forest cnt III")
rf_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,585.126489,595022.627025,771.377098,0.827888,test,Random Forest cnt III
1,489.678334,417235.758409,645.937891,0.890582,train,Random Forest cnt III


In [103]:
param8 = {"max_depth": [10, 11, 12],
        "max_features": [4, 5, 6],
        "min_samples_split": [6, 7, 8, 9, 10],
        "min_samples_leaf": [6, 7, 8, 9, 10]} 

In [104]:
gs_rf5 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param8, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [105]:
gs_rf5.fit(x_train, y_train)

In [106]:
bosque5 = gs_rf5.best_estimator_
bosque5

In [107]:
y_pred_test_rf5 = bosque5.predict(x_test)
y_pred_train_rf5 = bosque5.predict(x_train)

In [108]:
rf_results5 = metricas(y_test, y_train, y_pred_test_rf5, y_pred_train_rf5, "Random Forest cnt III")
rf_results5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,578.824043,580745.645184,762.066693,0.832018,test,Random Forest cnt III
1,474.762978,407068.915878,638.019526,0.893248,train,Random Forest cnt III


-----------------------------------------------------

In [109]:
rf_cnt = pd.concat([rf_results1, rf_results2,rf_results3], axis = 0)

In [110]:
rf_cnt

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,645.709196,704344.855958,839.252558,0.796267,test,Random Forest cnt I
1,586.563458,559854.432543,748.23421,0.853181,train,Random Forest cnt I
0,605.338789,621238.10102,788.18659,0.820305,test,Random Forest cnt II
1,559.002969,531379.430329,728.95777,0.860648,train,Random Forest cnt II
0,569.547228,564993.77474,751.660678,0.836574,test,Random Forest cnt III
1,485.438895,411812.929389,641.726522,0.892004,train,Random Forest cnt III
