In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [2]:
df_cnt = pd.read_pickle('df_cnt_prep.pkl')

In [3]:
df_cnt.head()

Unnamed: 0,month,year,weekday,holiday,weathersit,cnt,temp,hum,windspeed,workingday_0,workingday_1
0,5.69,4,13.82,3,3,985,-0.827613,1.252343,-0.387833,1,0
1,5.95,4,14.54,4,3,801,-0.722069,0.480996,0.748899,0,1
2,8.13,4,14.54,4,4,1349,-1.635432,-1.338073,0.745931,0,1
3,8.47,4,14.37,4,4,1562,-1.61556,-0.261577,-0.389769,1,0
4,9.54,4,13.98,4,4,1600,-1.468226,-1.340294,-0.046477,0,1


### DECISIÓN TREE

In [4]:
X = df_cnt.drop("cnt", axis = 1)
y = df_cnt["cnt"]

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 33)

In [6]:
arbol = DecisionTreeRegressor(random_state = 33)

arbol.fit(x_train, y_train)

In [7]:
max_features = np.sqrt(len(x_train.columns))
max_features

3.1622776601683795

In [8]:
print(arbol.tree_.max_depth)

25


In [9]:
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [12]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):  
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [11]:
dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree cnt I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,761.890411,1000460.0,1000.230097,0.732622,test,Decission Tree cnt I
1,0.0,0.0,0.0,1.0,train,Decission Tree cnt I


1. Ajustamos parámetros:

In [12]:
param = {"max_depth": [10, 11, 12, 13, 14],
        "max_features": [2,3,4],
        "min_samples_split": [ 10, 15, 20],
        "min_samples_leaf": [10, 15, 20]} 

In [13]:
gs = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error")

In [14]:
gs.fit(x_train, y_train)

In [15]:
mejor_cnt1 = gs.best_estimator_
mejor_cnt1

In [16]:
y_pred_test_dt2 = mejor_cnt1.predict(x_test)
y_pred_train_dt2 = mejor_cnt1.predict(x_train)

In [17]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision Tree cnt II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,912.57431,1342216.0,1158.540568,0.641285,test,Decision Tree cnt II
1,808.983467,1152518.0,1073.553784,0.692057,train,Decision Tree cnt II


Menos overfitting, pero aún malas métricas.

In [18]:
param2 = {"max_depth": [10, 11, 12],
        "max_features": [3, 4],
        "min_samples_split": [10, 15, 20, 25, 30],
        "min_samples_leaf": [10, 15, 20]} 

In [19]:
gs2 = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param2,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error")

In [20]:
gs2.fit(x_train, y_train)

In [21]:
mejor_cnt2 = gs2.best_estimator_
mejor_cnt2

In [22]:
y_pred_test_dt3 = mejor_cnt2.predict(x_test)
y_pred_train_dt3 = mejor_cnt2.predict(x_train)

In [23]:
dt_results3 = metricas(y_test, y_train, y_pred_test_dt3, y_pred_train_dt3, "Decision tree cnt III")
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,879.143369,1171074.0,1082.16173,0.687024,test,Decision tree cnt III
1,734.379695,873604.2,934.667941,0.76658,train,Decision tree cnt III


In [24]:
param3 = {"max_depth": [11, 12],
        "max_features": [3,4],
        "min_samples_split": [15, 20, 15],
        "min_samples_leaf": [10, 15, 20]} 

In [25]:
gs3 = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param3,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error")

In [26]:
gs3.fit(x_train, y_train)

In [27]:
mejor_cnt3 = gs3.best_estimator_
mejor_cnt3

In [28]:
y_pred_test_dt4 = mejor_cnt3.predict(x_test)
y_pred_train_dt4 = mejor_cnt3.predict(x_train)

In [29]:
dt_results4 = metricas(y_test, y_train, y_pred_test_dt4, y_pred_train_dt4, "Decision tree cnt IV")
dt_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,646.929311,663722.121069,814.691427,0.822617,test,Decision tree cnt IV
1,593.868468,646693.85148,804.172775,0.827209,train,Decision tree cnt IV


In [30]:
dt_cnt = pd.concat([dt_results1,dt_results2,dt_results3,dt_results4], axis=0)

In [31]:
dt_cnt

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,761.890411,1000460.0,1000.230097,0.732622,test,Decission Tree cnt I
1,0.0,0.0,0.0,1.0,train,Decission Tree cnt I
0,912.57431,1342216.0,1158.540568,0.641285,test,Decision Tree cnt II
1,808.983467,1152518.0,1073.553784,0.692057,train,Decision Tree cnt II
0,879.143369,1171074.0,1082.16173,0.687024,test,Decision tree cnt III
1,734.379695,873604.2,934.667941,0.76658,train,Decision tree cnt III
0,646.929311,663722.1,814.691427,0.822617,test,Decision tree cnt IV
1,593.868468,646693.9,804.172775,0.827209,train,Decision tree cnt IV


### RANDOM FOREST

In [47]:
param4 = {"max_depth": [10, 11, 12],
        "max_features": [3,4, 5],
        "min_samples_split": [10, 13, 16, 19],
        "min_samples_leaf": [8, 10, 12, 14]} 

In [48]:
gs_rf1 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param4, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [49]:
gs_rf1.fit(x_train, y_train)

In [50]:
bosque = gs_rf1.best_estimator_
bosque

In [51]:
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [52]:
rf_results1 = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest cnt I")
rf_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,599.472794,546563.111638,739.299068,0.853928,test,Random Forest cnt I
1,534.808785,496956.789879,704.951622,0.867217,train,Random Forest cnt I


In [53]:
param5 = {"max_depth": [10, 11, 12, 13],
        "max_features": [4,5, 6],
        "min_samples_split": [8, 9, 10, 11, 12],
        "min_samples_leaf": [6, 7, 8, 9, 10]} 

In [54]:
gs_rf2 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param5, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [55]:
gs_rf2.fit(x_train, y_train)

In [56]:
bosque2 = gs_rf2.best_estimator_
bosque2

In [57]:
y_pred_test_rf2 = bosque2.predict(x_test)
y_pred_train_rf2 = bosque2.predict(x_train)

In [58]:
rf_results2 = metricas(y_test, y_train, y_pred_test_rf2, y_pred_train_rf2, "Random Forest cnt II")
rf_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,580.577284,513772.207496,716.779051,0.862692,test,Random Forest cnt II
1,481.413071,418993.163801,647.296813,0.888048,train,Random Forest cnt II


In [59]:
param6 = {"max_depth": [9, 10, 11],
        "max_features": [4, 5, 6, 7],
        "min_samples_split": [8, 9, 10, 11],
        "min_samples_leaf": [6, 7, 8, 9]} 

In [61]:
gs_rf3 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param6, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [62]:
gs_rf3.fit(x_train, y_train)

In [63]:
bosque3 = gs_rf3.best_estimator_
bosque3

In [64]:
y_pred_test_rf3 = bosque3.predict(x_test)
y_pred_train_rf3 = bosque3.predict(x_train)

In [65]:
rf_results3 = metricas(y_test, y_train, y_pred_test_rf3, y_pred_train_rf3, "Random Forest cnt III")
rf_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,569.601797,500715.53972,707.612563,0.866181,test,Random Forest cnt III
1,479.244534,420213.739254,648.238952,0.887722,train,Random Forest cnt III


In [6]:
param7 = {"max_depth": [10, 11, 12],
        "max_features": [4, 5, 6],
        "min_samples_split": [10, 11, 12, 13],
        "min_samples_leaf": [4, 5, 6, 7]} 

In [7]:
gs_rf4 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param7, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [8]:
gs_rf4.fit(x_train, y_train)

In [9]:
bosque4 = gs_rf4.best_estimator_
bosque4

In [10]:
y_pred_test_rf4 = bosque4.predict(x_test)
y_pred_train_rf4 = bosque4.predict(x_train)

In [13]:
rf_results4 = metricas(y_test, y_train, y_pred_test_rf4, y_pred_train_rf4, "Random Forest cnt III")
rf_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,582.379506,519547.790704,720.796636,0.861148,test,Random Forest cnt III
1,470.301448,387083.240872,622.160141,0.896575,train,Random Forest cnt III


In [14]:
param8 = {"max_depth": [10, 11],
        "max_features": [5, 6],
        "min_samples_split": [8, 9, 10, 11, 12],
        "min_samples_leaf": [4, 5, 6, 7, 8, 9, 10]} 

In [15]:
gs_rf5 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param8, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [16]:
gs_rf5.fit(x_train, y_train)

In [17]:
bosque5 = gs_rf5.best_estimator_
bosque5

In [18]:
y_pred_test_rf5 = bosque5.predict(x_test)
y_pred_train_rf5 = bosque5.predict(x_train)

In [19]:
rf_results5 = metricas(y_test, y_train, y_pred_test_rf5, y_pred_train_rf5, "Random Forest cnt III")
rf_results5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,581.594738,515301.05736,717.844731,0.862283,test,Random Forest cnt III
1,462.426156,380042.735375,616.476062,0.898456,train,Random Forest cnt III


-----------------------------------------------------

In [22]:
rf_cnt = pd.concat([ rf_results4,rf_results5], axis = 0)

In [25]:
rf_cnt

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,582.379506,519547.790704,720.796636,0.861148,test,Random Forest cnt III
1,470.301448,387083.240872,622.160141,0.896575,train,Random Forest cnt III
0,581.594738,515301.05736,717.844731,0.862283,test,Random Forest cnt III
1,462.426156,380042.735375,616.476062,0.898456,train,Random Forest cnt III


In [24]:
rf_cnt.to_csv('result_RF_totales.csv')

In [26]:
import pickle

In [27]:
with open ("mejor_modelo.pkl", "wb") as f:
    pickle.dump(bosque5, f)