In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


#  Modelado y evaluación
# -----------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pickle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_pickle("../data/bikes_registrados_estand_encod.pkl")
df.head()

Unnamed: 0,año,mes,festivo,dia,meteo,temperatura,humedad,velo_viento,usu_registrados
0,0,0,0,0,1,-0.486274,0.858854,-0.208909,654
1,0,0,1,0,1,-0.42569,0.331548,0.68755,670
2,0,0,1,1,2,-0.94997,-0.912,0.68521,1229
3,0,0,1,1,2,-0.938563,-0.176088,-0.210435,1454
4,0,0,1,1,2,-0.853992,-0.913519,0.060294,1518


In [3]:
X = df.drop("usu_registrados", axis = 1)
y = df["usu_registrados"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 15)

arbol = DecisionTreeRegressor(random_state =15)

arbol.fit(x_train, y_train)

In [4]:
with open ("../data/dt_registrados.pkl", "wb") as f:
    pickle.dump(arbol, f)

In [5]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [6]:
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [7]:
max_features = np.sqrt(len(x_train.columns))
max_features

2.8284271247461903

In [8]:
print(arbol.tree_.max_depth)

20


In [9]:
param = {"max_depth": [2, 3, 4],
         "max_features": [2, 3, 4],
         "min_samples_split": [5, 7],
         "min_samples_leaf": [5, 7]} 

In [10]:
gs = GridSearchCV(estimator=DecisionTreeRegressor(), 
                  param_grid= param,
                  cv=10,
                  verbose=0,
                  return_train_score = True,
                  scoring="neg_mean_squared_error")

In [11]:
gs.fit(x_train, y_train)

In [12]:
with open ("../data/dt_registrados_mejor_modelo.pkl", "wb") as f:
    pickle.dump(gs, f)

In [13]:
mejor_modelo = gs.best_estimator_
mejor_modelo

In [14]:
dt_results = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "DT_registrados_estad_encode")
dt_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,588.226027,756354.746575,869.68658,0.718272,test,DT_registrados_estad_encode
1,0.0,0.0,0.0,1.0,train,DT_registrados_estad_encode


___
Vamos a probar el modelo sin estandarizar:

Hemos probado a mejorar las métricas probando con los datos sin estandarizar, incluyendo temperatura y sensación térmica e incluyendo solo temperatura, pero los datos siguen overfitteados.

In [33]:
df2 = pd.read_pickle("../data/bikes_registrados_encod_sin_estand.pkl")
df2.head()

Unnamed: 0,año,mes,festivo,dia,meteo,temperatura,sensacion_termica,humedad,velo_viento,usu_registrados
0,0,0,0,0,1,14.110847,18.18125,80.5833,10.749882,654
1,0,0,1,0,1,14.902598,17.68695,69.6087,16.652113,670
2,0,0,1,1,2,8.050924,9.47025,43.7273,16.636703,1229
3,0,0,1,1,2,8.2,10.6061,59.0435,10.739832,1454
4,0,0,1,1,2,9.305237,11.4635,43.6957,12.5223,1518


In [41]:
df2.drop("sensacion_termica", axis= 1, inplace=True)

In [42]:
X2 = df2.drop("usu_registrados", axis = 1)
y2 = df2["usu_registrados"]

x2_train, x2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.2, random_state = 15)

arbol2 = DecisionTreeRegressor(random_state =15)

arbol2.fit(x2_train, y2_train)

In [43]:
y2_pred_test_dt = arbol2.predict(x2_test)
y2_pred_train_dt = arbol2.predict(x2_train)

In [44]:
param = {"max_depth": [2, 3, 4],
         "max_features": [2, 3, 4],
         "min_samples_split": [5, 7],
         "min_samples_leaf": [5, 7]}

In [45]:
gs2 = GridSearchCV(estimator=DecisionTreeRegressor(), 
                  param_grid= param,
                  cv=10,
                  verbose=0,
                  return_train_score = True,
                  scoring="neg_mean_squared_error")

In [46]:
gs2.fit(x2_train, y2_train)

In [47]:
mejor_modelo2 = gs2.best_estimator_
mejor_modelo2

In [48]:
dt_results2 = metricas(y2_test, y2_train, y2_pred_test_dt, y2_pred_train_dt, "DT_registrados_encode_sin_estand")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,588.226027,756354.746575,869.68658,0.718272,test,DT_registrados_encode_sin_estand
1,0.0,0.0,0.0,1.0,train,DT_registrados_encode_sin_estand


___
Vamos a probar con random forest sobre usuarios registrados y estandarizados:

In [57]:
df = pd.read_pickle("../data/bikes_registrados_estand_encod_completo.pkl")
df.head()

Unnamed: 0,estacion,año,mes,festivo,dia,meteo,temperatura,humedad,velo_viento,usu_registrados,laborable_laborable,laborable_no laborable
0,0,0,0,0,0,1,-0.486274,0.858854,-0.208909,654,0,1
1,0,0,0,1,0,1,-0.42569,0.331548,0.68755,670,1,0
2,0,0,0,1,1,2,-0.94997,-0.912,0.68521,1229,1,0
3,0,0,0,1,1,2,-0.938563,-0.176088,-0.210435,1454,1,0
4,0,0,0,1,1,2,-0.853992,-0.913519,0.060294,1518,1,0


In [58]:
X = df.drop("usu_registrados", axis = 1)
y = df["usu_registrados"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 15)

In [59]:
param = {"max_depth": [2, 3, 4],
         "max_features": [2, 3, 4],
         "min_samples_split": [5, 7],
         "min_samples_leaf": [5, 7]} 

In [60]:
gs_rf = GridSearchCV(estimator=RandomForestRegressor(), 
                     param_grid= param, #
                     cv=10, 
                     verbose=0, 
                     return_train_score = True, 
                     scoring="neg_mean_squared_error")

In [61]:
gs_rf.fit(x_train, y_train)

In [62]:
bosque = gs_rf.best_estimator_
bosque

In [63]:
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [65]:
dt_results3 = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "RF_registrados_estad_encode")
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,631.813089,706335.757089,840.437837,0.736903,test,RF_registrados_estad_encode
1,531.947463,461484.908296,679.326805,0.804323,train,RF_registrados_estad_encode
