## Entrenamiento del modelo

### Importación de librerías

Datos y gráficos

In [113]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import yaml
import os

Preprocesado y modelado

In [114]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [115]:
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

### Warnings y display de la dataframe

In [116]:
pd.set_option("max_colwidth", 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")

### Importación de dataframes

In [117]:
directorio_actual = os.getcwd()

In [118]:
'''Creo las X_train e y_train a partir de df_train.
   Creo las X_test e y_test a partir de df_test.
   Ambas previamente importado
   Por ultimo, creo las X e y con todo el train y test,
   para entrenar mimodelo final y generalizar mejor ante nuevos input.'''

df_train = pd.read_csv(os.path.join(directorio_actual, '..', 'data', 'df_train.csv'))
df_test = pd.read_csv(os.path.join(directorio_actual, '..', 'data', 'df_test.csv'))
df_num = pd.read_csv(os.path.join(directorio_actual, '..', 'data', 'df_num.csv'))

X_train = df_train.drop(['price'],axis=1)
y_train = df_train['price']
X_test = df_test.drop(['price'],axis=1)
y_test = df_test['price']
X = df_num.drop(['price'],axis=1)
y = df_num['price']

### Busqueda de regresor

Primero busco el regresor mas adecuado para predecir mi target

In [49]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=6)),
                 ('regresor', RandomForestRegressor())])

DTR_param = {
    'classifier': [DecisionTreeRegressor()],
    'classifier__max_features': [1,2,3],
    'classifier__max_depth': [3,4,5],
    'classifier__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
}

KNR_params = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

RFR_params = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2,3,4],
    'regresor__max_depth': [1,2,3,4,5],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

SVR_param = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

GBR_param = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__max_depth': [1,2,3,4],
    'regresor__criterion':['friedman_mse', 'squared_error'],
    'regresor__learning_rate': [0.1,0.2,0.3,0.5,0.7,0.9],
}

search_space = [
    DTR_param,
    KNR_params,
    RFR_params,
    SVR_param,
    GBR_param
]

mejor_regresor = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 5,
                  n_jobs=1)

mejor_regresor.fit(X_train, y_train)

In [50]:
print(mejor_regresor.best_estimator_)
print(mejor_regresor.best_score_)
print(mejor_regresor.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regresor', GradientBoostingRegressor(n_estimators=200))])
0.8121303127803058
{'reduce_dim': PCA(), 'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 3, 'regresor__n_estimators': 200}


In [15]:
mejor_regresor.best_params_

{'reduce_dim': PCA(),
 'regresor': GradientBoostingRegressor(),
 'regresor__criterion': 'friedman_mse',
 'regresor__learning_rate': 0.1,
 'regresor__max_depth': 4,
 'regresor__n_estimators': 100}

### Entrenamiento de cada regresor

#### GradientBoostingRegressor

En este caso, como el mejor regresor ha sido el GradientBoosting, éste será entrenado con toda mi dataframe completa para predecir nuevos inputs.

Busqueda de mejores parámetros y entrenamiento del modelo

In [109]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



GBR_param = {
    'scaler': [StandardScaler(),None],
    'reduce_dim': [SelectKBest(k=[5,6]),PCA(n_components=5),PCA(n_components=6)],
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [200],
    'regresor__max_depth': [4],
    'regresor__criterion':['friedman_mse'],
    'regresor__learning_rate': [0.1],
}

search_space = [
 
    GBR_param
]

Modelo_GBR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_GBR.fit(X_train, y_train)

In [110]:
print(Modelo_GBR.best_estimator_)
print(Modelo_GBR.best_score_)
print(Modelo_GBR.best_params_)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reduce_dim', PCA(n_components=6)),
                ('regresor',
                 GradientBoostingRegressor(max_depth=4, n_estimators=200))])
-187.36137727046287
{'reduce_dim': PCA(n_components=6), 'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 4, 'regresor__n_estimators': 200, 'scaler': StandardScaler()}


Evaluación del modelo

In [111]:
prediction = Modelo_GBR.best_estimator_.predict(X_test)

In [112]:
print('MAE:', mean_absolute_error(y_test,prediction))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction)))


MAE: 163.93122522278736
MAPE: 0.14862642684096294
RMSE: 251.14244883223728


Exportación del modelo

In [56]:
modelo_GBR = Modelo_GBR.best_estimator_

In [57]:
directorio_actual

'c:\\Carpeta bootcamp\\proyecto_ML_alquiler\\notebooks'

In [58]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_GBR.pkl'), 'wb') as file:
    pickle.dump(modelo_GBR, file)

In [59]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_GBR.yaml'), 'w') as file:
    yaml.dump(Modelo_GBR.best_params_, file, default_flow_style=False)

Entrenamiento completo

In [61]:
modelo_final_GBR = Modelo_GBR.best_estimator_.fit(X, y)

Exportación del modelo definitivo

In [62]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_GBR.pkl'), 'wb') as file:
    pickle.dump(modelo_final_GBR, file)

#### DecisionTreeRegressor

Busqueda de mejores parámetros y entrenamiento del modelo

In [120]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



DTR_param = {
    'regresor': [DecisionTreeRegressor()],
    'regresor__max_features': [1,2,3],
    'regresor__max_depth': [3,4,5],
    'regresor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
}

search_space = [
 
    DTR_param
]

Modelo_DTR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_DTR.fit(X_train, y_train)

Evaluación del modelo

In [121]:
prediction_DTR = Modelo_DTR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_DTR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_DTR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_DTR)))

MAE: 225.24161073825502
MAPE: 0.20052422825561178
RMSE: 335.2385807753464


Exportación del modelo

In [122]:
modelo_final_DTR = Modelo_DTR.best_estimator_

In [123]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_DTR.pkl'), 'wb') as file:
    pickle.dump(modelo_final_DTR, file)

In [124]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_DTR.yaml'), 'w') as file:
    yaml.dump(Modelo_DTR.best_params_, file, default_flow_style=False)

#### SVR

Busqueda de mejores parámetros y entrenamiento del modelo

In [68]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



SVR_param = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

search_space = [
 
    logistic_params
]

Modelo_SVR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_SVR.fit(X_train, y_train)

Evaluación del modelo

In [69]:
prediction_SVR = Modelo_SVR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_SVR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_SVR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_SVR)))

MAE: 259.80536912751677
MAPE: 0.22090038893453012
RMSE: 433.7622288085255


Exportación del modelo

In [70]:
modelo_final_SVR = Modelo_SVR.best_estimator_

In [71]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_SVR.pkl'), 'wb') as file:
    pickle.dump(modelo_final_SVR, file)

In [72]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_SVR.yaml'), 'w') as file:
    yaml.dump(Modelo_SVR.best_params_, file, default_flow_style=False)

#### KNeighborsRegressor

Busqueda de mejores parámetros y entrenamiento del modelo

In [73]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])


KNR_params = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

search_space = [
 
    KNR_params
]

Modelo_KNR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_KNR.fit(X_train, y_train)

Evaluación del modelo

In [74]:
prediction_KNR = Modelo_KNR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_KNR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_KNR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_KNR)))

MAE: 191.52080536912752
MAPE: 0.17100743131978918
RMSE: 282.09470112478994


Exportación del modelo

In [75]:
modelo_final_KNR = Modelo_KNR.best_estimator_

In [76]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_KNR.pkl'), 'wb') as file:
    pickle.dump(modelo_final_KNR, file)

In [77]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_KNR.yaml'), 'w') as file:
    yaml.dump(Modelo_KNR.best_params_, file, default_flow_style=False)

#### RandomForestRegressor

Busqueda de mejores parámetros y entrenamiento del modelo

In [79]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=6)),
                 ('regresor', RandomForestRegressor())])


RFR_params = {
    'reduce_dim': [SelectKBest(k=[5,6]),PCA()],
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2],
    'regresor__max_depth': [3,4],
    'regresor__n_estimators': [50,100],
    'regresor__criterion': ['squared_error', 'absolute_error']
}

search_space = [
 
    RFR_params
]

Modelo_RFR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 5,
                  n_jobs=1)

Modelo_RFR.fit(X_train, y_train)

Evaluación del modelo

In [80]:
prediction_RFR = Modelo_RFR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_RFR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_RFR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_RFR)))

MAE: 203.00422818791947
MAPE: 0.19281889895706295
RMSE: 303.86388670967256


Exportación del modelo

In [82]:
modelo_final_RFR = Modelo_RFR.best_estimator_

In [83]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_RFR.pkl'), 'wb') as file:
    pickle.dump(modelo_final_RFR, file)

In [84]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_RFR.yaml'), 'w') as file:
    yaml.dump(Modelo_RFR.best_params_, file, default_flow_style=False)

### Análisis de predicciones

In [85]:
prediction = Modelo_GBR.best_estimator_.predict(X)

In [86]:
analisis_prediction = pd.DataFrame({'test_real': y, 'prediccion': prediction})

In [87]:
analisis_prediction['error'] = analisis_prediction['test_real'] - analisis_prediction['prediccion']
analisis_prediction.sort_values(by='error', ascending=False)

Unnamed: 0,test_real,prediccion,error
919,2650.0,1685.049607,964.950393
958,1600.0,1005.288756,594.711244
923,2700.0,2255.046285,444.953715
912,2000.0,1601.535519,398.464481
947,1500.0,1102.279371,397.720629
...,...,...,...
836,700.0,962.107512,-262.107512
346,900.0,1168.880771,-268.880771
500,900.0,1172.229946,-272.229946
775,950.0,1236.683515,-286.683515
