## Entrenamiento del modelo

### Importación de librerías

Datos y gráficos

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import yaml
import os

Preprocesado y modelado

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [17]:
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

### Warnings y display de la dataframe

In [18]:
pd.set_option("max_colwidth", 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")

### Importación de dataframes

In [19]:
directorio_actual = os.getcwd()

In [20]:
'''Creo las X_train e y_train a partir de df_train.
   Creo las X_test e y_test a partir de df_test.
   Ambas previamente importado
   Por ultimo, creo las X e y con todo el train y test,
   para entrenar mimodelo final y generalizar mejor ante nuevos input.'''

df_train_venta = pd.read_csv(os.path.join(directorio_actual, '..', 'data','train', 'df_train_venta.csv'))
df_test_venta = pd.read_csv(os.path.join(directorio_actual, '..', 'data','test', 'df_test_venta.csv'))
df_num_venta = pd.read_csv(os.path.join(directorio_actual, '..', 'data','processed', 'df_num_venta.csv'))

X_train_venta = df_train_venta.drop(['price'],axis=1)
y_train_venta = df_train_venta['price']
X_test_venta = df_test_venta.drop(['price'],axis=1)
y_test_venta = df_test_venta['price']
X_venta = df_num_venta.drop(['price'],axis=1)
y_venta = df_num_venta['price']

### Busqueda de regresor

Primero busco el regresor mas adecuado para predecir mi target

In [21]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=6)),
                 ('regresor', RandomForestRegressor())])

DTR_param = {
    'classifier': [DecisionTreeRegressor()],
    'classifier__max_features': [1,2,3],
    'classifier__max_depth': [3,4,5],
    'classifier__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
}

KNR_params = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

RFR_params = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2,3,4],
    'regresor__max_depth': [1,2,3,4,5],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

SVR_param = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

GBR_param = {
    'reduce_dim': [SelectKBest(k=[4,5,6]),PCA()],
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__max_depth': [1,2,3,4],
    'regresor__criterion':['friedman_mse', 'squared_error'],
    'regresor__learning_rate': [0.1,0.2,0.3,0.5,0.7,0.9],
}

search_space = [
    DTR_param,
    KNR_params,
    RFR_params,
    SVR_param,
    GBR_param
]

mejor_regresor = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 5,
                  n_jobs=1)

mejor_regresor.fit(X_train_venta, y_train_venta)

'pipe = Pipeline(steps=\n                [("scaler", StandardScaler()),\n                 ("reduce_dim", SelectKBest(k=6)),\n                 (\'regresor\', RandomForestRegressor())])\n\nDTR_param = {\n    \'classifier\': [DecisionTreeRegressor()],\n    \'classifier__max_features\': [1,2,3],\n    \'classifier__max_depth\': [3,4,5],\n    \'classifier__criterion\': [\'squared_error\', \'friedman_mse\', \'absolute_error\', \'poisson\']\n}\n\nKNR_params = {\n    \'reduce_dim\': [SelectKBest(k=[4,5,6]),PCA()],\n    \'regresor\': [KNeighborsRegressor()],\n    \'regresor__n_neighbors\': [5,10,15,20,30,50,60,75,100,150]\n}\n\nRFR_params = {\n    \'reduce_dim\': [SelectKBest(k=[4,5,6]),PCA()],\n    \'regresor\': [RandomForestRegressor()],\n    \'regresor__max_features\': [1,2,3,4],\n    \'regresor__max_depth\': [1,2,3,4,5],\n    \'regresor__n_estimators\': [50,100,150,200,250],\n    \'regresor__criterion\': [\'squared_error\', \'absolute_error\', \'friedman_mse\', \'poisson\']\n}\n\nSVR_param =

In [22]:
# print(mejor_regresor.best_estimator_)
# print(mejor_regresor.best_score_)
# print(mejor_regresor.best_params_)

In [23]:
#mejor_regresor.best_params_

### Entrenamiento de cada regresor

#### GradientBoostingRegressor

En este caso, como el mejor regresor ha sido el GradientBoosting, éste será entrenado con toda mi dataframe completa para predecir nuevos inputs.

Busqueda de mejores parámetros y entrenamiento del modelo

In [24]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



GBR_param = {
    'scaler': [StandardScaler(),None],
    'reduce_dim': [SelectKBest(k=[5,6]),PCA(n_components=5),PCA(n_components=6)],
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [200],
    'regresor__max_depth': [4],
    'regresor__criterion':['friedman_mse'],
    'regresor__learning_rate': [0.1],
}

search_space = [
 
    GBR_param
]

Modelo_GBR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_GBR.fit(X_train_venta, y_train_venta)

In [25]:
print(Modelo_GBR.best_estimator_)
print(Modelo_GBR.best_score_)
print(Modelo_GBR.best_params_)

Pipeline(steps=[('scaler', None), ('reduce_dim', PCA(n_components=6)),
                ('regresor',
                 GradientBoostingRegressor(max_depth=4, n_estimators=200))])
-60294.7655617923
{'reduce_dim': PCA(n_components=6), 'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 4, 'regresor__n_estimators': 200, 'scaler': None}


Evaluación del modelo

In [26]:
prediction = Modelo_GBR.best_estimator_.predict(X_test_venta)

In [27]:
print('MAE:', mean_absolute_error(y_test_venta,prediction))
print('MAPE:', mean_absolute_percentage_error(y_test_venta,prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test_venta,prediction)))


MAE: 52323.585900074104
MAPE: 0.21782115597814725
RMSE: 112136.03486292533


Exportación del modelo

In [28]:
modelo_GBR = Modelo_GBR.best_estimator_

In [29]:
directorio_actual

'c:\\Carpeta bootcamp\\proyecto_ML_alquiler\\notebooks'

In [30]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_GBR_venta.pkl'), 'wb') as file:
    pickle.dump(modelo_GBR, file)

In [31]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_GBR_venta.yaml'), 'w') as file:
    yaml.dump(Modelo_GBR.best_params_, file, default_flow_style=False)

Entrenamiento completo

In [32]:
modelo_final_GBR = Modelo_GBR.best_estimator_.fit(X_venta, y_venta)

Exportación del modelo definitivo

In [33]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_GBR_venta.pkl'), 'wb') as file:
    pickle.dump(modelo_final_GBR, file)

#### DecisionTreeRegressor

Busqueda de mejores parámetros y entrenamiento del modelo

In [34]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



DTR_param = {
    'regresor': [DecisionTreeRegressor()],
    'regresor__max_features': [1,2,3],
    'regresor__max_depth': [3,4,5],
    'regresor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
}

search_space = [
 
    DTR_param
]

Modelo_DTR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_DTR.fit(X_train_venta, y_train_venta)

Evaluación del modelo

In [35]:
prediction_DTR = Modelo_DTR.best_estimator_.predict(X_test_venta)
print('MAE:', mean_absolute_error(y_test_venta,prediction_DTR))
print('MAPE:', mean_absolute_percentage_error(y_test_venta,prediction_DTR))
print('RMSE:', np.sqrt(mean_squared_error(y_test_venta,prediction_DTR)))

MAE: 72566.66431914599
MAPE: 0.2859288192249537
RMSE: 171864.57534737847


Exportación del modelo

In [36]:
modelo_final_DTR = Modelo_DTR.best_estimator_

In [37]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_DTR_venta.pkl'), 'wb') as file:
    pickle.dump(modelo_final_DTR, file)

In [38]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_DTR_venta.yaml'), 'w') as file:
    yaml.dump(Modelo_DTR.best_params_, file, default_flow_style=False)

#### SVR

Busqueda de mejores parámetros y entrenamiento del modelo

In [39]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



SVR_param = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

search_space = [
 
    SVR_param
]

Modelo_SVR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_SVR.fit(X_train_venta, y_train_venta)

Evaluación del modelo

In [40]:
prediction_SVR = Modelo_SVR.best_estimator_.predict(X_test_venta)
print('MAE:', mean_absolute_error(y_test_venta,prediction_SVR))
print('MAPE:', mean_absolute_percentage_error(y_test_venta,prediction_SVR))
print('RMSE:', np.sqrt(mean_squared_error(y_test_venta,prediction_SVR)))

MAE: 82393.53935809557
MAPE: 0.31604771322795877
RMSE: 196879.17622953467


Exportación del modelo

In [41]:
modelo_final_SVR = Modelo_SVR.best_estimator_

In [42]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_SVR_venta.pkl'), 'wb') as file:
    pickle.dump(modelo_final_SVR, file)

In [43]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_SVR_venta.yaml'), 'w') as file:
    yaml.dump(Modelo_SVR.best_params_, file, default_flow_style=False)

#### KNeighborsRegressor

Busqueda de mejores parámetros y entrenamiento del modelo

In [44]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])


KNR_params = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

search_space = [
 
    KNR_params
]

Modelo_KNR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_KNR.fit(X_train_venta, y_train_venta)

Evaluación del modelo

In [45]:
prediction_KNR = Modelo_KNR.best_estimator_.predict(X_test_venta)
print('MAE:', mean_absolute_error(y_test_venta,prediction_KNR))
print('MAPE:', mean_absolute_percentage_error(y_test_venta,prediction_KNR))
print('RMSE:', np.sqrt(mean_squared_error(y_test_venta,prediction_KNR)))

MAE: 61442.86339285714
MAPE: 0.23737646069520055
RMSE: 136764.5937079183


Exportación del modelo

In [46]:
modelo_final_KNR = Modelo_KNR.best_estimator_

In [47]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_KNR_venta.pkl'), 'wb') as file:
    pickle.dump(modelo_final_KNR, file)

In [48]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_KNR_venta.yaml'), 'w') as file:
    yaml.dump(Modelo_KNR.best_params_, file, default_flow_style=False)

#### RandomForestRegressor

Busqueda de mejores parámetros y entrenamiento del modelo

In [49]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=6)),
                 ('regresor', RandomForestRegressor())])


RFR_params = {
    'reduce_dim': [SelectKBest(k=[5,6]),PCA()],
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2],
    'regresor__max_depth': [3,4],
    'regresor__n_estimators': [50,100],
    'regresor__criterion': ['squared_error', 'absolute_error']
}

search_space = [
 
    RFR_params
]

Modelo_RFR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 5,
                  n_jobs=1)

Modelo_RFR.fit(X_train_venta, y_train_venta)

Evaluación del modelo

In [50]:
prediction_RFR = Modelo_RFR.best_estimator_.predict(X_test_venta)
print('MAE:', mean_absolute_error(y_test_venta,prediction_RFR))
print('MAPE:', mean_absolute_percentage_error(y_test_venta,prediction_RFR))
print('RMSE:', np.sqrt(mean_squared_error(y_test_venta,prediction_RFR)))

MAE: 71844.23154017857
MAPE: 0.29537842493304745
RMSE: 162976.84629534456


Exportación del modelo

In [51]:
modelo_final_RFR = Modelo_RFR.best_estimator_

In [52]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_RFR_venta.pkl'), 'wb') as file:
    pickle.dump(modelo_final_RFR, file)

In [53]:
with open(os.path.join(directorio_actual, '..', 'models','modelo_final_RFR_venta.yaml'), 'w') as file:
    yaml.dump(Modelo_RFR.best_params_, file, default_flow_style=False)