### Entrenamiento del modelo

Librerías datos y gráficos

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Preprocesado y modelado

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [30]:
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

In [4]:
pd.set_option("max_colwidth", 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")

In [45]:
'''Creo las X_train e y_train a partir de df_train.
   Creo las X_test e y_test a partir de df_test.
   Ambas previamente importado
   Por ultimo, creo las X e y con todo el train y test,
   para entrenar mimodelo final y generalizar mejor ante nuevos input.'''

df_train = pd.read_csv('data/df_train.csv')
df_test = pd.read_csv('data/df_test.csv')
df_num = pd.read_csv('data/df_num.csv')

X_train = df_train.drop(['price'],axis=1)
y_train = df_train['price']
X_test = df_test.drop(['price'],axis=1)
y_test = df_test['price']
X = df_num.drop(['price'],axis=1)
y = df_num['price']

#### Primero intento buscar el regresor mas adecuado para predecir la target

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [32]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("selectkbest", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])

KNR_params = {
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

RFR_params = {
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2,3,4],
    'regresor__max_depth': [1,2,3,4,5],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

SVR_param = {
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

GBR_param = {
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__max_depth': [1,2,3,4],
    'regresor__criterion':['friedman_mse', 'squared_error'],
    'regresor__learning_rate': [0.1,0.2,0.3,0.5,0.7,0.9],
}

search_space = [
    KNR_params,
    RFR_params,
    SVR_param,
    GBR_param
]

mejor_regresor = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 5)

mejor_regresor.fit(X_train, y_train)

In [33]:
print(mejor_regresor.best_estimator_)
print(mejor_regresor.best_score_)
print(mejor_regresor.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=5)),
                ('regresor',
                 GradientBoostingRegressor(criterion='squared_error',
                                           learning_rate=0.2,
                                           n_estimators=50))])
0.8017719635165387
{'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'squared_error', 'regresor__learning_rate': 0.2, 'regresor__max_depth': 3, 'regresor__n_estimators': 50}


Una vez encontrado, que el mejor regresor para mi proyecto es el GradientBoostingRegressor, según los mejores parámetros obtenidos, vuelvo a entrenarlo intentando afinar un poco mas los parámetros.

In [41]:
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ("selectkbest", SelectKBest(k=5)),
                       ('regresor', RandomForestRegressor())
])



GBR_param = {
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [10,20,30,40,50],
    'regresor__max_depth': [1,2,3,4],
    'regresor__criterion':['friedman_mse', 'squared_error'],
    'regresor__learning_rate': [0.1,0.2,0.3,0.5],
}


search_space = GBR_param
    


Modelo_GBR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10
                  )

Modelo_GBR.fit(X_train, y_train)

In [42]:
print(Modelo_GBR.best_estimator_)
print(Modelo_GBR.best_score_)
print(Modelo_GBR.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=5)),
                ('regresor', GradientBoostingRegressor(n_estimators=50))])
-188.8225301691541
{'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 3, 'regresor__n_estimators': 50}


In [43]:
prediction = Modelo_GBR.best_estimator_.predict(X_test)

In [44]:
print('MAE:', mean_absolute_error(y_test,prediction))

MAE: 176.88025726017085


Este es el mejor modelo posible. A continuacion, lo entreno con 'X' e 'y' completas para entrenar con todos los datos y generalizar mejor ante nuevos input.

In [46]:
Modelo_GBR.fit(X, y)

In [47]:
print(Modelo_GBR.best_estimator_)
print(Modelo_GBR.best_score_)
print(Modelo_GBR.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=5)),
                ('regresor',
                 GradientBoostingRegressor(criterion='squared_error',
                                           learning_rate=0.2, max_depth=4,
                                           n_estimators=30))])
-191.38252017401507
{'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'squared_error', 'regresor__learning_rate': 0.2, 'regresor__max_depth': 4, 'regresor__n_estimators': 30}


In [48]:
prediction = Modelo_GBR.best_estimator_.predict(X)

In [51]:
analisis_prediction = pd.DataFrame({'test_real': y, 'prediccion': prediction})

In [52]:
analisis_prediction['error'] = analisis_prediction['test_real'] - analisis_prediction['prediccion']
analisis_prediction.sort_values(by='error', ascending=False)

Unnamed: 0,test_real,prediccion,error
934,2500.0,1320.007817,1179.992183
919,2650.0,1540.330684,1109.669316
923,2700.0,1801.563848,898.436152
232,2900.0,2119.785658,780.214342
674,2400.0,1628.821234,771.178766
...,...,...,...
965,1900.0,2437.249422,-537.249422
97,1400.0,1941.547830,-541.547830
539,1200.0,1753.403394,-553.403394
346,900.0,1455.707245,-555.707245


Ejemplo de llamada al modelo

In [15]:
dicc_miraflores = {'size':150,'codigo_distrito':48,'parking':1,'codigo_tipo':5 ,'total_rooms':3}
dicc_centro = {'size':150,'codigo_distrito':69,'parking':1,'codigo_tipo':7 ,'total_rooms':6}

In [16]:
centro = pd.DataFrame(data=dicc_centro, index=[0])
miraflores = pd.DataFrame(data=dicc_miraflores, index=[0])

In [55]:
prediction_centro = Modelo_GBR.best_estimator_.predict(centro)
prediction_centro

array([1992.71154652])

In [56]:
prediction_miraflores = Modelo_GBR.best_estimator_.predict(miraflores)
prediction_miraflores

array([1534.73565188])

In [53]:
modelo_final_GBR = Modelo_GBR.best_estimator_

Guardar modelo

In [20]:
import pickle

In [54]:
'''para guardar el modelo entrenado'''
with open('modelos_entrenados/modelo_final_GBR.pkl', 'wb') as file:
    pickle.dump(modelo_final_GBR, file)