### Entrenamiento del modelo

Librerías datos y gráficos

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

Preprocesado y modelado

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

In [7]:
pd.set_option("max_colwidth", 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")

In [8]:
'''Creo las X_train e y_train a partir de df_train.
   Creo las X_test e y_test a partir de df_test.
   Ambas previamente importado
   Por ultimo, creo las X e y con todo el train y test,
   para entrenar mimodelo final y generalizar mejor ante nuevos input.'''

df_train = pd.read_csv('data/df_train.csv')
df_test = pd.read_csv('data/df_test.csv')
df_num = pd.read_csv('data/df_num.csv')

X_train = df_train.drop(['price'],axis=1)
y_train = df_train['price']
X_test = df_test.drop(['price'],axis=1)
y_test = df_test['price']
X = df_num.drop(['price'],axis=1)
y = df_num['price']

#### Primero intento buscar el regresor mas adecuado para predecir la target

In [12]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])

logistic_params = {
    'regresor': [LogisticRegression(max_iter=1000, solver='liblinear')],
    'regresor__penalty': ['l1', 'l2']
}

KNR_params = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

RFR_params = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2,3,4],
    'regresor__max_depth': [1,2,3,4,5],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

SVR_param = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

GBR_param = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [50,100,150,200,250],
    'regresor__max_depth': [1,2,3,4],
    'regresor__criterion':['friedman_mse', 'squared_error'],
    'regresor__learning_rate': [0.1,0.2,0.3,0.5,0.7,0.9],
}

search_space = [
    logistic_params,
    KNR_params,
    RFR_params,
    SVR_param,
    GBR_param
]

mejor_regresor = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 5)

mejor_regresor.fit(X_train, y_train)

In [13]:
print(mejor_regresor.best_estimator_)
print(mejor_regresor.best_score_)
print(mejor_regresor.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regresor', GradientBoostingRegressor(max_depth=4))])
0.8134727232173526
{'reduce_dim': PCA(), 'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 4, 'regresor__n_estimators': 100}


In [15]:
mejor_regresor.best_params_

{'reduce_dim': PCA(),
 'regresor': GradientBoostingRegressor(),
 'regresor__criterion': 'friedman_mse',
 'regresor__learning_rate': 0.1,
 'regresor__max_depth': 4,
 'regresor__n_estimators': 100}

### Pese a que el mejor regresor es el GradientBoostingRegressor, entrenamos los 5 regresores por separado para comprobar el error que da cada uno.

#### GradientBoostingRegressor

In [19]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



GBR_param = {
    'scaler': [StandardScaler(),None],
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [GradientBoostingRegressor()],
    'regresor__n_estimators': [50,100,150],
    'regresor__max_depth': [3,4,5,],
    'regresor__criterion':['friedman_mse', 'squared_error'],
    'regresor__learning_rate': [0.1,0.2,0.3],
}

search_space = [
 
    GBR_param
]

Modelo_GBR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_GBR.fit(X_train, y_train)

In [20]:
print(Modelo_GBR.best_estimator_)
print(Modelo_GBR.best_score_)
print(Modelo_GBR.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regresor', GradientBoostingRegressor(max_depth=5))])
-184.43977350909253
{'reduce_dim': PCA(), 'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 5, 'regresor__n_estimators': 100, 'scaler': StandardScaler()}


In [21]:
prediction = Modelo_GBR.best_estimator_.predict(X_test)

In [23]:
print('MAE:', mean_absolute_error(y_test,prediction))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction)))


MAE: 174.4490430590078
MAPE: 0.16294621775087026
RMSE: 255.77041021805033


#### LogisticRegression

In [51]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



logistic_params = {
    'regresor': [LogisticRegression(max_iter=1000, solver='liblinear')],
    'regresor__penalty': ['l1', 'l2']
}

search_space = [
 
    logistic_params
]

Modelo_logistic = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_logistic.fit(X_train, y_train)

In [52]:
prediction_logistic = Modelo_logistic.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_logistic))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_logistic))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_logistic)))

MAE: 266.5838926174497
MAPE: 0.22093857232591949
RMSE: 470.4670747845973


In [53]:
modelo_final_losgistic = Modelo_logistic.best_estimator_

In [54]:
with open('models/modelo_final_logistic.pkl', 'wb') as file:
    pickle.dump(modelo_final_losgistic, file)

#### SVR

In [47]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])



SVR_param = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [SVR()],
    'regresor__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'regresor__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

search_space = [
 
    logistic_params
]

Modelo_SVR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_SVR.fit(X_train, y_train)

In [48]:
prediction_SVR = Modelo_SVR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_SVR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_SVR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_SVR)))

MAE: 266.5838926174497
MAPE: 0.22093857232591949
RMSE: 470.4670747845973


In [49]:
modelo_final_SVR = Modelo_SVR.best_estimator_

In [50]:
with open('models/modelo_final_SVR.pkl', 'wb') as file:
    pickle.dump(modelo_final_SVR, file)

#### KNeighborsRegressor

In [43]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])


KNR_params = {
    'reduce_dim': [SelectKBest(k=[2,3,4,5]),PCA()],
    'regresor': [KNeighborsRegressor()],
    'regresor__n_neighbors': [5,10,15,20,30,50,60,75,100,150]
}

search_space = [
 
    KNR_params
]

Modelo_KNR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10)

Modelo_KNR.fit(X_train, y_train)

In [44]:
prediction_KNR = Modelo_KNR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_KNR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_KNR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_KNR)))

MAE: 189.97181208053692
MAPE: 0.1673281127466056
RMSE: 285.93736348880003


In [45]:
modelo_final_KNR = Modelo_KNR.best_estimator_

In [46]:
with open('models/modelo_final_KNR.pkl', 'wb') as file:
    pickle.dump(modelo_final_KNR, file)

#### RandomForestRegressor

In [39]:
pipe = Pipeline(steps=
                [("scaler", StandardScaler()),
                 ("reduce_dim", SelectKBest(k=5)),
                 ('regresor', RandomForestRegressor())])


RFR_params = {
    'reduce_dim': [SelectKBest(k=[4,5]),PCA()],
    'regresor': [RandomForestRegressor()],
    'regresor__max_features': [1,2,3,4],
    'regresor__max_depth': [2,3,4],
    'regresor__n_estimators': [50,100,150],
    'regresor__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

search_space = [
 
    RFR_params
]

Modelo_RFR = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 5,
                  n_jobs=1)

Modelo_RFR.fit(X_train, y_train)

In [40]:
prediction_RFR = Modelo_RFR.best_estimator_.predict(X_test)
print('MAE:', mean_absolute_error(y_test,prediction_RFR))
print('MAPE:', mean_absolute_percentage_error(y_test,prediction_RFR))
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction_RFR)))

MAE: 198.15097315436245
MAPE: 0.18648733707086287
RMSE: 278.18867015001206


In [41]:
modelo_final_RFR = Modelo_RFR.best_estimator_

In [42]:
with open('models/modelo_final_RFR.pkl', 'wb') as file:
    pickle.dump(modelo_final_RFR, file)

El mejor posible es el GradientBoostingRegressor. A continuacion, lo entreno con 'X' e 'y' completas para entrenar con todos los datos y generalizar mejor ante nuevos input.

In [24]:
Modelo_GBR.best_estimator_.fit(X, y)

In [25]:
print(Modelo_GBR.best_estimator_)
print(Modelo_GBR.best_score_)
print(Modelo_GBR.best_params_)

Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()),
                ('regresor', GradientBoostingRegressor(max_depth=5))])
-184.43977350909253
{'reduce_dim': PCA(), 'regresor': GradientBoostingRegressor(), 'regresor__criterion': 'friedman_mse', 'regresor__learning_rate': 0.1, 'regresor__max_depth': 5, 'regresor__n_estimators': 100, 'scaler': StandardScaler()}


In [None]:
modelo_final_GBR = Modelo_GBR.best_estimator_

In [None]:
with open('models/modelo_final_GBR.pkl', 'wb') as file:
    pickle.dump(modelo_final_GBR, file)

Predicciones

In [48]:
prediction = Modelo_GBR.best_estimator_.predict(X)

In [51]:
analisis_prediction = pd.DataFrame({'test_real': y, 'prediccion': prediction})

In [52]:
analisis_prediction['error'] = analisis_prediction['test_real'] - analisis_prediction['prediccion']
analisis_prediction.sort_values(by='error', ascending=False)

Unnamed: 0,test_real,prediccion,error
934,2500.0,1320.007817,1179.992183
919,2650.0,1540.330684,1109.669316
923,2700.0,1801.563848,898.436152
232,2900.0,2119.785658,780.214342
674,2400.0,1628.821234,771.178766
...,...,...,...
965,1900.0,2437.249422,-537.249422
97,1400.0,1941.547830,-541.547830
539,1200.0,1753.403394,-553.403394
346,900.0,1455.707245,-555.707245


Ejemplo de llamada al modelo

In [15]:
dicc_miraflores = {'size':150,'codigo_distrito':48,'parking':1,'codigo_tipo':5 ,'total_rooms':3}
dicc_centro = {'size':150,'codigo_distrito':69,'parking':1,'codigo_tipo':7 ,'total_rooms':6}

In [16]:
centro = pd.DataFrame(data=dicc_centro, index=[0])
miraflores = pd.DataFrame(data=dicc_miraflores, index=[0])

In [55]:
prediction_centro = Modelo_GBR.best_estimator_.predict(centro)
prediction_centro

array([1992.71154652])

In [56]:
prediction_miraflores = Modelo_GBR.best_estimator_.predict(miraflores)
prediction_miraflores

array([1534.73565188])

In [26]:
modelo_final_GBR = Modelo_GBR.best_estimator_

Guardar modelo

In [None]:

with open('models/modelo_final_RFR.pkl', 'wb') as file:
    pickle.dump(modelo_final_GBR, file)

In [None]:
'''para guardar el modelo entrenado'''
with open('models/modelo_final_GBR.pkl', 'wb') as file:
    pickle.dump(modelo_final_GBR, file)

In [None]:
'''para guardar el modelo entrenado'''
with open('models/modelo_final_GBR.pkl', 'wb') as file:
    pickle.dump(modelo_final_GBR, file)

In [None]:
'''para guardar el modelo entrenado'''
with open('models/modelo_final_GBR.pkl', 'wb') as file:
    pickle.dump(modelo_final_GBR, file)