In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
sns.set_style('whitegrid')
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error ,mean_squared_error,r2_score,max_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import  KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
import xgboost
import pickle
pd.set_option('display.float_format', lambda x: '%.2f' % x)


# Leyendo los datos de trabajo

In [3]:
df = pd.read_csv('data/processed/coches_segunda_mano_ML.csv')
df.head()

Unnamed: 0,Marca,Modelo,Precio,Tipo_Combustible,Año,kms,CV,N_Puertas,Tipo_Cambio,color,N_Fotos,Provincia,TDG
0,1,172,6200,4,2017.0,50071,82.0,2,2,3,6,42,-7.06
1,1,814,7851,3,2016.0,103000,100.0,2,2,3,10,7,-12.11
2,1,221,19426,3,2014.0,120000,140.0,2,2,3,9,33,-5.17
3,1,813,22850,3,2017.0,107000,130.0,3,2,3,4,39,-3.68
4,1,40,11490,4,2016.0,78665,130.0,2,2,3,32,29,-5.83


## Divido el DF en train y test

In [6]:
df.columns

Index(['Marca', 'Modelo', 'Precio', 'Tipo_Combustible', 'Año', 'kms', 'CV',
       'N_Puertas', 'Tipo_Cambio', 'color', 'N_Fotos', 'Provincia', 'TDG'],
      dtype='object')

In [4]:
X = df.drop(['Precio'],axis=1)
y = df['Precio']
print(X.shape)
print(y.shape)

(41472, 12)
(41472,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(33177, 12)
(8295, 12)
(33177,)
(8295,)


## Linear Regression

In [10]:
reg_mod = LinearRegression()
reg_mod.fit(X_train,y_train)

LinearRegression()

In [11]:
coef_reg_mod = pd.DataFrame(reg_mod.coef_,
                            X.columns,
                            columns=['Coeficient'])

coef_reg_mod.sort_values('Coeficient', ascending=False)

Unnamed: 0,Coeficient
Marca,3834.156828
Año,804.062385
color,320.375177
CV,144.537637
Modelo,0.178467
kms,-0.063524
Provincia,-5.122451
N_Fotos,-17.274696
TDG,-56.529458
Tipo_Cambio,-73.13822


In [12]:
predictions = reg_mod.predict(X_train)

In [13]:
MAE = mean_absolute_error(y_train, predictions)
MAPE = mean_absolute_percentage_error(y_train,predictions)
MSE = mean_squared_error(y_train,predictions)
RMSE = np.sqrt(mean_squared_error(y_train,predictions))
RS_SCORE = r2_score(y_train,predictions)
print("MAE: ", MAE)
print("MAPE: ", MAPE.round(2)*100, "%")
print("MSE: ", MSE)
print("RMSE: ", RMSE)
print("R2_Score",RS_SCORE)

MAE:  4585.698659119464
MAPE:  54.0 %
MSE:  83836711.89741418
RMSE:  9156.238960261695
R2_Score 0.6992203562833135


In [14]:
print("test MAE:", mean_absolute_error(y_test, reg_mod.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, reg_mod.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, reg_mod.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, reg_mod.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,reg_mod.predict(X_test)).round(2))  

test MAE: 4768.59
test MAPE: 53.0 %
test MSE: 117283074.77
Test RMSE:10829.7300
RS_SCORE 0.67


In [15]:
#Guardo el modelo

with open('modelos/otros/lr_model','wb') as archivo_salida:
    pickle.dump(reg_mod,archivo_salida)

## Encontrando el mejor modelo con un Grid Search

In [16]:
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}

grid = GridSearchCV(reg_mod, parameters, cv=5)
grid.fit(X_train, y_train)

print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

LinearRegression(normalize=False)
{'copy_X': True, 'fit_intercept': True, 'normalize': False}
0.7006571174074228


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [17]:
print("El mejor estimator es",grid.best_estimator_)
print("Los mejores parámetros son",grid.best_params_)
print("El mejro score es",grid.best_score_)

El mejor estimator es LinearRegression(normalize=False)
Los mejores parámetros son {'copy_X': True, 'fit_intercept': True, 'normalize': False}
El mejro score es 0.7006571174074228


In [78]:
reg_mod_2 = LinearRegression(copy_X = True, fit_intercept= True, normalize=False)

In [79]:
reg_mod_2.fit(X_train,y_train)



LinearRegression(normalize=False)

In [80]:
predictions_2 = reg_mod_2.predict(X_train)

In [81]:
predictions_2

array([20287.4699548 , 14381.74437541,  5758.32875141, ...,
       22411.96677278,  -335.88075856,   810.56372142])

In [82]:
MAE_2 = mean_absolute_error(y_train, predictions_2)
MAPE_2 = mean_absolute_percentage_error(y_train,predictions_2)
MSE_2 = mean_squared_error(y_train,predictions_2)
RMSE_2 = np.sqrt(mean_squared_error(y_train,predictions_2))
RS_SCORE_2 = r2_score(y_train,predictions_2)
print("MAE: ", MAE_2)
print("MAPE: ", MAPE_2.round(2)*100, "%")
print("MSE: ", MSE_2)
print("RMSE: ", RMSE_2)
print("R2_Score",RS_SCORE_2)

MAE:  4585.698659119464
MAPE:  54.0 %
MSE:  83836711.89741418
RMSE:  9156.238960261695
R2_Score 0.6992203562833135


In [83]:
print("test MAE:", mean_absolute_error(y_test, reg_mod_2.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, reg_mod_2.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, reg_mod_2.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, reg_mod_2.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,reg_mod_2.predict(X_test)).round(2))  

test MAE: 4768.59
test MAPE: 53.0 %
test MSE: 117283074.77
Test RMSE:10829.7300
RS_SCORE 0.67


In [None]:
comparacion = pd.DataFrame(predictions, columns=['prediction'])
comparacion['Prediction_GridSearch'] = predictions_2
comparacion['Diferencia'] = abs(comparacion['prediction'] - comparacion['Prediction_GridSearch'])
comparacion

Unnamed: 0,prediction,Prediction_GridSearch,Diferencia
0,20287.469955,20287.469955,0.0
1,14381.744375,14381.744375,0.0
2,5758.328751,5758.328751,0.0
3,20019.929463,20019.929463,0.0
4,19278.324336,19278.324336,0.0
...,...,...,...
33172,18919.702190,18919.702190,0.0
33173,19743.983241,19743.983241,0.0
33174,22411.966773,22411.966773,0.0
33175,-335.880759,-335.880759,0.0


# Polynomial Regression

In [16]:
poly_reg = PolynomialFeatures(degree=4)
poly_reg.fit(X_train)
X_poly_train = poly_reg.transform(X_train) 

pol_reg = LinearRegression()
pol_reg.fit(X_poly_train, y_train)


LinearRegression()

In [17]:
predictions_polynomial = pol_reg.predict(X_poly_train)
predictions_polynomial

array([17574.02038854, 12502.6809426 ,  7464.80885143, ...,
       18869.7840265 ,  4635.37571801, -1397.30565997])

In [18]:
MAE_pol = mean_absolute_error(y_train, predictions_polynomial)
MAPE_pol = mean_absolute_percentage_error(y_train,predictions_polynomial)
MSE_pol = mean_squared_error(y_train,predictions_polynomial)
RMSE_pol = np.sqrt(mean_squared_error(y_train,predictions_polynomial))
RS_SCORE_pol = r2_score(y_train,predictions_polynomial)
print("MAE: ", MAE_pol)
print("MAPE: ",MAPE_pol*100)
print("MSE: ", MSE_pol)
print("RMSE: ", RMSE_pol)
print("R2_Score",RS_SCORE_pol)

MAE:  3452.8543868993956
MAPE:  37.159355462057
MSE:  46631811.17479712
RMSE:  6828.748873314725
R2_Score 0.8326997894647645


In [19]:
poly_reg_test = PolynomialFeatures(degree=4)
poly_reg_test.fit(X_train)
X_poly_test = poly_reg.transform(X_test) 

pol_reg_test = LinearRegression()
pol_reg_test.fit(X_poly_test, y_test)

LinearRegression()

In [20]:
print("test MAE:", mean_absolute_error(y_test, pol_reg_test.predict(X_poly_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, pol_reg_test.predict(X_poly_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, pol_reg_test.predict(X_poly_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, pol_reg_test.predict(X_poly_test))).round(2))
print('RS_SCORE', r2_score(y_test,pol_reg_test.predict(X_poly_test)).round(2))  

test MAE: 2987.31
test MAPE: 35.0 %
test MSE: 25979246.74
Test RMSE:5096.9800
RS_SCORE 0.93


In [21]:
#Guardo el mejor modelo de polynomial

with open('modelos/otros/polynomial_model','wb') as archivo_salida:
    pickle.dump(pol_reg_test,archivo_salida)

In [None]:
scorer=make_scorer(mean_squared_error, greater_is_better = False)

In [None]:
params = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False],}
grid_search1 = GridSearchCV(pol_reg, params, scoring=scorer)
grid_search1.fit(X_poly_train,y_train)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

GridSearchCV(estimator=LinearRegression(),
             param_grid={'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'normalize': [True, False]},
             scoring=make_scorer(mean_squared_error, greater_is_better=False))

In [None]:
print("El mejor estimator es",grid_search1.best_estimator_)
print("Los mejores parámetros son",grid_search1.best_params_)
print("El mejor score es",grid_search1.best_score_)

El mejor estimator es LinearRegression(normalize=False)
Los mejores parámetros son {'copy_X': True, 'fit_intercept': True, 'normalize': False}
El mejor score es -21568686313.90991


In [None]:
predictions_pol_grid = grid_search1.predict(X_poly_train)
predictions_pol_grid

array([17574.02038854, 12502.6809426 ,  7464.80885143, ...,
       18869.7840265 ,  4635.37571801, -1397.30565997])

In [None]:
MAE_pol_grid = mean_absolute_error(y_train, predictions_pol_grid)
MAPE_pol_grid = mean_absolute_percentage_error(y_train,predictions_pol_grid)
MSE_pol_grid = mean_squared_error(y_train,predictions_pol_grid)
RMSE_pol_grid = np.sqrt(mean_squared_error(y_train,predictions_pol_grid))
RS_SCORE_pol_grid = r2_score(y_train,predictions_pol_grid)
print("MAE: ", MAE_pol_grid)
print("MAPE: ",MAPE_pol_grid)
print("MSE: ", MSE_pol_grid)
print("RMSE: ", RMSE_pol_grid)
print("R2_Score",RS_SCORE_pol_grid)

MAE:  3452.8543868993956
MAPE:  0.37159355462057003
MSE:  46631811.17479712
RMSE:  6828.748873314725
R2_Score 0.8326997894647645


In [None]:
comparacion_pol = pd.DataFrame(predictions_polynomial, columns=['prediction'])
comparacion_pol['Prediction_GridSearch'] = predictions_pol_grid
comparacion_pol['Diferencia'] = abs(comparacion['prediction'] - comparacion['Prediction_GridSearch'])
comparacion_pol

Unnamed: 0,prediction,Prediction_GridSearch,Diferencia
0,17574.020389,17574.020389,0.0
1,12502.680943,12502.680943,0.0
2,7464.808851,7464.808851,0.0
3,22199.466171,22199.466171,0.0
4,19013.212541,19013.212541,0.0
...,...,...,...
33172,16447.399010,16447.399010,0.0
33173,15357.387980,15357.387980,0.0
33174,18869.784026,18869.784026,0.0
33175,4635.375718,4635.375718,0.0


# Decision Tree

In [120]:
dtr = DecisionTreeRegressor(max_depth=5)
dtr.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=5)

In [121]:
dtr.feature_importances_

array([8.17636265e-03, 1.59358035e-02, 0.00000000e+00, 4.66674150e-04,
       5.44290915e-03, 5.68175715e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.01802536e-01])

In [122]:
dtr_feature_importance = pd.DataFrame(dtr.feature_importances_,
                            X_train.columns,
                            columns=['Feature Importance'])

dtr_feature_importance.sort_values('Feature Importance', ascending=False)

Unnamed: 0,Feature Importance
CV,0.57
TDG,0.4
Modelo,0.02
Marca,0.01
kms,0.01
Año,0.0
Tipo_Combustible,0.0
N_Puertas,0.0
Tipo_Cambio,0.0
color,0.0


In [123]:
predictions_dtr = dtr.predict(X_train)
predictions_dtr

array([26445.00532702, 12248.82144831,  8942.66402466, ...,
       19744.47208644,  4068.58078778,  1966.27403846])

In [124]:
MAE_dtr = mean_absolute_error(y_train, predictions_dtr)
MAPE_dtr = mean_absolute_percentage_error(y_train,predictions_dtr)
MSE_dtr = mean_squared_error(y_train,predictions_dtr)
RMSE_dtr = np.sqrt(mean_squared_error(y_train,predictions_dtr))
RS_SCORE_dtr = r2_score(y_train,predictions_dtr)
print("MAE: ", MAE_dtr)
print("MAPE: ",MAPE_dtr)
print("MSE: ", MSE_dtr)
print("RMSE: ", RMSE_dtr)
print("R2_Score",RS_SCORE_dtr)

MAE:  3462.5845015072646
MAPE:  0.23839335766876343
MSE:  39714403.43513876
RMSE:  6301.936482950202
R2_Score 0.8575172636749102


In [125]:
print("test MAE:", mean_absolute_error(y_test, dtr.predict(X_test)))
print("test MAPE:", mean_absolute_percentage_error(y_test, dtr.predict(X_test)))
print("test MSE:", mean_squared_error(y_test, dtr.predict(X_test)))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, dtr.predict(X_test))))
print('RS_SCORE', r2_score(y_test,dtr.predict(X_test)))

test MAE: 3612.530244730324
test MAPE: 0.24041511029530904
test MSE: 50188914.90606719
Test RMSE:7084.4135
RS_SCORE 0.8587991222790399


## Buscando el mejor modelo de Decision Tree

In [22]:
tree = DecisionTreeRegressor()

parameters = {'max_depth':[1,2,3,4,5,6,7,8,9,10],
              'min_samples_split':[2,3,4,5,6,7,8,9,10],
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10]}

grid_dtr = GridSearchCV(tree, parameters, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_dtr.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [126]:
print("El mejor estimator es",grid.best_estimator_)
print("Los mejores parámetros son",grid.best_params_)
print("El mejor score es",grid.best_score_)

El mejor estimator es LinearRegression(normalize=False)
Los mejores parámetros son {'copy_X': True, 'fit_intercept': True, 'normalize': False}
El mejor score es 0.7006571174074228


In [223]:
X_train.columns

Index(['Marca', 'Modelo', 'Tipo_Combustible', 'Año', 'kms', 'CV', 'N_Puertas',
       'Tipo_Cambio', 'color', 'N_Fotos', 'Provincia', 'TDG'],
      dtype='object')

In [24]:
predictions_dtr_gs = grid_dtr.predict(X_train)
predictions_dtr_gs

array([20503.08852459, 13460.99456522,  8848.64583333, ...,
       19221.43508772,  5131.81081081,  1191.68604651])

In [25]:
MAE_dtr_gs = mean_absolute_error(y_train, predictions_dtr_gs)
MAPE_dtr_gs = mean_absolute_percentage_error(y_train,predictions_dtr_gs)
MSE_dtr_gs = mean_squared_error(y_train,predictions_dtr_gs)
RMSE_dtr_gs = np.sqrt(mean_squared_error(y_train,predictions_dtr_gs))
RS_SCORE_dtr_gs = r2_score(y_train,predictions_dtr_gs)
print("MAE train: ", MAE_dtr_gs.round(2))
print("MAPE train: ",MAPE_dtr_gs.round(2)*100,"%")
print("MSE train: ", MSE_dtr_gs.round(2))
print("RMSE train: ", RMSE_dtr_gs.round(2))
print("R2_Score train",RS_SCORE_dtr_gs.round(2))

MAE train:  1679.8
MAPE train:  9.0 %
MSE train:  16999273.71
RMSE train:  4123.02
R2_Score train 0.94


In [26]:
print("test MAE:", mean_absolute_error(y_test, grid_dtr.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, grid_dtr.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, grid_dtr.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, grid_dtr.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,grid_dtr.predict(X_test)).round(2))                                                                                                                                                                                                                                                 

test MAE: 1979.23
test MAPE: 10.0 %
test MSE: 35102027.57
Test RMSE:5924.7000
RS_SCORE 0.9


In [27]:
#Guardo el mejor modelo de Decision Tree

with open('modelos/otros/DecissionTree_GridSearch_model','wb') as archivo_salida:
    pickle.dump(grid_dtr,archivo_salida)

# KNN

In [28]:
scaler = StandardScaler()
X_train_scal = scaler.fit_transform(X_train) 
X_test_scal= scaler.transform(X_test)

# Probé primero Min Max scaler, pero obtuve peores resultados. Así que me quedo con StandarScaler

In [29]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scal,y_train)

KNeighborsRegressor()

In [30]:
predictions_knn = knn.predict(X_train_scal)

In [31]:
MAE_knn = mean_absolute_error(y_train, predictions_knn)
MAPE_knn = mean_absolute_percentage_error(y_train,predictions_knn)
MSE_knn = mean_squared_error(y_train,predictions_knn)
RMSE_knn = np.sqrt(mean_squared_error(y_train,predictions_knn))
RS_SCORE_knn = r2_score(y_train,predictions_knn)
print("MAE train: ", MAE_knn.round(2))
print("MAPE train: ",MAPE_knn.round(2)*100,"%")
print("MSE train: ", MSE_knn.round(2))
print("RMSE train: ", RMSE_knn.round(2))
print("R2_Score train",RS_SCORE_knn.round(2))

MAE train:  2575.62
MAPE train:  16.0 %
MSE train:  32216325.75
RMSE train:  5675.94
R2_Score train 0.88


In [32]:
print("test MAE:", mean_absolute_error(y_test, knn.predict(X_test_scal)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, knn.predict(X_test_scal)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, knn.predict(X_test_scal)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, knn.predict(X_test_scal))).round(2))
print('RS_SCORE', r2_score(y_test,knn.predict(X_test_scal)).round(2))   

test MAE: 3236.03
test MAPE: 19.0 %
test MSE: 55980274.91
Test RMSE:7482.0000
RS_SCORE 0.84


In [33]:
#Guardo el mejor modelo de KNN

with open('modelos/otros/KNN_model','wb') as archivo_salida:
    pickle.dump(knn,archivo_salida)

## Buscando el mejor KNN con Gridsearch CV

In [140]:
params = {"n_neighbors": np.arange(1, 10), 
          "weights": ["uniform", "distance"]}

grid_search_cv_knn = GridSearchCV(knn, params)
grid_search_cv_knn.fit(X_train_scal, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'weights': ['uniform', 'distance']})

In [141]:
print("El mejor estimator es",grid_search_cv_knn.best_estimator_)
print("Los mejores parámetros son",grid_search_cv_knn.best_params_)
print("El mejor score es",grid_search_cv_knn.best_score_)

El mejor estimator es KNeighborsRegressor(n_neighbors=9, weights='distance')
Los mejores parámetros son {'n_neighbors': 9, 'weights': 'distance'}
El mejor score es 0.8358707950007268


In [142]:
knn_gs = KNeighborsRegressor(n_neighbors=7,weights='uniform',leaf_size=50,n_jobs=-1)
knn_gs.fit(X_train_scal,y_train)
predictions_knn_gs = knn_gs.predict(X_train_scal)
# Con los parámetros que me da el gridsearch obtengo un modelo con overfitting

In [143]:
MAE_knn_gs = mean_absolute_error(y_train, predictions_knn_gs)
MAPE_knn_gs = mean_absolute_percentage_error(y_train,predictions_knn_gs)
MSE_knn_gs = mean_squared_error(y_train,predictions_knn_gs)
RMSE_knn_gs = np.sqrt(mean_squared_error(y_train,predictions_knn_gs))
RS_SCORE_knn_gs = r2_score(y_train,predictions_knn_gs)
print("MAE train: ", MAE_knn_gs.round(2))
print("MAPE train: ",MAPE_knn_gs.round(2)*100,"%")
print("MSE train: ", MSE_knn_gs.round(2))
print("RMSE train: ", RMSE_knn_gs.round(2))
print("R2_Score train",RS_SCORE_knn_gs.round(2))

MAE train:  2739.8
MAPE train:  17.0 %
MSE train:  36515164.64
RMSE train:  6042.78
R2_Score train 0.87


In [144]:
print("test MAE:", mean_absolute_error(y_test, knn_gs.predict(X_test_scal)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, knn_gs.predict(X_test_scal)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, knn_gs.predict(X_test_scal)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, knn_gs.predict(X_test_scal))).round(2))
print('RS_SCORE', r2_score(y_test,knn_gs.predict(X_test_scal)).round(2))   

test MAE: 3230.53
test MAPE: 19.0 %
test MSE: 53882272.34
Test RMSE:7340.4500
RS_SCORE 0.85


# SVM

In [201]:
scaler = StandardScaler()
X_train_scal = scaler.fit_transform(X_train) 
X_test_scal= scaler.transform(X_test)

# Min max scaler descarto, me da peores resultados

In [202]:
svm = SVR()

In [203]:
svm.fit(X_train_scal,y_train)

SVR()

In [204]:
predictions_svm = svm.predict(X_train_scal)

In [205]:
MAE_svm = mean_absolute_error(y_train, predictions_svm)
MAPE_svm = mean_absolute_percentage_error(y_train,predictions_svm)
MSE_svm = mean_squared_error(y_train,predictions_svm)
RMSE_svm = np.sqrt(mean_squared_error(y_train,predictions_svm))
RS_SCORE_svm = r2_score(y_train,predictions_svm)
print("MAE train: ", MAE_svm.round(2))
print("MAPE train: ",MAPE_svm.round(2)*100,"%")
print("MSE train: ", MSE_svm.round(2))
print("RMSE train: ", RMSE_svm.round(2))
print("R2_Score train",RS_SCORE_svm.round(2))

MAE train:  8651.46
MAPE train:  97.0 %
MSE train:  268815638.02
RMSE train:  16395.6
R2_Score train 0.04


In [187]:
print("test MAE:", mean_absolute_error(y_test, svm.predict(X_test_scal)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, svm.predict(X_test_scal)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, svm.predict(X_test_scal)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, svm.predict(X_test_scal))).round(2))
print('RS_SCORE', r2_score(y_test,svm.predict(X_test_scal)).round(2))  

test MAE: 8951.62
test MAPE: 97.0 %
test MSE: 348100364.87
Test RMSE:18657.4500
RS_SCORE 0.02


## Buscando el mejor modelo SVM con Gridsearch CV

In [188]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
clf = GridSearchCV(svm, parameters)
clf.fit(X_train_scal, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})

In [189]:
print("El mejor estimator es",clf.best_estimator_)
print("Los mejores parámetros son",clf.best_params_)
print("El mejor score es",clf.best_score_)

El mejor estimator es SVR(C=10, kernel='linear')
Los mejores parámetros son {'C': 10, 'kernel': 'linear'}
El mejor score es 0.6558663341177673


In [34]:
svm_grid = SVR(C=10,kernel='linear')
svm_grid.fit(X_train_scal,y_train)

SVR(C=10, kernel='linear')

In [35]:
predictions_svm_grid = svm_grid.predict(X_train_scal)

In [36]:
MAE_svm_grid = mean_absolute_error(y_train, predictions_svm_grid)
MAPE_svm_grid = mean_absolute_percentage_error(y_train,predictions_svm_grid)
MSE_svm_grid = mean_squared_error(y_train,predictions_svm_grid)
RMSE_svm_grid = np.sqrt(mean_squared_error(y_train,predictions_svm_grid))
RS_SCORE_svm_grid = r2_score(y_train,predictions_svm_grid)
print("MAE train: ", MAE_svm_grid.round(2))
print("MAPE train: ",MAPE_svm_grid.round(2)*100,"%")
print("MSE train: ", MSE_svm_grid.round(2))
print("RMSE train: ", RMSE_svm_grid.round(2))
print("R2_Score train",RS_SCORE_svm_grid.round(2))

MAE train:  4231.66
MAPE train:  42.0 %
MSE train:  96511993.05
RMSE train:  9824.05
R2_Score train 0.65


In [37]:
print("test MAE:", mean_absolute_error(y_test, svm_grid.predict(X_test_scal)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, svm_grid.predict(X_test_scal)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, svm_grid.predict(X_test_scal)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, svm_grid.predict(X_test_scal))).round(2))
print('RS_SCORE', r2_score(y_test,svm_grid.predict(X_test_scal)).round(2))  

test MAE: 4458.96
test MAPE: 41.0 %
test MSE: 141716795.62
Test RMSE:11904.4900
RS_SCORE 0.6


In [38]:
#Guardo el mejor modelo de SVM

with open('modelos/otros/SVM_GridSearch_model','wb') as archivo_salida:
    pickle.dump(svm_grid,archivo_salida)

# Ensembles

#### Random Forest Regressor

In [8]:
rnd_reg = RandomForestRegressor(n_estimators=500,
                                 max_leaf_nodes=16,
                                 random_state=42)
rnd_reg.fit(X_train, y_train)

prections_rnd_reg = rnd_reg.predict(X_train)

In [9]:
MAE_rnd_reg = mean_absolute_error(y_train, prections_rnd_reg)
MAPE_rnd_reg = mean_absolute_percentage_error(y_train,prections_rnd_reg)
MSE_rnd_reg = mean_squared_error(y_train,prections_rnd_reg)
RMSE_rnd_reg = np.sqrt(mean_squared_error(y_train,prections_rnd_reg))
RS_SCORE_rnd_reg = r2_score(y_train,prections_rnd_reg)
print("MAE train: ", MAE_rnd_reg.round(2))
print("MAPE train: ",MAPE_rnd_reg.round(2)*100,"%")
print("MSE train: ", MSE_rnd_reg.round(2))
print("RMSE train: ", RMSE_rnd_reg.round(2))
print("R2_Score train",RS_SCORE_rnd_reg.round(2))

MAE train:  3609.72
MAPE train:  35.0 %
MSE train:  35116142.54
RMSE train:  5925.89
R2_Score train 0.87


In [11]:
print("test MAE:", mean_absolute_error(y_test, rnd_reg.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, rnd_reg.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, rnd_reg.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, rnd_reg.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,rnd_reg.predict(X_test)).round(2))  

test MAE: 3728.7
test MAPE: 36.0 %
test MSE: 47247957.27
Test RMSE:6873.7100
RS_SCORE 0.87


#### GridSearch Random Forest Regressor

In [12]:
grid_par = {'n_estimators': range(1,10,1),'max_depth':range(1,10,1)}
modelo = RandomForestRegressor()
grid = GridSearchCV(modelo, grid_par,scoring='r2',cv=5)
grid.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': range(1, 10),
                         'n_estimators': range(1, 10)},
             scoring='r2')

In [13]:
print("El mejor estimator es",grid.best_estimator_)
print("Los mejores parámetros son",grid.best_params_)
print("El mejor score es",grid.best_score_)

El mejor estimator es RandomForestRegressor(max_depth=9, n_estimators=8)
Los mejores parámetros son {'max_depth': 9, 'n_estimators': 8}
El mejor score es 0.9202360297230919


In [39]:
grid = RandomForestRegressor(max_depth=9,n_estimators=8)
grid.fit(X_train,y_train)

RandomForestRegressor(max_depth=9, n_estimators=8)

In [40]:
prections_rnd_reg_grid = grid.predict(X_train)

In [41]:
MAE_rnd_reg_grid = mean_absolute_error(y_train, prections_rnd_reg_grid)
MAPE_rnd_reg_grid = mean_absolute_percentage_error(y_train,prections_rnd_reg_grid)
MSE_rnd_reg_grid = mean_squared_error(y_train,prections_rnd_reg_grid)
RMSE_rnd_reg_grid = np.sqrt(mean_squared_error(y_train,prections_rnd_reg_grid))
RS_SCORE_rnd_reg_grid = r2_score(y_train,prections_rnd_reg_grid)
print("MAE train: ", MAE_rnd_reg_grid.round(2))
print("MAPE train: ",MAPE_rnd_reg_grid.round(2)*100,"%")
print("MSE train: ", MSE_rnd_reg_grid.round(2))
print("RMSE train: ", RMSE_rnd_reg_grid.round(2))
print("R2_Score train",RS_SCORE_rnd_reg_grid.round(2))

MAE train:  1442.25
MAPE train:  8.0 %
MSE train:  9156415.09
RMSE train:  3025.96
R2_Score train 0.97


In [42]:
print("test MAE:", mean_absolute_error(y_test, grid.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, grid.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, grid.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, grid.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,grid.predict(X_test)).round(2))  

test MAE: 1690.11
test MAPE: 9.0 %
test MSE: 22447050.25
Test RMSE:4737.8300
RS_SCORE 0.94


In [43]:
#Guardo el mejor modelo de Random Forest

with open('modelos/otros/RandomForest_GridSearch_model','wb') as archivo_salida:
    pickle.dump(grid,archivo_salida)

#### Ada Boost Regressor

In [18]:
ada_reg = AdaBoostRegressor(n_estimators=200,
                            random_state=42)
ada_reg.fit(X_train, y_train)

predictions_ada_reg = ada_reg.predict(X_train)

In [19]:
MAE_ada_reg = mean_absolute_error(y_train, predictions_ada_reg)
MAPE_ada_reg = mean_absolute_percentage_error(y_train,predictions_ada_reg)
MSE_ada_reg = mean_squared_error(y_train,predictions_ada_reg)
RMSE_ada_reg = np.sqrt(mean_squared_error(y_train,predictions_ada_reg))
RS_SCORE_ada_reg = r2_score(y_train,predictions_ada_reg)
print("MAE train: ", MAE_ada_reg.round(2))
print("MAPE train: ",MAPE_ada_reg.round(2)*100,"%")
print("MSE train: ", MSE_ada_reg.round(2))
print("RMSE train: ", RMSE_ada_reg.round(2))
print("R2_Score train",RS_SCORE_ada_reg.round(2))

MAE train:  22324.34
MAPE train:  322.0 %
MSE train:  541326925.13
RMSE train:  23266.43
R2_Score train -0.94


In [20]:
print("test MAE:", mean_absolute_error(y_test, ada_reg.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, ada_reg.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, ada_reg.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, ada_reg.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,ada_reg.predict(X_test)).round(2))  

test MAE: 22312.09
test MAPE: 321.0 %
test MSE: 549193816.56
Test RMSE:23434.8800
RS_SCORE -0.55


#### GridSearch Ada Boost Regressor

In [180]:
grid_par_ada = {'n_estimators': range(1,10,1),'learning_rate':range(1,10,1)}
modelo_ada = AdaBoostRegressor(random_state=42)
grid_ada = GridSearchCV(modelo_ada, grid_par_ada,scoring='r2',cv=5)
grid_ada.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=AdaBoostRegressor(random_state=42),
             param_grid={'learning_rate': range(1, 10),
                         'n_estimators': range(1, 10)},
             scoring='r2')

In [44]:
grid_ada = AdaBoostRegressor(learning_rate=2, n_estimators=4,random_state=42)
grid_ada.fit(X_train,y_train)

AdaBoostRegressor(learning_rate=2, n_estimators=4, random_state=42)

In [45]:
predictions_ada_grid = grid_ada.predict(X_train)

In [182]:
print("El mejor estimator es",grid_ada.best_estimator_)
print("Los mejores parámetros son",grid_ada.best_params_)
print("El mejor score es",grid_ada.best_score_)

El mejor estimator es AdaBoostRegressor(learning_rate=2, n_estimators=4, random_state=42)
Los mejores parámetros son {'learning_rate': 2, 'n_estimators': 4}
El mejor score es 0.7518552213780858


In [46]:
MAE_ada_reg_grid = mean_absolute_error(y_train, predictions_ada_grid)
MAPE_ada_reg_grid = mean_absolute_percentage_error(y_train,predictions_ada_grid)
MSE_ada_reg_grid = mean_squared_error(y_train,predictions_ada_grid)
RMSE_ada_reg_grid = np.sqrt(mean_squared_error(y_train,predictions_ada_grid))
RS_SCORE_ada_reg_grid = r2_score(y_train,predictions_ada_grid)
print("MAE train: ", MAE_ada_reg_grid.round(2))
print("MAPE train: ",MAPE_ada_reg_grid.round(2)*100,"%")
print("MSE train: ", MSE_ada_reg_grid.round(2))
print("RMSE train: ", RMSE_ada_reg_grid.round(2))
print("R2_Score train",RS_SCORE_ada_reg_grid.round(2))

MAE train:  5171.36
MAPE train:  61.0 %
MSE train:  63609363.78
RMSE train:  7975.55
R2_Score train 0.77


In [47]:
print("test MAE:", mean_absolute_error(y_test, grid_ada.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, grid_ada.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, grid_ada.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, grid_ada.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,grid_ada.predict(X_test)).round(2))  

test MAE: 5381.22
test MAPE: 61.0 %
test MSE: 87180701.39
Test RMSE:9337.0600
RS_SCORE 0.75


In [48]:
#Guardo el mejor modelo de Ada Boost

with open('modelos/otros/AdaBoostRegressor_GridSearch_model','wb') as archivo_salida:
    pickle.dump(grid_ada,archivo_salida)

#### Gradient Boosting

In [35]:
gbrt = GradientBoostingRegressor(max_depth=2,
                                 n_estimators=3, 
                                 learning_rate=1.0,
                                 random_state=42)
gbrt.fit(X_train, y_train)


predictions_gbrt = gbrt.predict(X_train)

In [36]:
MAE_gbrt = mean_absolute_error(y_train, predictions_gbrt)
MAPE_gbrt = mean_absolute_percentage_error(y_train,predictions_gbrt)
MSE_gbrt = mean_squared_error(y_train,predictions_gbrt)
RMSE_gbrt = np.sqrt(mean_squared_error(y_train,predictions_gbrt))
RS_SCORE_gbrt = r2_score(y_train,predictions_gbrt)
print("MAE train: ", MAE_gbrt.round(2))
print("MAPE train: ",MAPE_gbrt.round(2)*100,"%")
print("MSE train: ", MSE_gbrt.round(2))
print("RMSE train: ", RMSE_gbrt.round(2))
print("R2_Score train",RS_SCORE_gbrt.round(2))

MAE train:  5566.54
MAPE train:  55.00000000000001 %
MSE train:  76054100.98
RMSE train:  8720.9
R2_Score train 0.73


In [37]:
print("test MAE:", mean_absolute_error(y_test, gbrt.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, gbrt.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, gbrt.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, gbrt.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,gbrt.predict(X_test)).round(2))  

test MAE: 5746.74
test MAPE: 56.00000000000001 %
test MSE: 101788663.69
Test RMSE:10089.0400
RS_SCORE 0.71


#### Grid Search Gradient Boosting

In [38]:
params = {'n_estimators': [50, 100, 150, 200],
          'max_depth': [2, 4, 6],
          'min_samples_split': [2, 4, 6],
          'learning_rate': [0.1, 0.2, 0.3]}


grid_gbrt = GridSearchCV(gbrt, params,scoring='r2')
grid_gbrt.fit(X_train,y_train)

GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=1.0, max_depth=2,
                                                 n_estimators=3,
                                                 random_state=42),
             param_grid={'learning_rate': [0.1, 0.2, 0.3],
                         'max_depth': [2, 4, 6], 'min_samples_split': [2, 4, 6],
                         'n_estimators': [50, 100, 150, 200]},
             scoring='r2')

In [39]:
print("El mejor estimator es",grid_gbrt.best_estimator_)
print("Los mejores parámetros son",grid_gbrt.best_params_)
print("El mejor score es",grid_gbrt.best_score_)

El mejor estimator es GradientBoostingRegressor(learning_rate=0.2, max_depth=4, n_estimators=200,
                          random_state=42)
Los mejores parámetros son {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 200}
El mejor score es 0.9625614919287774


In [49]:
gbrt_grid = GradientBoostingRegressor(max_depth=4,
                                 n_estimators=200, 
                                 learning_rate=0.2,
                                 random_state=42,
                                 min_samples_split=2)
gbrt_grid.fit(X_train, y_train)

predictions_gbrt_grid = gbrt_grid.predict(X_train)

In [50]:
MAE_gbrt_grid = mean_absolute_error(y_train, predictions_gbrt_grid)
MAPE_gbrt_grid = mean_absolute_percentage_error(y_train,predictions_gbrt_grid)
MSE_gbrt_grid = mean_squared_error(y_train,predictions_gbrt_grid)
RMSE_gbrt_grid = np.sqrt(mean_squared_error(y_train,predictions_gbrt_grid))
RS_SCORE_gbrt_grid = r2_score(y_train,predictions_gbrt_grid)
print("MAE train: ", MAE_gbrt_grid.round(2))
print("MAPE train: ",MAPE_gbrt_grid.round(2)*100,"%")
print("MSE train: ", MSE_gbrt_grid.round(2))
print("RMSE train: ", RMSE_gbrt_grid.round(2))
print("R2_Score train",RS_SCORE_gbrt_grid.round(2))

MAE train:  808.61
MAPE train:  6.0 %
MSE train:  1828236.69
RMSE train:  1352.12
R2_Score train 0.99


In [51]:
print("test MAE:", mean_absolute_error(y_test, gbrt_grid.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, gbrt_grid.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, gbrt_grid.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, gbrt_grid.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,gbrt_grid.predict(X_test)).round(2))  

test MAE: 1089.43
test MAPE: 6.0 %
test MSE: 14501098.59
Test RMSE:3808.0300
RS_SCORE 0.96


In [70]:
#Guardo el mejor modelo de Gradient, que finalmente es mi modelo elegido

with open('modelos/final/my_model','wb') as archivo_salida:
    pickle.dump(gbrt_grid,archivo_salida)

#### XGB

In [53]:
xgb_reg = xgboost.XGBRegressor(random_state=42)

xgb_reg.fit(X_train, y_train)
predictions_xgb = xgb_reg.predict(X_train)

In [6]:
xgb_reg = xgboost.XGBRegressor(random_state=42)

xgb_reg.fit(X_train.values, y_train)
predictions_xgb = xgb_reg.predict(X_train)

In [8]:
MAE_xgb = mean_absolute_error(y_train, predictions_xgb)
MAPE_xgb = mean_absolute_percentage_error(y_train,predictions_xgb)
MSE_xgb = mean_squared_error(y_train,predictions_xgb)
RMSE_xgb = np.sqrt(mean_squared_error(y_train,predictions_xgb))
RS_SCORE_xgb = r2_score(y_train,predictions_xgb)
print("MAE train: ", MAE_xgb.round(2))
print("MAPE train: ",MAPE_xgb.round(2)*100,"%")
print("MSE train: ", MSE_xgb.round(2))
print("RMSE train: ", RMSE_xgb.round(2))
print("R2_Score train",RS_SCORE_xgb.round(2))

MAE train:  480.94
MAPE train:  4.0 %
MSE train:  565255.12
RMSE train:  751.83
R2_Score train 1.0


In [7]:
print("test MAE:", mean_absolute_error(y_test, xgb_reg.predict(X_test)).round(2))
print("test MAPE:", mean_absolute_percentage_error(y_test, xgb_reg.predict(X_test)).round(2)*100, "%")
print("test MSE:", mean_squared_error(y_test, xgb_reg.predict(X_test)).round(2))
print("Test RMSE:%0.4f"% np.sqrt(mean_squared_error(y_test, xgb_reg.predict(X_test))).round(2))
print('RS_SCORE', r2_score(y_test,xgb_reg.predict(X_test)).round(2)) 

test MAE: 903.96
test MAPE: 5.0 %
test MSE: 17031170.04
Test RMSE:4126.8800
RS_SCORE 0.95


In [10]:
#Guardo el mejor modelo de xgb

with open('model/xgb_reg','wb') as archivo_salida:
    pickle.dump(xgb_reg,archivo_salida)

#### Agrupo los mejores resultados de todos los modelos probados en un dataframe

In [84]:
resultados = {'Métrica':['MAE','MAPE','MSE','RMSE','R2'],
            'Linear_R':[
            mean_absolute_error(y_test, reg_mod_2.predict(X_test)),
            mean_absolute_percentage_error(y_test, reg_mod_2.predict(X_test))*100,
            mean_squared_error(y_test, reg_mod_2.predict(X_test)),
            np.sqrt(mean_squared_error(y_test, reg_mod_2.predict(X_test))),
            r2_score(y_test,reg_mod_2.predict(X_test))],
            'Polynomial_R': [mean_absolute_error(y_test, pol_reg_test.predict(X_poly_test)),
                  mean_absolute_percentage_error(y_test, pol_reg_test.predict(X_poly_test))*100,
                  mean_squared_error(y_test, pol_reg_test.predict(X_poly_test)),
                  np.sqrt(mean_squared_error(y_test, pol_reg_test.predict(X_poly_test))),
                  r2_score(y_test,pol_reg_test.predict(X_poly_test))]
            
            }

resultados = pd.DataFrame(resultados)

resultados['Decision_Tree_Grid'] = [mean_absolute_error(y_test, grid_dtr.predict(X_test)),
                                mean_absolute_percentage_error(y_test, grid_dtr.predict(X_test))*100,
                                mean_squared_error(y_test, grid_dtr.predict(X_test)),
                                np.sqrt(mean_squared_error(y_test, grid_dtr.predict(X_test))),
                                r2_score(y_test,grid_dtr.predict(X_test))

]

resultados['KNN'] = [mean_absolute_error(y_test, knn.predict(X_test_scal)),
                    mean_absolute_percentage_error(y_test, knn.predict(X_test_scal))*100,
                    mean_squared_error(y_test, knn.predict(X_test_scal)),
                    np.sqrt(mean_squared_error(y_test, knn.predict(X_test_scal))),
                    r2_score(y_test,knn.predict(X_test_scal))  
]

resultados['SVM_Grid'] = [mean_absolute_error(y_test, svm_grid.predict(X_test_scal)),
                        mean_absolute_percentage_error(y_test, svm_grid.predict(X_test_scal))*100,
                        mean_squared_error(y_test, svm_grid.predict(X_test_scal)),
                        np.sqrt(mean_squared_error(y_test, svm_grid.predict(X_test_scal))),
                        r2_score(y_test,svm_grid.predict(X_test_scal))
                        ]

resultados['RFR_Grid'] = [mean_absolute_error(y_test, grid.predict(X_test)),
                        mean_absolute_percentage_error(y_test, grid.predict(X_test))*100,
                        mean_squared_error(y_test, grid.predict(X_test)),
                        np.sqrt(mean_squared_error(y_test, grid.predict(X_test))),
                        r2_score(y_test,grid.predict(X_test))
                        ]

resultados['ADA_Boost_Grid'] = [mean_absolute_error(y_test, grid_ada.predict(X_test)),
                                mean_absolute_percentage_error(y_test, grid_ada.predict(X_test))*100,
                                mean_squared_error(y_test, grid_ada.predict(X_test)),
                                np.sqrt(mean_squared_error(y_test, grid_ada.predict(X_test))),
                                r2_score(y_test,grid_ada.predict(X_test))
                                ]

resultados['Gradient_Boosting_Grid'] = [mean_absolute_error(y_test, gbrt_grid.predict(X_test)),
                                        mean_absolute_percentage_error(y_test, gbrt_grid.predict(X_test))*100,
                                        mean_squared_error(y_test, gbrt_grid.predict(X_test)),
                                        np.sqrt(mean_squared_error(y_test, gbrt_grid.predict(X_test))),
                                        r2_score(y_test,gbrt_grid.predict(X_test))]

resultados['xgb_grid'] = [
                    mean_absolute_error(y_test, xgb_reg.predict(X_test)),
                    mean_absolute_percentage_error(y_test, xgb_reg.predict(X_test))*100,
                    mean_squared_error(y_test, xgb_reg.predict(X_test)),
                    np.sqrt(mean_squared_error(y_test, xgb_reg.predict(X_test))),
                    r2_score(y_test,xgb_reg.predict(X_test))
]



resultados

Unnamed: 0,Métrica,Linear_R,Polynomial_R,Decision_Tree_Grid,KNN,SVM_Grid,RFR_Grid,ADA_Boost_Grid,Gradient_Boosting_Grid,xgb_grid
0,MAE,4768.59,2987.31,1979.23,3236.03,4458.96,1690.11,5381.22,1089.43,903.96
1,MAPE,52.63,35.38,9.94,19.02,40.54,8.68,61.11,6.5,4.54
2,MSE,117283074.77,25979246.74,35102027.57,55980274.91,141716795.62,22447050.25,87180701.39,14501098.59,17031170.04
3,RMSE,10829.73,5096.98,5924.7,7482.0,11904.49,4737.83,9337.06,3808.03,4126.88
4,R2,0.67,0.93,0.9,0.84,0.6,0.94,0.75,0.96,0.95


#### Analizando con otra métrica más (MAX_ERROR), para elegir entre el modelo Gradient Boosting y XGB

In [67]:
r_gradientvsxgb = {'Métrica': ['MAE','MAPE','MSE','RMSE','R2','MAX_ERROR'],
                'Gradient_Boosting_Grid':[mean_absolute_error(y_test, gbrt_grid.predict(X_test)),
                                        mean_absolute_percentage_error(y_test, gbrt_grid.predict(X_test))*100,
                                        mean_squared_error(y_test, gbrt_grid.predict(X_test)),
                                        np.sqrt(mean_squared_error(y_test, gbrt_grid.predict(X_test))),
                                        r2_score(y_test,gbrt_grid.predict(X_test)),
                                        max_error(y_test,gbrt_grid.predict(X_test))

                    
                ]}
r_gradientvsxgb = pd.DataFrame(r_gradientvsxgb)
r_gradientvsxgb['xgb_grid'] = [
                    mean_absolute_error(y_test, xgb_reg.predict(X_test)),
                    mean_absolute_percentage_error(y_test, xgb_reg.predict(X_test))*100,
                    mean_squared_error(y_test, xgb_reg.predict(X_test)),
                    np.sqrt(mean_squared_error(y_test, xgb_reg.predict(X_test))),
                    r2_score(y_test,xgb_reg.predict(X_test)),
                    max_error(y_test,xgb_reg.predict(X_test))
                    ]
r_gradientvsxgb

Unnamed: 0,Métrica,Gradient_Boosting_Grid,xgb_grid
0,MAE,1089.43,903.96
1,MAPE,6.5,4.54
2,MSE,14501098.59,17031170.04
3,RMSE,3808.03,4126.88
4,R2,0.96,0.95
5,MAX_ERROR,189550.43,194864.88


> Tras analizar ambas métricas con un parámetro más, decido quedarme con Gradient Boosting, ya que los errores máximos son menores. En este caso que nuestro modelo trata de la predicción de precios de venta de coches de segunnda, por lo cual se podría implementar en portales de compra ventas de coches, lo más interesante es que el error máximo sea el menor posible.

In [None]:
# Guardo el mejor modelo, listo para poner en producción

with open('model/final/my_model','wb') as archivo_salida:
    pickle.dump(gbrt_grid,archivo_salida)

In [15]:
with open('model/my_model', 'rb') as archivo_entrada:
    model = pickle.load(archivo_entrada)

In [16]:
model.predict(X_train)

array([16438.63852937, 12330.58108617,  8541.64338995, ...,
       22590.45365441,  4745.19829956,  1590.82146535])