# Librerias

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest,RFE, f_regression
from sklearn.decomposition import PCA
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)
pd.options.display.max_columns = None

In [95]:
''' 
Lectura de datos de restaurantes de Madrid.
'''
restaurantes = pd.read_csv('../data/processed/restaurantes.csv')

In [96]:
restaurantes.drop(['nombre_restaurante', 'place_id', 
                   'direccion', 'tipo_cocina',
                   'rating', 'user_ratings_total'
                   ], inplace=True, axis=1)

# Reg Lineales

In [6]:
X = restaurantes.drop('y', axis=1)

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('pca', PCA(n_components=100)),
    ('classifier', LinearRegression())
])

linear_params = {
    'scaler':[MinMaxScaler(), StandardScaler(), 'passthrough'],
    'poly__degree':[1],
    'pca__n_components':np.arange(1, 172, 5),
    'classifier': [LinearRegression()]
}

regularizacion_params = {
    'scaler': [MinMaxScaler(), StandardScaler(), 'passthrough'],
    'poly__degree':[1, 2, 3, 4, 5],
    'pca__n_components':np.arange(1, 172, 5),
    'classifier': [Ridge(), Lasso()],
    'classifier__alpha': [0.25, 0,75, 0.90, 1, 100]
}

elastic_param = {
    'scaler': [MinMaxScaler(), StandardScaler(), 'passthrough'],
    'poly__degree':[1, 2, 3, 4, 5],
    'pca__n_components':np.arange(1, 172, 5),
    'classifier': [ElasticNet()],
    'classifier__alpha': [0.25, 0,75, 0.90, 1, 100],
    'classifier__l1_ratio': [0.1, 0.25, 0.50, 0.75, 0.80, 1]
}

search_space = [
    linear_params,
    regularizacion_params,
    elastic_param
]

clf2 = RandomizedSearchCV(estimator = pipe,
                  param_distributions = search_space,
                  n_iter=100,
                  scoring='neg_mean_absolute_error',
                  cv = 5,
                  verbose=2)

clf2.fit(X_train, y_train)

print(clf2.best_estimator_)
print(clf2.best_score_)
print(clf2.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END classifier=ElasticNet(), classifier__alpha=1, classifier__l1_ratio=0.5, pca__n_components=81, poly__degree=3, scaler=StandardScaler(); total time= 2.0min
[CV] END classifier=ElasticNet(), classifier__alpha=1, classifier__l1_ratio=0.5, pca__n_components=81, poly__degree=3, scaler=StandardScaler(); total time= 1.8min
[CV] END classifier=ElasticNet(), classifier__alpha=1, classifier__l1_ratio=0.5, pca__n_components=81, poly__degree=3, scaler=StandardScaler(); total time= 1.8min
[CV] END classifier=ElasticNet(), classifier__alpha=1, classifier__l1_ratio=0.5, pca__n_components=81, poly__degree=3, scaler=StandardScaler(); total time= 1.8min
[CV] END classifier=ElasticNet(), classifier__alpha=1, classifier__l1_ratio=0.5, pca__n_components=81, poly__degree=3, scaler=StandardScaler(); total time= 1.8min
[CV] END classifier=ElasticNet(), classifier__alpha=75, classifier__l1_ratio=0.1, pca__n_components=11, poly__degree=1, sc

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=1, poly__degree=1, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=1, poly__degree=1, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=1, poly__degree=1, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=1, poly__degree=1, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.75, pca__n_components=76, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.75, pca__n_components=76, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=156, poly__degree=3, scaler=passthrough; total time= 2.8min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=156, poly__degree=3, scaler=passthrough; total time= 2.9min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=156, poly__degree=3, scaler=passthrough; total time= 2.8min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=156, poly__degree=3, scaler=passthrough; total time= 3.0min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=156, poly__degree=3, scaler=passthrough; total time= 2.7min
[CV] END classifier=ElasticNet(), classifier__alpha=75, classifier__l1_ratio=0.5, pca__n_components=96, poly__degree=4, scaler=MinMaxScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=75, classifier__l1_ratio=0.5, pca__n_components=96, poly__degree=4, scaler=MinMaxScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=75, classifier__l1_ratio=0.5, pca__n_components=96, poly__degree=4, scaler=MinMaxScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=75, classifier__l1_ratio=0.5, pca__n_components=96, poly__degree=4, scaler=MinMaxScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=75, classifier__l1_ratio=0.5, pca__n_components=96, poly__degree=4, scaler=MinMaxScaler(); total time=   0.0s
[CV] END classifier=ElasticNet

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=81, poly__degree=1, scaler=passthrough; total time=   0.2s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=81, poly__degree=1, scaler=passthrough; total time=   0.1s


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=81, poly__degree=1, scaler=passthrough; total time=   0.2s


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=81, poly__degree=1, scaler=passthrough; total time=   0.1s
[CV] END classifier=Ridge(), classifier__alpha=100, pca__n_components=156, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=100, pca__n_components=156, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=100, pca__n_components=156, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=100, pca__n_components=156, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=100, pca__n_components=156, poly__degree=5, scaler=passthrough; total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=0.9, pca__n_components=106, poly__degree=2, scaler=passthrough; total time=   2.6s
[CV] END classifier=Ridge(), classifier__alpha=0.9, pca__n_com

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=76, poly__degree=3, scaler=passthrough; total time= 1.5min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=76, poly__degree=3, scaler=passthrough; total time= 1.6min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=76, poly__degree=3, scaler=passthrough; total time= 1.5min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=76, poly__degree=3, scaler=passthrough; total time= 1.5min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.8, pca__n_components=76, poly__degree=3, scaler=passthrough; total time= 1.5min
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=36, poly__degree=4, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=36, poly__degree=4, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=36, poly__degree=4, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=36, poly__degree=4, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=36, poly__degree=4, scaler=passthrough; total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=3, scaler=StandardScaler(); total time= 1.2min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=3, scaler=StandardScaler(); total time= 1.2min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=3, scaler=StandardScaler(); total time= 1.2min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=3, scaler=StandardScaler(); total time= 1.2min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=3, scaler=StandardScaler(); total time= 1.2min
[CV] END classifier=Ridge(), classifier__alpha=75, pca__n_components=66, poly__degree=5, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=75, pca__n_components=66, poly__degree=5, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=75, pca__n_components=66, poly__degree=5, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=75, pca__n_components=66, poly__degree=5, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=Ridge(), classifier__alpha=75, pca__n_components=66, poly__degree=5, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=1, pca__n_components=41, poly__degree=3, scaler=passthrough; total time= 1.1min
[CV] END class

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=1, poly__degree=3, scaler=StandardScaler(); total time=  37.9s


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=1, poly__degree=3, scaler=StandardScaler(); total time=  38.9s


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=1, poly__degree=3, scaler=StandardScaler(); total time=  38.9s


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=1, poly__degree=3, scaler=StandardScaler(); total time=  38.0s


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.25, pca__n_components=1, poly__degree=3, scaler=StandardScaler(); total time=  39.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.25, classifier__l1_ratio=0.8, pca__n_components=46, poly__degree=3, scaler=passthrough; total time= 1.1min
[CV] END classifier=ElasticNet(), classifier__alpha=0.25, classifier__l1_ratio=0.8, pca__n_components=46, poly__degree=3, scaler=passthrough; total time= 1.1min
[CV] END classifier=ElasticNet(), classifier__alpha=0.25, classifier__l1_ratio=0.8, pca__n_components=46, poly__degree=3, scaler=passthrough; total time= 1.1min
[CV] END classifier=ElasticNet(), classifier__alpha=0.25, classifier__l1_ratio=0.8, pca__n_components=46, poly__degree=3, scaler=passthrough; total time= 1.1min
[CV] END classifier=ElasticNet(), classifier__alpha=0.25, classifier__l1_ratio=0.8, pca__n_components=46, poly__degree=3, scaler=passthrough; total time= 1.1min
[CV] END classifier=Ridge(), cla

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=161, poly__degree=3, scaler=passthrough; total time= 2.8min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=161, poly__degree=3, scaler=passthrough; total time= 2.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=161, poly__degree=3, scaler=passthrough; total time= 2.8min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=161, poly__degree=3, scaler=passthrough; total time= 2.8min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=161, poly__degree=3, scaler=passthrough; total time= 2.8min
[CV] END classifier=ElasticNet(), classifier__alpha=100, classifier__l1_ratio=0.1, pca__n_components=36, poly__degree=2, scaler=StandardScaler(); total time=   1.0s
[CV] END classifier=ElasticNet(), classifier__alpha=100, classifier__l1_ratio=0.1, pca__n_components=36, poly__degree=2, scaler=StandardScaler(); total time=   1.0s
[CV] END classifier=ElasticNet(), classifier__alpha=100, classifier__l1_ratio=0.1, pca__n_components=36, poly__degree=2, scaler=StandardScaler(); total time=   1.0s
[CV] END classifier=ElasticNet(), classifier__alpha=100, classifier__l1_ratio=0.1, pca__n_components=36, poly__degree=2, scaler=StandardScaler(); total time=   0.9s
[CV] END classifier=ElasticNet(), classifier__alpha=100, classifier__l1_ratio=0.1, pca__n_components=36, poly__degree=2, scaler=StandardScaler(); total time=   1.0s
[CV] END classif

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=41, poly__degree=3, scaler=MinMaxScaler(); total time= 1.1min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=41, poly__degree=3, scaler=MinMaxScaler(); total time= 1.1min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=41, poly__degree=3, scaler=MinMaxScaler(); total time= 1.1min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=41, poly__degree=3, scaler=MinMaxScaler(); total time= 1.1min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END classifier=ElasticNet(), classifier__alpha=0, classifier__l1_ratio=0.5, pca__n_components=41, poly__degree=3, scaler=MinMaxScaler(); total time= 1.1min
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=4, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=4, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=4, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=4, scaler=StandardScaler(); total time=   0.0s
[CV] END classifier=ElasticNet(), classifier__alpha=0.9, classifier__l1_ratio=0.1, pca__n_components=51, poly__degree=4, scaler=StandardScaler(); total time=   0.0s
[CV] END class

225 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\karli\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\karli\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\karli\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(degree=3, include_bias=False)),
                ('pca', PCA(n_components=np.int64(71))),
                ('classifier', Ridge(alpha=0))])
-5.245820759955924
{'scaler': MinMaxScaler(), 'poly__degree': 3, 'pca__n_components': np.int64(71), 'classifier__alpha': 0, 'classifier': Ridge()}


In [7]:
best2 = clf2.best_estimator_
predictions_best2 = best2.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best2))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best2))
print("MSE test", mean_squared_error(y_test, predictions_best2))
print("RMSE test", mean_squared_error(y_test, predictions_best2)**(1/2))
print("R2 score", r2_score(y_test, predictions_best2))

MAE test 5.0601442703441455
MAPE test 0.3520906381606369
MSE test 41.27723167736816
RMSE test 6.42473592277287
R2 score 0.31507579193914714


In [8]:
filename = '../models/2_ridge_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best2, archivo_salida)

# Random Forest

In [74]:
X = restaurantes.drop('y', axis=1)

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=104)),
    ('classifier', RandomForestRegressor(random_state=42))])

rf_params = {
    'scaler': [MinMaxScaler(), StandardScaler(), 'passthrough'],
    'classifier': [RandomForestRegressor(random_state=42)],
    'classifier__max_depth': [7, 5, 10],
    'classifier__min_samples_leaf': [20, 30]
}


search_space = [
    rf_params
]

clf  = RandomizedSearchCV(estimator = pipe,
                  param_distributions= search_space,
                  n_iter= 150,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1)

clf.fit(X_train, y_train)

print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)



Pipeline(steps=[('scaler', 'passthrough'), ('pca', PCA(n_components=104)),
                ('classifier',
                 RandomForestRegressor(max_depth=10, min_samples_leaf=20,
                                       random_state=42))])
-5.1237277382022475
{'scaler': 'passthrough', 'classifier__min_samples_leaf': 20, 'classifier__max_depth': 10, 'classifier': RandomForestRegressor(random_state=42)}


In [75]:
print(clf.best_estimator_.named_steps['classifier'].feature_importances_)
print(X.columns)

[0.11705673 0.12726154 0.01540085 0.00352184 0.00886677 0.00667969
 0.00209989 0.00528339 0.0033973  0.0029698  0.00583565 0.00775611
 0.00255515 0.00247216 0.01066868 0.21127753 0.01591742 0.00580578
 0.02606705 0.0588527  0.00647696 0.01264301 0.01714871 0.005765
 0.00565056 0.01255987 0.01109874 0.00637508 0.0067054  0.00897316
 0.00775487 0.00247513 0.00239825 0.00388465 0.00298157 0.00748951
 0.00415118 0.00345798 0.00395436 0.0059711  0.00379282 0.00362848
 0.00266794 0.00312955 0.00246715 0.00325625 0.00326978 0.00205561
 0.00288698 0.00754531 0.00489576 0.00199871 0.00253758 0.00138319
 0.00226819 0.00163202 0.00295794 0.00232464 0.00300753 0.00189977
 0.0026164  0.00317404 0.00208828 0.0027764  0.00226719 0.00114753
 0.008643   0.00261082 0.00197635 0.01033944 0.00303457 0.00769186
 0.00908294 0.00267035 0.00231703 0.00376911 0.00563454 0.00218209
 0.00391954 0.00201087 0.00487494 0.00376038 0.00272441 0.00554069
 0.00490353 0.00461545 0.00162261 0.00287112 0.0019495  0.003719

In [77]:
best1 = clf.best_estimator_
predictions_best1 = best1.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best1))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best1))
print("MSE test", mean_squared_error(y_test, predictions_best1))
print("RMSE test", mean_squared_error(y_test, predictions_best1)**(1/2))
print("R2 score", r2_score(y_test, predictions_best1))

MAE test 5.05098047620047
MAPE test 0.34651323761188374
MSE test 40.96849594417946
RMSE test 6.400663711223975
R2 score 0.3201987270043466


In [78]:
filename = '../models/1_randomforest_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best1, archivo_salida)

# Gradient Boosting

In [28]:
X = restaurantes[['serves_breakfast', 
                  'tasa_parados', 
                  'dur_media_credito_viviendas', 
                  'poblacion_80_mas',
                  'poblacion_china',
                  'pct_crecimiento_demografico',
                  'rating_mean',
                  'poblacion_italia',
                  'user_ratings_mean',
                  'price_level',
                  'tipo_cocina_encoder',
                  'cod_barrio'
                  ]]

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingRegressor(random_state=42))])

gboost_param = {
    'scaler': [StandardScaler(), 'passthrough'],
    'classifier': [GradientBoostingRegressor(random_state=42)],
    'classifier__learning_rate': [0.25, 0.3, 0.5],
    'classifier__max_depth': [3, 4],
    'classifier__min_samples_leaf': [20, 30, 40],
    'classifier__n_estimators':[100]
}


search_space = [
    gboost_param
]

clf3 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1,
                  verbose=3)

clf3.fit(X_train, y_train)

print(clf3.best_estimator_)
print(clf3.best_score_)
print(clf3.best_params_)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
Pipeline(steps=[('scaler', 'passthrough'),
                ('classifier',
                 GradientBoostingRegressor(learning_rate=0.25,
                                           min_samples_leaf=40,
                                           random_state=42))])
-5.148710540235394
{'classifier': GradientBoostingRegressor(random_state=42), 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 40, 'classifier__n_estimators': 100, 'scaler': 'passthrough'}


In [31]:
print(clf3.best_estimator_.named_steps['classifier'].feature_importances_)
print(X.columns)

[0.06282067 0.02590067 0.01202272 0.03550269 0.00720897 0.01821853
 0.08220218 0.0621866  0.18411932 0.44264033 0.02660307 0.04057425]
Index(['serves_breakfast', 'tasa_parados', 'dur_media_credito_viviendas',
       'poblacion_80_mas', 'poblacion_china', 'pct_crecimiento_demografico',
       'rating_mean', 'poblacion_italia', 'user_ratings_mean', 'price_level',
       'tipo_cocina_encoder', 'cod_barrio'],
      dtype='object')


In [32]:
best3 = clf3.best_estimator_
predictions_best3 = best3.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best3))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best3))
print("MSE test", mean_squared_error(y_test, predictions_best3))
print("RMSE test", mean_squared_error(y_test, predictions_best3)**(1/2))
print("R2 score", r2_score(y_test, predictions_best3))

MAE test 5.08366745233101
MAPE test 0.3542661315136399
MSE test 42.50152862781548
RMSE test 6.519319644549996
R2 score 0.29476070332635573


In [33]:
filename = '../models/3_gradient_boost_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best3, archivo_salida)

# Xboost

In [46]:
X = restaurantes.drop('y', axis=1)

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('select_features', SelectKBest(score_func=f_regression)),
    ('classifier', xgboost.XGBRegressor())])

xboost_param = {
    'select_features__k':[50, 100, 120, 'all'],
    'classifier__learning_rate': [0.25, 0.75, 1],
    'classifier__max_depth': [4, 5, 6, 7],
    'classifier__min_child_weight': [4, 5, 7],
    'classifier__n_estimators':[100]
}


search_space = [
    xboost_param
]

clf4 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1,
                  verbose=3)

clf4.fit(X_train, y_train)

print(clf4.best_estimator_)
print(clf4.best_score_)
print(clf4.best_params_)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Pipeline(steps=[('select_features',
                 SelectKBest(k='all',
                             score_func=<function f_regression at 0x0000027DE7DB5A80>)),
                ('classifier',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, f...ghts=None,
                              gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.25,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_

In [65]:
print(clf4.best_estimator_.named_steps['classifier'].feature_importances_)
print(X.columns)

[0.05448301 0.04342006 0.01170791 0.00535932 0.01161539 0.02621751
 0.00281791 0.01098142 0.00703253 0.00555892 0.00673646 0.00914079
 0.00870098 0.00543663 0.01187231 0.05595296 0.01417166 0.00716517
 0.01304254 0.02140538 0.01118424 0.00614812 0.02505677 0.00632774
 0.00535607 0.01041254 0.01382921 0.00360219 0.01458952 0.00770539
 0.00653169 0.0078293  0.01099748 0.00610852 0.00526752 0.01856276
 0.00613491 0.00725096 0.00597583 0.0113606  0.00802727 0.00595477
 0.00238882 0.00537988 0.00843489 0.00852711 0.01217137 0.00529872
 0.00974892 0.01013396 0.00709702 0.00807458 0.00355549 0.00583404
 0.00518821 0.00785551 0.0051204  0.0108516  0.01133472 0.0038774
 0.01197184 0.01556842 0.00725678 0.00860488 0.00561377 0.00824463
 0.00849354 0.00903673 0.00233407 0.02972311 0.00524215 0.00717178
 0.00767631 0.00337085 0.01091161 0.00608742 0.00813173 0.00788212
 0.00715408 0.00953597 0.00446072 0.00760627 0.00311008 0.01482668
 0.01091979 0.00317664 0.00803535 0.00884646 0.00622695 0.00498

In [67]:
best4 = clf4.best_estimator_
predictions_best4 = best4.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best4))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best4))
print("MSE test", mean_squared_error(y_test, predictions_best4))
print("RMSE test", mean_squared_error(y_test, predictions_best4)**(1/2))
print("R2 score", r2_score(y_test, predictions_best4))

MAE test 5.283738803446602
MAPE test 0.34878801669607584
MSE test 44.7541604799365
RMSE test 6.689855041773066
R2 score 0.257382177086366


In [53]:
filename = '../models/4_xboost_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best4, archivo_salida)

# SVM

In [None]:
X = restaurantes.drop(['y', 'cod_distrito', 'tipo_cocina_encoder', 'cod_barrio'], axis=1)
y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVR())])

SVR_param = {
    'scaler': [MinMaxScaler(), StandardScaler()],
    'classifier__kernel': ['linear', 'poly', 'rbf'],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__degree': [2, 3, 4, 5],
    'classifier__C':[0.5, 1, 10, 50, 100],
    'classifier__max_iter': [1000000]
}


search_space = [
    SVR_param
]

clf5 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1)

clf5.fit(X_train, y_train)

print(clf5.best_estimator_)
print(clf5.best_score_)
print(clf5.best_params_)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier', SVR(C=10, kernel='poly', max_iter=500000))])
-4.897241058747725
{'classifier__C': 10, 'classifier__degree': 3, 'classifier__gamma': 'scale', 'classifier__kernel': 'poly', 'classifier__max_iter': 500000, 'scaler': MinMaxScaler()}


In [104]:
best5 = clf5.best_estimator_
predictions_best5 = best5.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best5))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best5))
print("MSE test", mean_squared_error(y_test, predictions_best5))
print("RMSE test", mean_squared_error(y_test, predictions_best5)**(1/2))
print("R2 score", r2_score(y_test, predictions_best5))

MAE test 4.653434656363074
MAPE test 0.3266049534036448
MSE test 36.95694929247194
RMSE test 6.07922275397702
R2 score 0.3867633996305495


In [105]:
filename = '../models/final_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best5, archivo_salida)


# Catboost

In [85]:
X = restaurantes.drop(list(restaurantes.iloc[:, 57:172].columns)+['y'], axis=1)

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),  
    ('rfe', RFE(estimator=LinearRegression(), n_features_to_select=30)),
    ('pca', PCA(n_components=15)),  
    ('classifier', CatBoostRegressor(random_state=42))
])


cat_grid = {
    'scaler':[MinMaxScaler(), StandardScaler(), 'passthrough'],
    'rfe__n_features_to_select': [20, 30, 40],
    'pca__n_components': [10, 15, 20],
    'classifier__depth': [4, 6],
    'classifier__learning_rate': [0.03, 0.1],
    'classifier__iterations': [100, 200]
}

search_space = [
    cat_grid
]

clf6 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 3,
                  n_jobs=-1)

clf6.fit(X_train, y_train)

print(clf6.best_estimator_)
print(clf6.best_score_)
print(clf6.best_params_)

0:	learn: 7.8842772	total: 5.46ms	remaining: 541ms
1:	learn: 7.6956730	total: 9.7ms	remaining: 475ms
2:	learn: 7.5179772	total: 14.7ms	remaining: 474ms
3:	learn: 7.3795071	total: 19ms	remaining: 456ms
4:	learn: 7.2342170	total: 23.5ms	remaining: 446ms
5:	learn: 7.1223240	total: 27.7ms	remaining: 433ms
6:	learn: 7.0255973	total: 32.4ms	remaining: 431ms
7:	learn: 6.9328243	total: 36.7ms	remaining: 422ms
8:	learn: 6.8449508	total: 41.1ms	remaining: 415ms
9:	learn: 6.7760732	total: 45.5ms	remaining: 409ms
10:	learn: 6.7105825	total: 49.7ms	remaining: 402ms
11:	learn: 6.6435156	total: 53.7ms	remaining: 394ms
12:	learn: 6.5974678	total: 57.7ms	remaining: 386ms
13:	learn: 6.5562891	total: 62.8ms	remaining: 386ms
14:	learn: 6.5142969	total: 67.6ms	remaining: 383ms
15:	learn: 6.4761188	total: 72.3ms	remaining: 380ms
16:	learn: 6.4502704	total: 76.4ms	remaining: 373ms
17:	learn: 6.4026112	total: 80.7ms	remaining: 368ms
18:	learn: 6.3624132	total: 84.9ms	remaining: 362ms
19:	learn: 6.3370243	tota

In [86]:
best6 = clf6.best_estimator_
predictions_best6 = best6.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best6))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best6))
print("MSE test", mean_squared_error(y_test, predictions_best6))
print("RMSE test", mean_squared_error(y_test, predictions_best6)**(1/2))
print("R2 score", r2_score(y_test, predictions_best6))

MAE test 4.956151165587531
MAPE test 0.33489489288503793
MSE test 40.95015393064442
RMSE test 6.399230729599021
R2 score 0.3205030809686077


In [87]:
filename = '../models/6_catboost_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best3, archivo_salida)