# Librerias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost
from sklearn.svm import SVR

pd.options.display.max_columns = None

In [2]:
''' 
Lectura de datos de restaurantes de Madrid.
'''
restaurantes = pd.read_csv('../data/processed/restaurantes.csv')

In [3]:
for i in restaurantes.drop(['nombre_restaurante', 'place_id', 'direccion', 'tipo_cocina', 'y', 'rating', 'user_ratings_total'], axis=1).columns:
    print(f"'{i}',")


'lat',
'lon',
'dine_in',
'price_level',
'reservable',
'serves_beer',
'serves_breakfast',
'serves_brunch',
'serves_dinner',
'serves_lunch',
'serves_vegetarian_food',
'serves_wine',
'takeout',
'delivery',
'weelchair',
'hours_open',
'num_days_open',
'open_weekends',
'cod_distrito',
'cod_barrio',
'price_level_mean',
'rating_mean',
'user_ratings_mean',
'num_restaurantes',
'anio_medio_constr_vivendas',
'dur_media_credito_viviendas',
'edad_media_poblacion',
'num_locales_alta_abiertos',
'num_locales_alta_cerrados',
'poblacion_densidad',
'renta_media_persona',
'pct_crecimiento_demografico',
'valor_catast_inmueble_residen',
'Americana / Burgers',
'Asiática',
'China',
'Española',
'Fusión',
'Italiana',
'Japonesa',
'Latinoamericana',
'Mexicana',
'Otros',
'Arganzuela',
'Carabanchel',
'Centro',
'Chamartín',
'Chamberí',
'Ciudad Lineal',
'Fuencarral - El Pardo',
'Hortaleza',
'Latina',
'Moncloa - Aravaca',
'Moratalaz',
'Puente de Vallecas',
'Retiro',
'Salamanca',
'Tetuán',
'Usera',
'Abrantes',
'Acacias'

# Random Forest

In [21]:
X = restaurantes.drop(['nombre_restaurante', 'place_id', 'direccion', 'tipo_cocina', 'y', 'rating', 'user_ratings_total'], axis=1)

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestRegressor(random_state=42))])

rf_params = {
    'scaler': [MinMaxScaler(), 'passthrough'],
    'classifier': [RandomForestRegressor(random_state=42)],
    'classifier__max_depth': [7, 10],
    'classifier__min_samples_leaf': [20, 30],
    'classifier__n_estimators':[100, 150],
    'classifier__bootstrap':[True]
}


search_space = [
    rf_params
]

clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  verbose=2)

clf.fit(X_train, y_train)

print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END classifier=RandomForestRegressor(random_state=42), classifier__bootstrap=True, classifier__max_depth=7, classifier__min_samples_leaf=20, classifier__n_estimators=100, scaler=MinMaxScaler(); total time=   1.5s
[CV] END classifier=RandomForestRegressor(random_state=42), classifier__bootstrap=True, classifier__max_depth=7, classifier__min_samples_leaf=20, classifier__n_estimators=100, scaler=MinMaxScaler(); total time=   1.4s
[CV] END classifier=RandomForestRegressor(random_state=42), classifier__bootstrap=True, classifier__max_depth=7, classifier__min_samples_leaf=20, classifier__n_estimators=100, scaler=MinMaxScaler(); total time=   1.5s
[CV] END classifier=RandomForestRegressor(random_state=42), classifier__bootstrap=True, classifier__max_depth=7, classifier__min_samples_leaf=20, classifier__n_estimators=100, scaler=MinMaxScaler(); total time=   1.4s
[CV] END classifier=RandomForestRegressor(random_state=42), class

In [22]:
print(clf.best_estimator_.named_steps['classifier'].feature_importances_)
print(X.columns)

[1.85334800e-02 2.19722942e-02 0.00000000e+00 4.43278784e-01
 0.00000000e+00 0.00000000e+00 4.24206374e-02 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.38726118e-02 1.24395220e-03
 0.00000000e+00 2.64174880e-03 2.93583975e-03 2.68396161e-02
 7.47322645e-02 1.39376031e-02 3.76171067e-03 9.56361480e-03
 1.96139026e-02 2.55933337e-02 1.03968713e-01 1.86502267e-02
 7.12349095e-02 8.01158844e-03 1.51359538e-02 1.21239073e-02
 9.15932338e-03 9.46009710e-03 5.13077604e-03 9.95972525e-03
 6.37921622e-03 0.00000000e+00 0.00000000e+00 3.47521770e-04
 2.36074506e-03 0.00000000e+00 4.19074391e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 4.54065473e-04 0.00000000e+00
 0.00000000e+00 1.50942300e-03 6.56414518e-04 0.00000000e+00
 0.00000000e+00 2.04655632e-04 0.00000000e+00 0.00000000e+00
 8.54574826e-05 0.00000000e+00 5.59726176e-05 0.00000000e+00
 3.32435203e-05 2.76873365e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.000000

In [23]:
best1 = clf.best_estimator_
predictions_best1 = best1.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best1))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best1))
print("MSE test", mean_squared_error(y_test, predictions_best1))
print("RMSE test", mean_squared_error(y_test, predictions_best1)**(1/2))
print("R2 score", r2_score(y_test, predictions_best1))

MAE test 4.871883381434369
MAPE test 0.3531407898336333
MSE test 41.34774008704333
RMSE test 6.430220842789408
R2 score 0.3677230112330856


In [24]:
filename = '../models/1_randomforest_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best1, archivo_salida)

# Reg Lineales

In [32]:
X = restaurantes[['serves_breakfast', 
                  'parados', 
                  'dur_media_credito_viviendas', 
                  'poblacion_80_mas',
                  'poblacion_china',
                  'pct_crecimiento_demografico',
                  'rating_mean',
                  'poblacion_italia',
                  'user_ratings_mean',
                  'price_level']]

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('classifier', LinearRegression())])

linear_params = {
    'poly__degree':[1],
    'scaler':[MinMaxScaler(), StandardScaler()],
    'classifier': [LinearRegression()]
}

regularizacion_params = {
    'poly__degree':[1, 2, 3, 4, 5],
    'scaler': [MinMaxScaler(), StandardScaler()],
    'classifier': [Ridge(), Lasso()],
    'classifier__alpha': [0.25, 0,75, 0.80, 0.90, 1, 100]
}

elastic_param = {
    'poly__degree':[1, 2, 3, 4, 5],
    'scaler': [MinMaxScaler(), StandardScaler()],
    'classifier': [ElasticNet()],
    'classifier__alpha': [0.25, 0,75, 0.80, 0.90, 1, 100],
    'classifier__l1_ratio': [0.1, 0.25, 0.50, 0.75, 0.80, 1]
}

search_space = [
    linear_params,
    regularizacion_params,
    elastic_param
]

clf2 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1,
                  verbose=2)

clf2.fit(X_train, y_train)

print(clf2.best_estimator_)
print(clf2.best_score_)
print(clf2.best_params_)

Fitting 10 folds for each of 562 candidates, totalling 5620 fits
Pipeline(steps=[('poly', PolynomialFeatures()), ('scaler', MinMaxScaler()),
                ('classifier', Ridge(alpha=0.25))])
-5.081940554111285
{'classifier': Ridge(), 'classifier__alpha': 0.25, 'poly__degree': 2, 'scaler': MinMaxScaler()}


In [34]:
best2 = clf2.best_estimator_
predictions_best2 = best2.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best2))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best2))
print("MSE test", mean_squared_error(y_test, predictions_best2))
print("RMSE test", mean_squared_error(y_test, predictions_best2)**(1/2))
print("R2 score", r2_score(y_test, predictions_best2))

MAE test 5.209969983609404
MAPE test 0.41869564397943093
MSE test 44.72980831554107
RMSE test 6.688034712495224
R2 score 0.33632876074056617


In [35]:
filename = '../models/2_ridge_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best2, archivo_salida)

# Gradient Boosting

In [39]:
X = restaurantes[['serves_breakfast', 
                  'parados', 
                  'dur_media_credito_viviendas', 
                  'poblacion_80_mas',
                  'poblacion_china',
                  'pct_crecimiento_demografico',
                  'rating_mean',
                  'poblacion_italia',
                  'user_ratings_mean',
                  'price_level',
                  'tipo_cocina_encoder',
                  'cod_barrio'
                  ]]

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingRegressor(random_state=42))])

gboost_param = {
    'scaler': [StandardScaler(), 'passthrough'],
    'classifier': [GradientBoostingRegressor(random_state=42)],
    'classifier__learning_rate': [0.25, 0.3, 0.5],
    'classifier__max_depth': [3, 4],
    'classifier__min_samples_leaf': [20, 30, 40],
    'classifier__n_estimators':[100]
}


search_space = [
    gboost_param
]

clf3 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1,
                  verbose=3)

clf3.fit(X_train, y_train)

print(clf3.best_estimator_)
print(clf3.best_score_)
print(clf3.best_params_)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 GradientBoostingRegressor(learning_rate=0.25,
                                           min_samples_leaf=30,
                                           random_state=42))])
-5.1158186135385435
{'classifier': GradientBoostingRegressor(random_state=42), 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 30, 'classifier__n_estimators': 100, 'scaler': StandardScaler()}


In [44]:
print(clf3.best_estimator_.named_steps['classifier'].feature_importances_)
print(X.columns)

[0.07065508 0.02601208 0.01502961 0.02211229 0.00401168 0.02467589
 0.1099545  0.03062598 0.22621286 0.42669326 0.02886445 0.01515232]
Index(['serves_breakfast', 'parados', 'dur_media_credito_viviendas',
       'poblacion_80_mas', 'poblacion_china', 'pct_crecimiento_demografico',
       'rating_mean', 'poblacion_italia', 'user_ratings_mean', 'price_level',
       'tipo_cocina_encoder', 'cod_barrio'],
      dtype='object')


In [40]:
best3 = clf3.best_estimator_
predictions_best3 = best3.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best3))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best3))
print("MSE test", mean_squared_error(y_test, predictions_best3))
print("RMSE test", mean_squared_error(y_test, predictions_best3)**(1/2))
print("R2 score", r2_score(y_test, predictions_best3))

MAE test 5.27140560689094
MAPE test 0.4081054046113165
MSE test 46.29318811759027
RMSE test 6.803909767008251
R2 score 0.31313236778176456


In [41]:
filename = '../models/3_gradient_boost_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best3, archivo_salida)

# Xboost

In [48]:
X = restaurantes[['serves_breakfast', 
                  'parados', 
                  'dur_media_credito_viviendas', 
                  'poblacion_80_mas',
                  'poblacion_china',
                  'pct_crecimiento_demografico',
                  'rating_mean',
                  'poblacion_italia',
                  'user_ratings_mean',
                  'price_level',
                  'tipo_cocina_encoder',
                  'cod_barrio'
                  ]]

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', xgboost.XGBRegressor())])

xboost_param = {
    'scaler': [MinMaxScaler(), StandardScaler(), 'passthrough'],
    'classifier': [xgboost.XGBRegressor()],
    'classifier__learning_rate': [0.25, 0.75, 1],
    'classifier__max_depth': [4, 5, 6, 7],
    'classifier__min_child_weight': [4, 5, 7],
    'classifier__n_estimators':[100]
}


search_space = [
    xboost_param
]

clf4 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1,
                  verbose=3)

clf4.fit(X_train, y_train)

print(clf4.best_estimator_)
print(clf4.best_score_)
print(clf4.best_params_)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, feature_weights=None,
                              gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.25,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=4, max_leaves=None, min_child_weight=5,
                     

In [49]:
print(clf4.best_estimator_.named_steps['classifier'].feature_importances_)
print(X.columns)

[0.11709094 0.02775789 0.02901516 0.02960814 0.0353903  0.03775423
 0.02490809 0.03277757 0.04551588 0.5713765  0.02802337 0.02078198]
Index(['serves_breakfast', 'parados', 'dur_media_credito_viviendas',
       'poblacion_80_mas', 'poblacion_china', 'pct_crecimiento_demografico',
       'rating_mean', 'poblacion_italia', 'user_ratings_mean', 'price_level',
       'tipo_cocina_encoder', 'cod_barrio'],
      dtype='object')


In [50]:
best4 = clf4.best_estimator_
predictions_best4 = best4.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best4))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best4))
print("MSE test", mean_squared_error(y_test, predictions_best4))
print("RMSE test", mean_squared_error(y_test, predictions_best4)**(1/2))
print("R2 score", r2_score(y_test, predictions_best4))

MAE test 5.321078056766567
MAPE test 0.3989015305796611
MSE test 48.150308346763005
RMSE test 6.939042322018436
R2 score 0.2855776491195654


In [51]:
filename = '../models/4_xboost_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best4, archivo_salida)

# SVM

In [52]:
X = restaurantes[['serves_breakfast', 
                  'parados', 
                  'dur_media_credito_viviendas', 
                  'poblacion_80_mas',
                  'poblacion_china',
                  'pct_crecimiento_demografico',
                  'rating_mean',
                  'poblacion_italia',
                  'user_ratings_mean',
                  'price_level',
                  'tipo_cocina_encoder',
                  'cod_barrio'
                  ]]

y = restaurantes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVR())])

SVR_param = {
    'scaler': [MinMaxScaler(), StandardScaler(), 'passthrough'],
    'classifier': [SVR()],
    'classifier__kernel': ['linear', 'poly', 'rbf'],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__degree': [2, 3, 4, 5],
    'classifier__C':[0.5, 1, 10, 50, 100],
    'classifier__max_iter': [10000, 50000]
}


search_space = [
    SVR_param
]

clf5 = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs=-1,
                  verbose=3)

clf5.fit(X_train, y_train)

print(clf5.best_estimator_)
print(clf5.best_score_)
print(clf5.best_params_)

Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 SVR(C=10, degree=2, kernel='poly', max_iter=50000))])
-5.017888992178857
{'classifier': SVR(), 'classifier__C': 10, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'poly', 'classifier__max_iter': 50000, 'scaler': MinMaxScaler()}


In [54]:
best5 = clf5.best_estimator_
predictions_best5 = best5.predict(X_test)

print("MAE test", mean_absolute_error(y_test, predictions_best5))
print("MAPE test", mean_absolute_percentage_error(y_test, predictions_best5))
print("MSE test", mean_squared_error(y_test, predictions_best5))
print("RMSE test", mean_squared_error(y_test, predictions_best5)**(1/2))
print("R2 score", r2_score(y_test, predictions_best5))

MAE test 5.107285093006351
MAPE test 0.43212798050297707
MSE test 45.17899883149742
RMSE test 6.7215324764146915
R2 score 0.3296639696847825


In [55]:
filename = '../models/5_srv_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best5, archivo_salida)
