# Librerias

In [1464]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils as u
import re
import geopandas as gpd
import ast

pd.options.display.max_columns = None

# Limpieza de Datos

In [1465]:
restautantes = pd.read_csv('../data/raw/lugares_madrid.csv')
detalle_restaurantes = pd.read_csv('../data/raw/detalle_sitios.csv')
restautantes = pd.merge(restautantes, detalle_restaurantes, left_on='id', right_on='place_id')

In [1466]:
restautantes.drop(['id', 'summary'], inplace=True, axis=1)
restautantes = restautantes[restautantes['rating'].notnull()]

In [1467]:
restautantes['dine_in'] = restautantes['dine_in'].astype(bool).fillna(True)
restautantes['price_level'] = restautantes['price_level'].fillna(1)
restautantes['reservable'] = restautantes['reservable'].astype(bool).fillna(False)
restautantes['serves_beer'] = restautantes['serves_beer'].astype(bool).fillna(True)
restautantes['serves_breakfast'] = restautantes['serves_breakfast'].astype(bool).fillna(False)
restautantes['serves_brunch'] = restautantes['serves_brunch'].astype(bool).fillna(False)
restautantes['serves_dinner'] = restautantes['serves_dinner'].astype(bool).fillna(True)
restautantes['serves_lunch'] = restautantes['serves_lunch'].astype(bool).fillna(True)
restautantes['serves_vegetarian_food'] = restautantes['serves_vegetarian_food'].astype(bool).fillna(False)
restautantes['serves_wine'] = restautantes['serves_wine'].astype(bool).fillna(True)
restautantes['takeout'] = restautantes['takeout'].astype(bool).fillna(False)
restautantes['delivery'] = restautantes['delivery'].astype(bool).fillna(False)

In [1468]:
restautantes_geo = gpd.GeoDataFrame(restautantes, geometry=gpd.points_from_xy(restautantes['lon'], restautantes['lat']), crs='EPSG:4326')

In [1469]:
barrios = gpd.read_file('../data/raw/Barrios.json')
barrios = gpd.GeoDataFrame(barrios, geometry='geometry', crs='EPSG:4326')

In [1470]:
restautantes_geo = gpd.sjoin(restautantes_geo, barrios, how='left', predicate='intersects')
restautantes_geo.drop(['index_right', 'id', 'Shape_Leng', 'Shape_Area','FCH_ALTA', 
                       'FCH_BAJA', 'OBSERVACIO', 'APROBACION', 'COD_DIS_TX', 'COD_DISB',
                       'BARRIO_MAY','BARRIO_MT', 'COD_DISBAR', 'NUM_BAR'], axis=1 , inplace=True)

In [1471]:
restautantes_geo = restautantes_geo.to_crs(epsg=25830) #cambiar lat y log a seudonimo en distancia en metros para España

In [1472]:
restautantes_geo_buffer = restautantes_geo.copy()
restautantes_geo_buffer['buffer_2000'] = restautantes_geo_buffer.geometry.buffer(2000)
restautantes_geo_buffer = restautantes_geo_buffer.set_geometry('buffer_2000')

In [1473]:
restautantes_geo.drop(['nombre','lat', 'lon', 'dine_in', 'address', 'reservable', 'serves_beer', 'serves_breakfast',
       'serves_brunch', 'serves_dinner', 'serves_lunch',
       'serves_vegetarian_food', 'serves_wine', 'takeout', 'delivery',
       'CODDIS', 'NOMDIS','COD_BAR', 'NOMBRE'], axis=1, inplace=True)

In [1474]:
result_restaurantes = gpd.sjoin(restautantes_geo, restautantes_geo_buffer, how='right', predicate='intersects')
result_restaurantes.drop('index_left', axis=1, inplace=True)

In [1475]:
result = result_restaurantes.groupby(['place_id_right','CODDIS', 'NOMDIS','COD_BAR', 'NOMBRE'])[['price_level_left', 'rating_left', 'user_ratings_total_left']].mean().reset_index()
result.rename({'place_id_right':'place_id',
               'price_level_left':'price_level_mean',
               'rating_left':'rating_mean',
               'user_ratings_total_left':'user_ratings_mean'}, axis=1, inplace=True)

In [1476]:
restautantes = pd.merge(restautantes, result, how='left', left_on='place_id', right_on='place_id')

In [1477]:
restautantes['COD_BAR'] = restautantes['COD_BAR'].astype('int')
restautantes['COD_BAR'] = restautantes['COD_BAR'].astype('str')

In [1478]:
kpi = pd.read_csv('../data/raw/kpi_barrios_madrid.csv')

In [1479]:
kpi['cod_barrio'] = kpi['cod_barrio'].astype('int')
kpi['cod_barrio'] = kpi['cod_barrio'].astype('str')

In [1480]:
kpi = kpi.pivot(index='cod_barrio', columns='indicador_completo', values='valor_indicador').reset_index()
kpi.fillna(0, inplace=True)

In [1481]:
restautantes = pd.merge(restautantes, kpi, how='left', left_on='COD_BAR', right_on='cod_barrio')

In [1482]:
street_map = pd.read_csv('../data/raw/sitios_streetmap.csv')

In [1483]:
street_map.drop('wheelchair', axis=1, inplace=True)
street_map = street_map[(street_map['cocina'].notnull())&(street_map['nombre'].notnull())]

In [1484]:
street_map['cocina'] = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i.lower()) for i in street_map['cocina']]

In [1485]:
with open('../data/raw/tipo_cocina.txt', 'r', encoding="utf-8") as file:
    data = file.read()

tipos_cocina = ast.literal_eval(data)

In [1486]:
street_map['cocina_map'] = street_map['cocina'].map(tipos_cocina)
street_map = street_map[street_map['cocina_map'].notnull()]

In [1487]:
street_map['nombre'] = street_map['nombre'].apply(u.eliminar_acentos)
street_map['nombre'] = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i.lower()) for i in street_map['nombre']]

In [1488]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X = street_map['nombre']
y = street_map['cocina_map']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42)

pipe = Pipeline(steps=[("scaler", CountVectorizer()),
    ('classifier', MultinomialNB())
])

logistic_params = {
    'scaler':[CountVectorizer()],
    'classifier': [LogisticRegression(max_iter=10000, solver='liblinear'), LogisticRegression(max_iter=10, solver='liblinear')],
    'classifier__penalty': ['l1', 'l2']
}

random_forest_params = {
    'scaler': [CountVectorizer()],
    'classifier': [RandomForestClassifier()],
    'classifier__max_depth': np.arange(2, 9),
    'classifier__n_estimators': [100, 200, 500],
}

naive_param = {
    'scaler': [CountVectorizer()],
    'classifier': [MultinomialNB()],
    'classifier__alpha': [0.001, 0.1, 0.25, 0.5, 0.75, 1],
}

search_space = [
    logistic_params,
    random_forest_params,
    naive_param
]

clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 5,
                  n_jobs=-1)

clf.fit(X_train, y_train)

In [1489]:
print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

Pipeline(steps=[('scaler', CountVectorizer()),
                ('classifier', MultinomialNB(alpha=0.25))])
0.734637434241119
{'classifier': MultinomialNB(), 'classifier__alpha': 0.25, 'scaler': CountVectorizer()}


In [1490]:
cocina_mod = clf.best_estimator_
y_pred = cocina_mod.predict(X_test)

cocina_mod.score(y_test, y_pred)

0.7666999002991027

In [1491]:
restautantes['nombre'] = restautantes['nombre'].apply(u.eliminar_acentos)
restautantes['nombre'] = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i.lower()) for i in restautantes['nombre']]

In [1492]:
restautantes['tipo_cocina']  = cocina_mod.predict(restautantes['nombre'])