# Modelo de predicción de restaurantes

Importamos las librerías necesarias para el modelo.
Obtenemos los datos de la base de datos de BigQuery y los cargamos en DataFrames de pandas.

In [11]:
import os
from google.cloud import bigquery
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
def run_query(query):
    """Run a SQL statement on BigQuery and return the result as a DataFrame.

    Relies on the module-level ``client``; the name is resolved when the
    function is called, so the client must exist before the first call.

    Args:
        query: SQL text handed to ``client.query``.

    Returns:
        The value produced by ``to_dataframe()`` on the submitted query job.
    """
    return client.query(query).to_dataframe()

# Fall back to the service-account key uploaded to the Colab workspace, but do
# not overwrite credentials that are already configured in the environment.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/content/braided-grammar-430922-b4-64d668a5dc36.json",
)

# NOTE(review): project_id is never passed to the client, so the client infers
# the project from the environment. The key file name carries a "-b4" suffix
# that this value lacks -- confirm the real project ID before wiring it in.
project_id = "braided-grammar-430922"
client = bigquery.Client()

# Business metadata table.
query = "select * from datanexus.metadata_google"
empresas = run_query(query)

# Reviews table (no interpolation needed, so a plain string is used).
query = "select * from datanexus.google_yelp_reviews"
reviews = run_query(query)



Cambiamos el nombre de la columna `id` de `reviews` a `gmap_id` para tener compatibilidad entre las tablas.

In [2]:
# Give the reviews key the same name as in `empresas` so the two frames can be
# joined on `gmap_id`. Done in place to avoid copying the large reviews frame.
reviews.rename({'id': 'gmap_id'}, axis='columns', inplace=True)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3529791 entries, 0 to 3529790
Data columns (total 7 columns):
 #   Column     Dtype              
---  ------     -----              
 0   gmap_id    object             
 1   review_id  object             
 2   user_id    object             
 3   text       object             
 4   rating     float64            
 5   date       datetime64[us, UTC]
 6   source     object             
dtypes: datetime64[us, UTC](1), float64(1), object(5)
memory usage: 188.5+ MB


Cambiamos el tipo de dato de `city` a cadena de texto y mostramos la información del DataFrame.

In [None]:
# astype returns a new Series; assign it back so the cast is actually applied.
empresas["city"] = empresas["city"].astype(str)
empresas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35068 entries, 0 to 35067
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            35068 non-null  object 
 1   address         35068 non-null  object 
 2   gmap_id         35068 non-null  object 
 3   description     35068 non-null  object 
 4   latitude        35068 non-null  float64
 5   longitude       35068 non-null  float64
 6   category        35068 non-null  object 
 7   avg_rating      35068 non-null  float64
 8   num_of_reviews  35068 non-null  Int64  
 9   hours           35068 non-null  object 
 10  MISC            35068 non-null  object 
 11  state           35068 non-null  object 
 12  postal_code     35068 non-null  object 
 13  city            35068 non-null  object 
dtypes: Int64(1), float64(3), object(10)
memory usage: 3.8+ MB


Normalizamos `city` (minúsculas y sin espacios sobrantes) y mostramos el resultado.

In [3]:
# Lower-case and trim city names with the vectorised string accessor; unlike a
# per-element lambda it leaves missing values as NaN instead of raising.
empresas["city"] = empresas["city"].str.lower().str.strip()
empresas["city"]

Unnamed: 0,city
0,new york
1,corpus christi
2,new york
3,new york
4,new york
...,...
35076,susanville
35077,kings beach
35078,tahoe city
35079,south lake tahoe


Preparamos el dataset para la creación del modelo.

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Aggregate the reviews of each business into one row: mean rating plus all
# review texts concatenated. Only the key, rating and text columns take part in
# the join, so the remaining business columns are not repeated on every review.
rating_avg = (
    pd.merge(
        empresas[['gmap_id']],
        reviews[['gmap_id', 'rating', 'text']],
        on='gmap_id',
    )
    .groupby('gmap_id')
    .agg(
        rating=('rating', 'mean'),
        # Skip missing texts: str.join raises TypeError on None/NaN items.
        text=('text', lambda textos: ' '.join(textos.dropna())),
    )
    .reset_index()
)

# Attach the aggregated rating and review text to the business metadata.
data = pd.merge(empresas, rating_avg, on='gmap_id')
data.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,hours,MISC,state,postal_code,city,rating,text
0,My Belly’s Playlist,218 W 35th St,0x89c2585735c12c2f:0xcef4b77c71663db,,40.751789,-73.991033,"['sandwich shop', 'delivery restaurant', 'new ...",4.0,88,"[['Tuesday', '11AM–7PM'], ['Wednesday', '11AM–...","{'Service options': ['Delivery', 'Takeout']}",NY,10001,new york,4.433962,Black bean burger is AMAZING. Love this place....
1,Quiznos,304 W 34th St,0x89c259ade5f4dcd3:0x529585dc34d884ff,"Known for toasted, build-your-own subs, this c...",40.752199,-73.994465,"['sandwich shop', 'caterer', 'deli', 'fast foo...",3.0,19,[],"{'Service options': ['Drive-through', 'Deliver...",NY,10001,new york,2.6,Made to order! Heavy on the oregano just the w...
2,'Wichcraft The Tunnel,269 11th Ave,0x89c2590e65f7d7c3:0x50934217a8e33749,Counter-serve spot for sandwiches boasting hig...,40.75188,-74.005134,"['sandwich shop', 'american restaurant', 'cate...",3.9,38,"[['Monday', '8AM–5PM'], ['Tuesday', '8AM–5PM']...","{'Service options': ['Drive-through', 'Takeout...",NY,10001,new york,4.0,Very good breakfast and lunch spot. (Translat...
3,Au Bon Pain,Pennsylvania Plaza 34th Street,0x89c259ae23258ca9:0x7ff87c6f69899002,"Counter-service chain cafe serving soups, sala...",40.750562,-73.993471,"['sandwich shop', 'cafe', 'caterer', 'takeout ...",3.5,38,"[['Sunday', '4AM–12AM'], ['Monday', '4AM–12AM'...","{'Service options': ['Delivery'], 'Amenities':...",NY,10001,new york,3.761905,The best of the best Great Food. Clean store. ...
4,City Gourmet Market,338 8th Ave,0x4065fcde1d91ff89:0x9b45afd93bddb6ce,,40.747677,-73.996556,"['restaurant or cafe', 'fast food restaurant',...",4.2,38,"[['Saturday', '6AM–10PM'], ['Sunday', '8AM–8PM...","{'Service options': ['Delivery', 'Takeout'], '...",NY,10001,new york,4.3,I refused the receipt and later found out I wa...


Generamos el modelo de predicción.

In [9]:
# Fit TF-IDF on the concatenated review text of each restaurant.
vectorizador = TfidfVectorizer()
tfidf_matrix = vectorizador.fit_transform(data['text'])

# Pairwise cosine similarity between every pair of restaurants.
# NOTE(review): presumably a dense (restaurants x restaurants) array, which is
# very large for tens of thousands of rows. recomendar_restaurantes recomputes
# similarity from `vectorizador` and never reads `similitud` -- confirm whether
# this matrix (rather than the fitted vectorizer) is what should be exported.
similitud = cosine_similarity(tfidf_matrix)

# Persist the restaurant table and the similarity matrix, reporting each export.
exportaciones = (
    ('data_restaurantes.pkl', data, "Datos de los restaurantes exportados."),
    ('similitud_restaurantes.pkl', similitud, "Modelo de prediccion exportado."),
)
for ruta, objeto, mensaje in exportaciones:
    with open(ruta, 'wb') as f:
        pickle.dump(objeto, f)
        print(mensaje)

Datos de los restaurantes exportados.
Modelo de prediccion exportado.


Probamos la recomendación del modelo.

In [12]:
def recomendar_restaurantes(categoria, estado, ciudad):
    """Rank the restaurants of a state/city/category by review-text similarity.

    Filters the module-level ``data`` frame, vectorises the review text of the
    matches with the already-fitted ``vectorizador`` and orders them by their
    mean cosine similarity to the whole matched set (self-similarity included).

    Args:
        categoria: Text searched in the ``category`` column, ignoring case. It
            is passed to ``Series.str.contains``, which interprets it as a
            regular expression by default.
        estado: State code; upper-cased and trimmed before comparing.
        ciudad: City name; lower-cased and trimmed before comparing.

    Returns:
        DataFrame with the ``name``, ``address`` and ``rating`` columns sorted
        by descending mean similarity. When nothing matches, an empty frame
        with those same columns is returned.
    """
    columnas = ["name", "address", "rating"]

    # na=False keeps the mask strictly boolean when a category is missing.
    mascara = (
        (data['state'] == estado.upper().strip())
        & data['category'].str.contains(categoria, case=False, na=False)
        & (data['city'] == ciudad.lower().strip())
    )
    filtrado = data[mascara]

    if filtrado.empty:
        # Same columns as the non-empty result so callers can rely on the shape.
        return filtrado[columnas]

    # Re-vectorise only the filtered subset and compare it against itself.
    filtrado_matrix = vectorizador.transform(filtrado['text'])
    similitud_filtrado = cosine_similarity(filtrado_matrix)

    # assign() builds a new frame, so the slice of `data` is never mutated
    # (the original column assignment was a chained assignment on a slice).
    recomendaciones = filtrado.assign(
        similitud_promedio=similitud_filtrado.mean(axis=1)
    ).sort_values(by='similitud_promedio', ascending=False)
    return recomendaciones[columnas]

# Ejemplo de recomendación
categoria_deseada = 'mexican'
estado_deseado = 'CA'
ciudad_deseado = 'Los angeles'
recomendaciones = recomendar_restaurantes(categoria_deseada, estado_deseado, ciudad_deseado)

recomendaciones.head()

Unnamed: 0,name,address,rating
5269,Taco Plus,1525 S Bundy Dr,4.110526
5279,Tacos Mexico Echo Park,1538 Glendale Blvd,4.076923
5273,Rodeo Mexican Grill (Echo Park),1721 Sunset Blvd,4.304348
5321,Rubio's,6081 Center Dr #216,3.818182
5224,Trujillo Family Restaurants,4052 S Central Ave,4.230769
