## Aqui se crea el archivo DF_jaccard.csv el cual posee una matriz de recomendacion.

In [100]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

In [101]:
# Cargamos las categorias a un df
nombre_archivo = 'datos\\categorias.csv'
dfcategorias = pd.read_csv(nombre_archivo)
dfcategorias=dfcategorias.drop(['Unnamed: 0'],axis=1)

# Cargamos los restaurantes a un df
nombre_archivo = 'datos\\restaurantes.csv'
dfrestaurantes = pd.read_csv(nombre_archivo)
dfrestaurantes=dfrestaurantes.drop(['Unnamed: 0'],axis=1)

# Cargamos la tabla intermedia
nombre_archivo = 'datos\\categoriasrestaurantes.csv'
dfcategoriasrestaurantes = pd.read_csv(nombre_archivo)
dfcategoriasrestaurantes=dfcategoriasrestaurantes.drop(['Unnamed: 0'],axis=1)

In [102]:
# Unimos las categorias con la tabla intermedia y eliminamos columnas que no sirven para el analisis
dfagrupado = pd.merge(dfcategoriasrestaurantes, dfcategorias, left_on= 'id_categoria', right_on= 'id_categoria', how= 'left')
dfagrupado = dfagrupado.drop(['id_categoria_restaurante', 'id_categoria'], axis=1)

# Unimos ahora las ciudades gracias a la tabla intermedia
dfagrupado = pd.merge(dfagrupado, dfrestaurantes, left_on='id_restaurante', right_on='id_restaurante', how='left')
dfagrupado = dfagrupado.drop(['identificador_yelp', 'identificador_google', 'direccion', 'latitud', 'longitud'], axis=1)

dfagrupado

Unnamed: 0,id_restaurante,categoria,id_ciudad,nombre,avg_rating,reviews_count,estado
0,663,aa shop,275,Brick & Iron Cafe,5.0,44,Normally open
1,926,aa shop,83,GoJuice,4.9,13,Normally open
2,1465,aa shop,258,The Boost Juice Bar & Kitchen,4.4,58,Permanently closed
3,3029,aa shop,382,Sunset Island,4.9,28,Normally open
4,3702,aa shop,286,Shape Up Ormond Beach,4.8,38,Normally open
...,...,...,...,...,...,...,...
95421,11427,yoga studio,37,Village of the Arts,4.1,28,Normally open
95422,13143,yoga studio,247,Enso Life Center,5.0,14,Permanently closed
95423,14178,yoga studio,337,ionie Retreat and Organic Raw Food Cafe,4.9,88,Normally open
95424,14917,yoga studio,101,Fusion Studios,4.9,18,Normally open


In [103]:
# Dejamos en el df solo las ciudades que utilizaremos en el sistema de recomendacion
dfagrupado = dfagrupado[dfagrupado['id_ciudad'].isin([2, 32, 37, 242, 243, 299, 332, 361, 393])]

In [104]:
# Usamos la funcion crosstab de pandas para transformar los datos
tabla_cross = pd.crosstab(dfagrupado['nombre'], dfagrupado['categoria'])
tabla_cross

categoria,aa shop,acai bowls,active life,addiction treatment center,adult,adult entertainment club,afghani restaurant,african restaurant,after school program,airline ticket agency,...,wine spirits,wine bar,wine bars,wine store,womens clothing store,wrap,yacht club,yelp events,yoga,yoga studio
nombre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1 Beach Club,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1-800-Lucky,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13 American Table,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
134 LATIN FOOD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
taqueria la mexicana,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
the CAFÉ @ ARTpool Gallery,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ti bamboo caribbean restaurant and lounge,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
uGOgelato,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# Imprimimos el coeficiente jaccard entre 2 restaurantes
juego1=tabla_cross.loc['134 LATIN FOOD']
juego2=tabla_cross.loc['taqueria la mexicana']
print(jaccard_score(juego1,juego2, average= 'micro'))
print(jaccard_score(juego1,juego2, average= 'macro'))
print(jaccard_score(juego1,juego2, average= 'weighted'))

0.9912663755458515
0.49780701754385964
0.99270288293834


In [106]:
# Calculamos de la distancia jaccard
dist_jaccard = pdist(tabla_cross.values, metric='jaccard')
dist_jaccard

array([1., 1., 1., ..., 1., 1., 1.])

In [107]:
# Transformamos a matriz rectangular
matriz_cuadrada_dist = squareform(dist_jaccard)
matriz_cuadrada_dist

array([[0. , 1. , 1. , ..., 1. , 1. , 1. ],
       [1. , 0. , 1. , ..., 0.5, 1. , 1. ],
       [1. , 1. , 0. , ..., 1. , 1. , 1. ],
       ...,
       [1. , 0.5, 1. , ..., 0. , 1. , 1. ],
       [1. , 1. , 1. , ..., 1. , 0. , 1. ],
       [1. , 1. , 1. , ..., 1. , 1. , 0. ]])

In [108]:
# Restamos a 1 los valores obtenidos para tener el coeficiente de similitud
coef_similitud_jaccard = 1- matriz_cuadrada_dist
coef_similitud_jaccard

array([[1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 1. , 0. , ..., 0.5, 0. , 0. ],
       [0. , 0. , 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0.5, 0. , ..., 1. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 1. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ]])

In [109]:
# Creamos un nuevo dataframe
df_jaccard = pd.DataFrame(coef_similitud_jaccard, index=tabla_cross.index, columns=tabla_cross.index)
df_jaccard

nombre,$,1 Beach Club,1-800-Lucky,13 American Table,134 LATIN FOOD,180 Degrees At the DRB,180 Grill and Bar,1821 Sandwich Shop,1826 Restaurant & Lounge,1888 Fiaschetteria Miami,...,la Padella,ms.benedict,nomad@sean rush,sweetgreen,taqueria la cañada,taqueria la mexicana,the CAFÉ @ ARTpool Gallery,ti bamboo caribbean restaurant and lounge,uGOgelato,¡Ole!
nombre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$,1.0,0.000000,0.0,0.000000,0.0,0.5,0.000000,0.000000,0.500000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
1 Beach Club,0.0,1.000000,0.0,0.100000,0.0,0.0,0.200000,0.500000,0.333333,0.0,...,0.111111,0.500000,0.166667,0.0,0.166667,0.0,0.333333,0.500000,0.0,0.0
1-800-Lucky,0.0,0.000000,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
13 American Table,0.0,0.100000,0.0,1.000000,0.0,0.0,0.083333,0.111111,0.111111,0.0,...,0.133333,0.111111,0.166667,0.0,0.076923,0.0,0.100000,0.111111,0.0,0.0
134 LATIN FOOD,0.0,0.000000,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
taqueria la mexicana,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.000000,0.000000,0.0,0.0
the CAFÉ @ ARTpool Gallery,0.0,0.333333,0.0,0.100000,0.0,0.0,0.200000,0.500000,0.333333,0.0,...,0.111111,0.500000,0.166667,0.0,0.166667,0.0,1.000000,0.500000,0.0,0.0
ti bamboo caribbean restaurant and lounge,0.0,0.500000,0.0,0.111111,0.0,0.0,0.250000,1.000000,0.500000,0.0,...,0.125000,1.000000,0.200000,0.0,0.200000,0.0,0.500000,1.000000,0.0,0.0
uGOgelato,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,1.0,0.0


In [110]:
# Guardamo el dataframe creado que servira como sistema de recomendacion
df_jaccard.to_csv('df_jaccard.csv', sep=';', index_label='nombre')

In [111]:
# Levantamos el dataframe creado
jaccard_csv=pd.read_csv('df_jaccard.csv', sep=';', index_col='nombre')
jaccard_csv

Unnamed: 0_level_0,$,1 Beach Club,1-800-Lucky,13 American Table,134 LATIN FOOD,180 Degrees At the DRB,180 Grill and Bar,1821 Sandwich Shop,1826 Restaurant & Lounge,1888 Fiaschetteria Miami,...,la Padella,ms.benedict,nomad@sean rush,sweetgreen,taqueria la cañada,taqueria la mexicana,the CAFÉ @ ARTpool Gallery,ti bamboo caribbean restaurant and lounge,uGOgelato,¡Ole!
nombre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$,1.0,0.000000,0.0,0.000000,0.0,0.5,0.000000,0.000000,0.500000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
1 Beach Club,0.0,1.000000,0.0,0.100000,0.0,0.0,0.200000,0.500000,0.333333,0.0,...,0.111111,0.500000,0.166667,0.0,0.166667,0.0,0.333333,0.500000,0.0,0.0
1-800-Lucky,0.0,0.000000,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
13 American Table,0.0,0.100000,0.0,1.000000,0.0,0.0,0.083333,0.111111,0.111111,0.0,...,0.133333,0.111111,0.166667,0.0,0.076923,0.0,0.100000,0.111111,0.0,0.0
134 LATIN FOOD,0.0,0.000000,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
taqueria la mexicana,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.000000,0.000000,0.0,0.0
the CAFÉ @ ARTpool Gallery,0.0,0.333333,0.0,0.100000,0.0,0.0,0.200000,0.500000,0.333333,0.0,...,0.111111,0.500000,0.166667,0.0,0.166667,0.0,1.000000,0.500000,0.0,0.0
ti bamboo caribbean restaurant and lounge,0.0,0.500000,0.0,0.111111,0.0,0.0,0.250000,1.000000,0.500000,0.0,...,0.125000,1.000000,0.200000,0.0,0.200000,0.0,0.500000,1.000000,0.0,0.0
uGOgelato,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,1.0,0.0


In [112]:
# Probamos el modelo
recomendaciones_prueba=jaccard_csv['taqueria la mexicana'].sort_values(ascending=False)
recomendaciones_prueba = pd.DataFrame(recomendaciones_prueba)
recomendaciones_prueba


Unnamed: 0_level_0,taqueria la mexicana
nombre,Unnamed: 1_level_1
Taqueria El Molcajete,1.0
Twice butter,1.0
La Playa De Miami,1.0
Taqueria La Bendicion Taco Truck,1.0
Taqueria los Jalapenos,1.0
...,...
Friends Coffee LLC,0.0
Fritanga Coliseo,0.0
Fritanga Cosas Nuevas,0.0
Fritanga El Campesino,0.0


In [120]:
# Creamos la funcion
def recomendacion_juego(titulo: str) -> list:

    juego = dfagrupado[dfagrupado['nombre'].str.lower().str.contains(titulo, case=False)]
    juego = juego.iloc[0,3]

    recomendaciones=jaccard_csv[juego].sort_values(ascending=False)
    return list(recomendaciones.index[0:10].tolist())

In [121]:
recomendacion_juego('hot dog')

['Nah Dogs Vegan Hot Dog Cart',
 'Fire Monkey Food Truck',
 'Seabreeze by the Bay',
 'taqueria la cañada',
 'Taco Bus',
 'Red’s BBQ Foodtruck',
 "Lucy's Vegan Corner",
 'Hokulia Shave Ice',
 'Main Street Boys',
 "Papa Murphy's"]