In [1]:
import os
import json
import pandas as pd
import numpy as np
import warnings
import reverse_geocode
warnings.filterwarnings('ignore')

# ETL
La idea en este notebook es realizar las transformaciones correspondientes a los datos de Google Maps, tanto la metadata como los reviews. 

En este caso el alcance se limita a los 5 estados con mayor población California, Nueva York, Texas, Pensilvania y Florida. En cuanto a lo estudiado va a ser la categoria restaurantes, incluyendo todo tipo de los mismos.

In [2]:
#Generamos un diccionario con las carpetas de los archivos
ruta_review_estados={
    'New_York': 'Datasets/Google Maps/reviews-estados/review-New_York',
    'California': 'Datasets/Google Maps/reviews-estados/review-California',
    'Texas': 'Datasets/Google Maps/reviews-estados/review-Texas',
    'Florida': 'Datasets/Google Maps/reviews-estados/review-Florida',
    'Pennsylvania': 'Datasets/Google Maps/reviews-estados/review-Pennsylvania',

}

In [8]:
#Creamos un diccionario vacio para almacenar las claves de los reviews para poder filtrar los datos de metadata
gmaps_ids = {}
#Recorremos todas las rutas para cargar los datos de todos los estados
for estado, ruta_estado in ruta_review_estados.items():
#Se utiliza set para evitar duplicados
    gmaps_ids[estado] =set()
    #Se recorren todos los json de las carpetas de los reviews de los estados
    for archivo in os.listdir(ruta_estado):
        ruta_archivo=os.path.join(ruta_estado, archivo)
        print(ruta_archivo)
        with open(ruta_archivo,'r') as file:
            for linea in file:
                data = json.loads(linea)
                gmaps_ids[estado].add(data['gmap_id'])


Datasets/Google Maps/reviews-estados/review-New_York\1.json
Datasets/Google Maps/reviews-estados/review-New_York\10.json
Datasets/Google Maps/reviews-estados/review-New_York\11.json
Datasets/Google Maps/reviews-estados/review-New_York\12.json
Datasets/Google Maps/reviews-estados/review-New_York\13.json
Datasets/Google Maps/reviews-estados/review-New_York\14.json
Datasets/Google Maps/reviews-estados/review-New_York\15.json
Datasets/Google Maps/reviews-estados/review-New_York\16.json
Datasets/Google Maps/reviews-estados/review-New_York\17.json
Datasets/Google Maps/reviews-estados/review-New_York\18.json
Datasets/Google Maps/reviews-estados/review-New_York\2.json
Datasets/Google Maps/reviews-estados/review-New_York\3.json
Datasets/Google Maps/reviews-estados/review-New_York\4.json
Datasets/Google Maps/reviews-estados/review-New_York\5.json
Datasets/Google Maps/reviews-estados/review-New_York\6.json
Datasets/Google Maps/reviews-estados/review-New_York\7.json
Datasets/Google Maps/reviews-es

In [9]:
#Ruta de la carpeta metadata
ruta_metadata = 'Datasets/Google Maps/metadata-sitios'
#Diccionario para almacenar los datos de metadata
metadata = {}
#Recorremos todos los archivos de la carpeta metadata
for archivo in os.listdir(ruta_metadata):
    ruta_archivo = os.path.join(ruta_metadata, archivo)
    with open(ruta_archivo, 'r',encoding='ISO-8859-1') as file:
        for linea in file:
            data = json.loads(linea)
            #Si el gmap_id esta en el diccionario de gmaps_ids se guarda
            for estado in gmaps_ids.keys():
                if data['gmap_id'] in gmaps_ids[estado]:
                    metadata[data['gmap_id']] = data
                    #Ademas se agrega una columna con el estado al que pertenece
                    metadata[data['gmap_id']]['estado'] = estado

In [10]:
df_metadata = pd.DataFrame(metadata).T

In [11]:
df_metadata.reset_index(inplace=True)

In [12]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319881 entries, 0 to 319880
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             319881 non-null  object
 1   name              319879 non-null  object
 2   address           317103 non-null  object
 3   gmap_id           319881 non-null  object
 4   description       44767 non-null   object
 5   latitude          319881 non-null  object
 6   longitude         319881 non-null  object
 7   category          319632 non-null  object
 8   avg_rating        319881 non-null  object
 9   num_of_reviews    319881 non-null  object
 10  price             47766 non-null   object
 11  hours             276487 non-null  object
 12  MISC              286245 non-null  object
 13  state             279719 non-null  object
 14  relative_results  304211 non-null  object
 15  url               319881 non-null  object
 16  estado            319881 non-null  obj

In [13]:
df_metadata.isnull().sum()

index                    0
name                     2
address               2778
gmap_id                  0
description         275114
latitude                 0
longitude                0
category               249
avg_rating               0
num_of_reviews           0
price               272115
hours                43394
MISC                 33636
state                40162
relative_results     15670
url                      0
estado                   0
dtype: int64

Se eliminan todas las columnas que no aportan al proyecto ya sea por gran cantidad de nulos (price,description) o porque son innecesarias.

In [14]:
df_metadata.drop(columns=['index','description','hours','MISC','relative_results','url','price'],inplace=True)

Además se eliminan las filas con los valores faltantes en categoría ya que es de suma importancia para el proyecto.

In [15]:
df_metadata.dropna(subset='category',inplace=True)

In [16]:
df_metadata.isnull().sum()

name                  2
address            2777
gmap_id               0
latitude              0
longitude             0
category              0
avg_rating            0
num_of_reviews        0
state             40050
estado                0
dtype: int64

In [17]:
df_metadata.head(3)

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,state,estado
0,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,34.058092,-118.29213,[Korean restaurant],4.4,18,Open ⋅ Closes 6PM,California
1,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,34.036694,-118.249421,[Fabric store],4.3,7,Open ⋅ Closes 5PM,California
2,Matrix International Textiles,"Matrix International Textiles, 1363 S Bonnie B...",0x80c2cf163db6bc89:0x219484e2edbcfa41,34.015505,-118.181839,[Fabric store],3.5,6,Open ⋅ Closes 5:30PM,California


Vamos a filtrar la información del dataframe por el tipo de negocio que queramos estudiar, en este caso los restaurantes. Por lo que consideramos 2 palabras clave, *restaurant* y *food* para tener las categorías mas importantes.

In [18]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 319632 entries, 0 to 319880
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            319630 non-null  object
 1   address         316855 non-null  object
 2   gmap_id         319632 non-null  object
 3   latitude        319632 non-null  object
 4   longitude       319632 non-null  object
 5   category        319632 non-null  object
 6   avg_rating      319632 non-null  object
 7   num_of_reviews  319632 non-null  object
 8   state           279582 non-null  object
 9   estado          319632 non-null  object
dtypes: object(10)
memory usage: 26.8+ MB


In [19]:
df_filtered=df_metadata.copy()
#Definimos las palabras claves
palabras_claves = ['restaurant', 'food','steakhouse','mexican','pizzeria','american','asian']
#Expando categoría

#Genero una función para filtrar las palabras claves
def filter_list_by_keywords(lst, palabras_claves):
    return [word for word in lst if any(palabra.lower() in word.lower() for palabra in palabras_claves)]

df_filtered['category'] = df_filtered['category'].apply(lambda x: filter_list_by_keywords(x, palabras_claves))

In [20]:
df_filtered = df_filtered[df_filtered['category'].apply(lambda x: len(x) > 0)]

In [21]:
#Verificamos los tipo de datos de categoria
cantidad=df_filtered.explode('category')
cantidad['category'].value_counts()

category
Restaurant                    15622
Fast food restaurant           5001
Pizza restaurant               4548
Takeout Restaurant             3994
Mexican restaurant             3401
                              ...  
Welsh restaurant                  1
South Indian restaurant           1
Couscous restaurant               1
Catalonian restaurant             1
Tempura donburi restaurant        1
Name: count, Length: 300, dtype: int64

In [22]:
df_filtered.reset_index(drop=True, inplace=True)

Eliminamos a continuación los restaurantes que están permantemente cerrados.

In [26]:
df_filtered=df_filtered[df_filtered['state']!='Permanently closed']

Analizamos los duplicados

In [27]:
df_filtered['gmap_id'].duplicated().sum()

0

Como vemos no tenemos dos restaurants con el mismo gmap_id por lo que esto esta en lo correcto.

In [28]:
df_filtered['name'].duplicated().sum()

7774

In [29]:
df_filtered[df_filtered['name'].duplicated()]

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,state,estado
70,McDonald's,"McDonald's, 1000 Palisades Center Dr, West Nya...",0x89c2e9cf8e139235:0x24bfb20e9e09f260,41.097768,-73.955392,"[Fast food restaurant, Breakfast restaurant, H...",2.2,18,,New_York
71,McDonald's,"McDonald's, 341 5th Ave, New York, NY 10016",0x89c259a9b55adb77:0xfe5e87207e736efc,40.747916,-73.984586,[Fast food restaurant],3.1,16,,New_York
91,Subway,"Subway, 230 West Route 59 Store #1, Spring Val...",0x89c2e7f1c5ffad57:0x188085b588f042e3,41.109464,-74.045593,"[Fast food restaurant, Takeout Restaurant, Res...",3.7,27,Temporarily closed,New_York
108,Subway,"Subway, 1970 Broadway Lower Level, Oakland, CA...",0x808f80b286dc6f9b:0xb9c0ee0392825cd4,37.808663,-122.268069,"[Fast food restaurant, Takeout Restaurant, Res...",3.2,15,Temporarily closed,California
186,Subway,"Subway, 630 Old Country Road 1105-D, Garden Ci...",0x89c27d7e84956c31:0x301d481a1d010206,40.738215,-73.612945,"[Fast food restaurant, Takeout Restaurant, Res...",3.5,28,Temporarily closed,New_York
...,...,...,...,...,...,...,...,...,...,...
40505,Pizza Hut,"Pizza Hut, 5050 Peach St, Erie, PA 16509",0x88328086e3b285ab:0x420965f3af5fb51a,42.080021,-80.091371,"[Chicken wings restaurant, Takeout Restaurant,...",3.6,218,Open ⋅ Closes 10PM,Pennsylvania
40506,Vocelli Pizza,"Vocelli Pizza, 30 W Pike St, Canonsburg, PA 15317",0x8834547ffbc4a3bd:0x92bc64fb62cce468,40.258819,-80.187422,"[Pizza restaurant, Takeout Restaurant]",3.8,77,Open ⋅ Closes 1AM,Pennsylvania
40512,Pita Pit,"Pita Pit, 107 Marshall St, Syracuse, NY 13210",0x89d9f3a1e98cc40b:0x5c63dd87ec28164d,43.041768,-76.136254,"[Fast food restaurant, Mediterranean restaurant]",4,105,Open ⋅ Closes 10PM,New_York
40522,Subway,"Subway, Syracuse University, 720 University Av...",0x89d9f3a1b3373329:0xeb55367bab55c39,43.042096,-76.134719,"[Fast food restaurant, Takeout Restaurant, Res...",3.5,28,Open ⋅ Closes 7PM,New_York


Estos duplicados corresponden a grandes cadenas de comida.

In [30]:
df_prueba=df_filtered.copy()

In [31]:
df_prueba.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31440 entries, 0 to 40528
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            31440 non-null  object
 1   address         31386 non-null  object
 2   gmap_id         31440 non-null  object
 3   latitude        31440 non-null  object
 4   longitude       31440 non-null  object
 5   category        31440 non-null  object
 6   avg_rating      31440 non-null  object
 7   num_of_reviews  31440 non-null  object
 8   state           29336 non-null  object
 9   estado          31440 non-null  object
dtypes: object(10)
memory usage: 2.6+ MB


Continuaremos con el análisis de la direccion para obtener la ciudad y el codigo postal.

In [32]:
df_filtered.head()

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,state,estado
0,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,34.058092,-118.29213,[Korean restaurant],4.4,18,Open ⋅ Closes 6PM,California
1,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,33.916402,-118.010855,[Restaurant],4.5,18,Open ⋅ Closes 9:30PM,California
2,Golden Castle,"Golden Castle, 1906 E 12th St, Austin, TX 78702",0x8644b59b8fe872e5:0x5e638876caa84cc3,30.273985,-97.719563,[Restaurant],4.5,8,Closed ⋅ Opens 5PM,Texas
3,The Nutrition Group,"The Nutrition Group, 5 Interchange Pl, York, P...",0x89c88de475520cc7:0xeff46469445b5212,40.018829,-76.739459,[Food service],3.2,17,Open ⋅ Closes 8PM,Pennsylvania
4,Studio 34 Nutrition,"Studio 34 Nutrition, 3021 A, 34th St, Lubbock,...",0x86fe6dd0642d44eb:0xeaceade94f24cc15,33.563039,-101.880718,[Health food restaurant],5.0,14,Open ⋅ Closes 2PM,Texas


In [33]:
#Vamos a extraer la ciudad y el código postal de la columna address
df_filtered['city'] = df_filtered['address'].str.extract(r',\s*([^,]+),\s*[A-Z]{2}\s+\d{5}', expand=False)
df_filtered['zip_code'] = df_filtered['address'].str.extract(r'(\d{5})$', expand=False)

Borramos ahora la columna *state* que no nos sirve mas y verificamos los nulos.

In [34]:
df_filtered.drop(columns='state', inplace=True)

In [35]:
df_filtered.isnull().sum()

name                 0
address             54
gmap_id              0
latitude             0
longitude            0
category             0
avg_rating           0
num_of_reviews       0
estado               0
city               174
zip_code          3106
dtype: int64

In [36]:
#Creamos una funcion para verificar la ciudad y el estado
def get_location(row):
    coordinates = [(row['latitude'], row['longitude'])]
    location = reverse_geocode.search(coordinates)[0]
    if location['country_code'] == 'US':
        return pd.Series([location['city'], location['state']])
    else:
        return pd.Series([None, None])



df_filtered[['city', 'estado']] = df_filtered.apply(get_location, axis=1)

In [37]:
df_filtered

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,estado,city,zip_code
0,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,34.058092,-118.29213,[Korean restaurant],4.4,18,California,Koreatown,90005
1,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,33.916402,-118.010855,[Restaurant],4.5,18,California,La Mirada,90638
2,Golden Castle,"Golden Castle, 1906 E 12th St, Austin, TX 78702",0x8644b59b8fe872e5:0x5e638876caa84cc3,30.273985,-97.719563,[Restaurant],4.5,8,Texas,Austin,78702
3,The Nutrition Group,"The Nutrition Group, 5 Interchange Pl, York, P...",0x89c88de475520cc7:0xeff46469445b5212,40.018829,-76.739459,[Food service],3.2,17,Pennsylvania,Emigsville,17406
4,Studio 34 Nutrition,"Studio 34 Nutrition, 3021 A, 34th St, Lubbock,...",0x86fe6dd0642d44eb:0xeaceade94f24cc15,33.563039,-101.880718,[Health food restaurant],5,14,Texas,Lubbock,79410
...,...,...,...,...,...,...,...,...,...,...,...
40522,Subway,"Subway, Syracuse University, 720 University Av...",0x89d9f3a1b3373329:0xeb55367bab55c39,43.042096,-76.134719,"[Fast food restaurant, Takeout Restaurant, Res...",3.5,28,New York,Syracuse,13210
40523,Luthun,"Luthun, 432 E 13th St, New York, NY 10009",0x89c2592cf2935ed9:0x6672264426649f94,40.729911,-73.981883,"[New American restaurant, American restaurant,...",4.8,68,New York,East Village,10009
40526,Jia,"Jia, 23 Essex St, New York, NY 10002",0x89c25a29eb1373a1:0x3afffd62d2b9a1dd,40.715434,-73.990083,"[Down home cooking restaurant, Asian fusion re...",4.8,68,New York,Chinatown,10002
40527,China King Express,"China King Express, 6938 Erie Rd, Derby, NY 14047",0x89d31f176b64da79:0x202faca0f650e880,42.698185,-78.988253,[Chinese restaurant],4.1,118,New York,Angola,14047


Los nulos que quedan se completaran con "Sin Datos" para que el áera de análisis haga su trabajo correspondiente.

In [38]:
#Completo los nulos con "Sin Datos"
df_filtered.fillna('Sin Datos', inplace=True)

In [39]:
df_filtered

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,estado,city,zip_code
0,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,34.058092,-118.292130,[Korean restaurant],4.4,18,California,Koreatown,90005
1,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,33.916402,-118.010855,[Restaurant],4.5,18,California,La Mirada,90638
2,Golden Castle,"Golden Castle, 1906 E 12th St, Austin, TX 78702",0x8644b59b8fe872e5:0x5e638876caa84cc3,30.273985,-97.719563,[Restaurant],4.5,8,Texas,Austin,78702
3,The Nutrition Group,"The Nutrition Group, 5 Interchange Pl, York, P...",0x89c88de475520cc7:0xeff46469445b5212,40.018829,-76.739459,[Food service],3.2,17,Pennsylvania,Emigsville,17406
4,Studio 34 Nutrition,"Studio 34 Nutrition, 3021 A, 34th St, Lubbock,...",0x86fe6dd0642d44eb:0xeaceade94f24cc15,33.563039,-101.880718,[Health food restaurant],5,14,Texas,Lubbock,79410
...,...,...,...,...,...,...,...,...,...,...,...
40522,Subway,"Subway, Syracuse University, 720 University Av...",0x89d9f3a1b3373329:0xeb55367bab55c39,43.042096,-76.134719,"[Fast food restaurant, Takeout Restaurant, Res...",3.5,28,New York,Syracuse,13210
40523,Luthun,"Luthun, 432 E 13th St, New York, NY 10009",0x89c2592cf2935ed9:0x6672264426649f94,40.729911,-73.981883,"[New American restaurant, American restaurant,...",4.8,68,New York,East Village,10009
40526,Jia,"Jia, 23 Essex St, New York, NY 10002",0x89c25a29eb1373a1:0x3afffd62d2b9a1dd,40.715434,-73.990083,"[Down home cooking restaurant, Asian fusion re...",4.8,68,New York,Chinatown,10002
40527,China King Express,"China King Express, 6938 Erie Rd, Derby, NY 14047",0x89d31f176b64da79:0x202faca0f650e880,42.698185,-78.988253,[Chinese restaurant],4.1,118,New York,Angola,14047


In [40]:
#renombo las columnas correspondientes
df_filtered.rename(columns={'estado':'state'},inplace=True)

In [41]:
#reordeno las columnas
df_filtered = df_filtered[['gmap_id', 'name', 'address', 'city', 'state', 'zip_code', 'latitude', 'longitude', 'category']]

In [42]:
df_filtered.head()

Unnamed: 0,gmap_id,name,address,city,state,zip_code,latitude,longitude,category
0,0x80c2c778e3b73d33:0xbdc58662a4a97d49,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",Koreatown,California,90005,34.058092,-118.29213,[Korean restaurant]
1,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",La Mirada,California,90638,33.916402,-118.010855,[Restaurant]
2,0x8644b59b8fe872e5:0x5e638876caa84cc3,Golden Castle,"Golden Castle, 1906 E 12th St, Austin, TX 78702",Austin,Texas,78702,30.273985,-97.719563,[Restaurant]
3,0x89c88de475520cc7:0xeff46469445b5212,The Nutrition Group,"The Nutrition Group, 5 Interchange Pl, York, P...",Emigsville,Pennsylvania,17406,40.018829,-76.739459,[Food service]
4,0x86fe6dd0642d44eb:0xeaceade94f24cc15,Studio 34 Nutrition,"Studio 34 Nutrition, 3021 A, 34th St, Lubbock,...",Lubbock,Texas,79410,33.563039,-101.880718,[Health food restaurant]


Se verifican finalmente que no haya duplicados

In [43]:
#Cuento los duplicados en gmap_id
df_filtered['gmap_id'].duplicated().sum()

0

In [44]:
#Modifíco la columna state para que contenga la abreviacíon de los estados
def abreviacion_estado(row):
    if row['state'] == 'New York':
        return 'NY'
    elif row['state'] == 'California':
        return 'CA'
    elif row['state'] == 'Texas':
        return 'TX'
    elif row['state'] == 'Florida':
        return 'FL'
    elif row['state'] == 'Pennsylvania':
        return 'PA'
    else:
        return 'Sin Datos'

df_filtered['state'] = df_filtered.apply(abreviacion_estado, axis=1)

In [47]:
df_filtered['state'].value_counts()

state
NY           10120
CA            6907
PA            5337
TX            4744
FL            4234
Sin Datos       98
Name: count, dtype: int64

In [45]:
df_restaurantes_clean = df_filtered.copy()

In [46]:
df_restaurantes_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31440 entries, 0 to 40528
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gmap_id    31440 non-null  object 
 1   name       31440 non-null  object 
 2   address    31440 non-null  object 
 3   city       31440 non-null  object 
 4   state      31440 non-null  object 
 5   zip_code   31440 non-null  object 
 6   latitude   31440 non-null  float64
 7   longitude  31440 non-null  float64
 8   category   31440 non-null  object 
dtypes: float64(2), object(7)
memory usage: 2.4+ MB


In [48]:
df_restaurantes_clean.head(3)

Unnamed: 0,gmap_id,name,address,city,state,zip_code,latitude,longitude,category
0,0x80c2c778e3b73d33:0xbdc58662a4a97d49,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",Koreatown,CA,90005,34.058092,-118.29213,[Korean restaurant]
1,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",La Mirada,CA,90638,33.916402,-118.010855,[Restaurant]
2,0x8644b59b8fe872e5:0x5e638876caa84cc3,Golden Castle,"Golden Castle, 1906 E 12th St, Austin, TX 78702",Austin,TX,78702,30.273985,-97.719563,[Restaurant]


In [49]:
df_restaurantes_clean.to_parquet('Datos limpios/restaurantes_clean.parquet')

In [50]:
df_dummies_restaurantes=df_restaurantes_clean[['gmap_id','category']]
categoria=df_dummies_restaurantes['category'].str.get_dummies(sep=',')

In [51]:
df_dummies_restaurantes=pd.concat([df_dummies_restaurantes,categoria],axis=1)

In [52]:
df_dummies_restaurantes.reset_index(drop=True, inplace=True)

In [53]:
df_dummies_restaurantes.to_parquet('Datos limpios/dummies_restaurantes2.parquet')

In [54]:
df_restaurantes_clean['state'].value_counts()

state
NY           10120
CA            6907
PA            5337
TX            4744
FL            4234
Sin Datos       98
Name: count, dtype: int64