In [1]:
import pandas as pd
import numpy as np
import json
import os

In [3]:
# Load the already-cleaned restaurants table; its Google Maps ids identify
# which reviews belong to restaurants.
df_restaurantes = pd.read_parquet('Datos limpios/restaurantes_clean.parquet')
# Every restaurant id as a plain string, collected into a list.
ids_como_texto = df_restaurantes['gmap_id'].astype(str)
gmaps = ids_como_texto.tolist()

In [4]:
# Directory holding the review files of each state, keyed by state abbreviation.
ruta_review_estados = dict(
    NY='Datasets/Google Maps/reviews-estados/review-New_York',
    CA='Datasets/Google Maps/reviews-estados/review-California',
    TX='Datasets/Google Maps/reviews-estados/review-Texas',
    FL='Datasets/Google Maps/reviews-estados/review-Florida',
    PA='Datasets/Google Maps/reviews-estados/review-Pennsylvania',
)

In [5]:
# Read every review file of every state into one flat list of records.
# No filtering happens in this cell: the reviews are restricted to restaurants
# further down, by the inner merge on gmap_id.
reviews = []
for ruta in ruta_review_estados.values():
    # sorted() gives a reproducible file order; os.listdir returns entries in
    # arbitrary order.
    for archivo in sorted(os.listdir(ruta)):
        ruta_archivo = os.path.join(ruta, archivo)
        # Each line of a file is parsed as one JSON object. The encoding is
        # stated explicitly so decoding non-ASCII review text does not depend
        # on the platform's default encoding.
        with open(ruta_archivo, 'r', encoding='utf-8') as file:
            for linea in file:
                if not linea.strip():
                    continue  # json.loads would raise on a blank line
                reviews.append(json.loads(linea))

In [6]:
# Build a DataFrame from the accumulated list of review records.
df_reviews = pd.DataFrame(data=reviews)

In [7]:
# Summarize the freshly loaded reviews: row count, column dtypes and memory usage.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12946824 entries, 0 to 12946823
Data columns (total 8 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  object
 1   name     object
 2   time     int64 
 3   rating   int64 
 4   text     object
 5   pics     object
 6   resp     object
 7   gmap_id  object
dtypes: int64(2), object(6)
memory usage: 790.2+ MB


In [8]:
# Discard the columns that are not needed for the analysis.
columnas_sobrantes = ['pics', 'resp']
df_reviews.drop(columnas_sobrantes, axis='columns', inplace=True)

In [9]:
# Keep only the reviews whose gmap_id belongs to a restaurant.
# The right-hand side is reduced to unique ids so that a restaurant listed more
# than once in df_restaurantes cannot duplicate its reviews; validate= makes
# pandas enforce that the right-hand keys are unique.
ids_restaurantes = df_restaurantes[['gmap_id']].drop_duplicates()
df_reviews = df_reviews.merge(
    ids_restaurantes,
    on='gmap_id',
    how='inner',
    validate='many_to_one',
)

In [10]:
# Select the columns to keep, in their final order (user_id is left out here).
orden_columnas = ['gmap_id', 'name', 'rating', 'time', 'text']
df_reviews = df_reviews[orden_columnas]

In [11]:
# Convert 'time' from integer milliseconds into pandas timestamps.
tiempo_ms = df_reviews['time']
df_reviews['time'] = pd.to_datetime(tiempo_ms, unit='ms')

In [12]:
# Keep only the reviews written from 2016 through 2021, both years included.
anio_review = df_reviews['time'].dt.year
df_reviews = df_reviews[anio_review.between(2016, 2021)]

In [13]:
# Check row count and dtypes again after the merge, date conversion and year filter.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4587893 entries, 0 to 4645724
Data columns (total 5 columns):
 #   Column   Dtype         
---  ------   -----         
 0   gmap_id  object        
 1   name     object        
 2   rating   int64         
 3   time     datetime64[ns]
 4   text     object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 210.0+ MB


In [14]:
# Preview the first rows of the filtered reviews.
df_reviews.head()

Unnamed: 0,gmap_id,name,rating,time,text
0,0x89c261f60bdf13db:0x38da730e4687a97b,Maria Patricia Londoño,4,2021-08-16 01:39:58.394,The donuts is always a good place to buy somet...
1,0x89c261f60bdf13db:0x38da730e4687a97b,Kristal,5,2020-07-01 15:17:58.555,I went into this Dunkin' yesterday and got som...
2,0x89c261f60bdf13db:0x38da730e4687a97b,Efrain Hernandez,3,2020-07-07 19:42:23.631,Bought a Machiato and as soon as I touched the...
3,0x89c261f60bdf13db:0x38da730e4687a97b,Kiyoshi Sudo,5,2019-08-02 20:05:40.103,"Friendly staffs, nice donuts and muffins and c..."
4,0x89c261f60bdf13db:0x38da730e4687a97b,Charlotte Sheppard,1,2020-10-11 15:14:31.233,They got my order wrong food wasn't done unco...


In [15]:
# Sentiment analysis of the review text with TextBlob.
from textblob import TextBlob

# Constant added to every polarity score before it is stored.
# NOTE(review): TextBlob polarity lies in [-1, 1], so this shift stores scores
# in [-2, 0] and reviews without text end up at -1. The shift is kept because
# the recorded value counts already use this scale -- confirm it is intended.
SENTIMENT_OFFSET = -1


def calculate_sentiment(text):
    """Return the TextBlob polarity of ``text`` rounded to 2 decimals.

    Any value that is not a string (None, or NaN from a missing value) yields
    0.0 instead of being handed to TextBlob.
    """
    if not isinstance(text, str):
        return 0.0
    return round(TextBlob(text).sentiment.polarity, 2)


# Apply the shift first and round last: rounding before the subtraction leaves
# floating-point noise in the stored values (0.7 - 1 == -0.30000000000000004).
df_reviews['sentiment'] = (
    df_reviews['text'].apply(calculate_sentiment) + SENTIMENT_OFFSET
).round(2)


In [18]:
# Count how many reviews fall on each sentiment score.
df_reviews['sentiment'].value_counts()

sentiment
-1.00    2138007
 0.00     170225
-0.30     152151
-0.20     136367
-0.50     131330
          ...   
-1.99         40
-1.92         26
-1.82         24
-1.96         11
-1.97          6
Name: count, Length: 201, dtype: int64

In [None]:
# Persist the cleaned restaurant reviews for later use.
ruta_salida = 'Datos limpios/reviews_restaurantes.parquet'
df_reviews.to_parquet(ruta_salida)