In [1]:
import ast              # Módulo de Árboles de Sintaxis Abstracta
import pandas as pd     # Librería para manipular datasets
import pyarrow as pa    # Útil para operaciones de lectura y escritura de datos
import pyarrow.parquet as pq   # Útil para leer y escribir datos en formato Parquet de manera eficiente
import os               # creación de directorios y comprobación de existencia
import matplotlib.pyplot as plt
from textblob import TextBlob

In [4]:
# Read the six Parquet parts (Reviews_0.parquet ... Reviews_5.parquet), one
# DataFrame per file, collected in file order.
dfs = [pd.read_parquet(f'Reviews_{i}.parquet') for i in range(6)]

# Stack the parts into a single DataFrame with a fresh 0..n-1 index.
df_treviews = pd.concat(dfs, ignore_index=True)

# df_treviews now holds the rows of every Parquet part in one DataFrame.

In [8]:
# Display the combined reviews DataFrame as the cell output.
df_treviews

Unnamed: 0,review_id,user_id,business_id,rating,text,date,name
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Melanie
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,Debra
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Kyle
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Sophia
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Q
...,...,...,...,...,...,...,...
5807672,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5.0,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20,Carl
5807673,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5.0,"This spot offers a great, affordable east week...",2021-03-31 16:55:10,Angela
5807674,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30,Jesse
5807675,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5.0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27,Catherine


In [9]:
# Print a summary of the DataFrame: index range, columns, dtypes and memory usage.
df_treviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5807677 entries, 0 to 5807676
Data columns (total 7 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   rating       float64       
 4   text         object        
 5   date         datetime64[ns]
 6   name         object        
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 310.2+ MB


In [10]:
# For each column, collect the distinct Python types of its values; a column
# listing more than one type holds mixed values (for example str and NoneType).
data_type_process = {
    "columna": list(df_treviews.columns),
    "tipo": [valores.apply(type).unique() for _, valores in df_treviews.items()],
}

# Tabulate each column name against the types found in it and show the table.
data_type = pd.DataFrame(data_type_process)
data_type

Unnamed: 0,columna,tipo
0,review_id,[<class 'str'>]
1,user_id,[<class 'str'>]
2,business_id,[<class 'str'>]
3,rating,[<class 'float'>]
4,text,[<class 'str'>]
5,date,[<class 'pandas._libs.tslibs.timestamps.Timest...
6,name,"[<class 'str'>, <class 'NoneType'>]"


In [12]:
# Keep the rows flagged as repeats of an earlier row in `duplicados` so they
# can be inspected and compared.
duplicados = df_treviews[df_treviews.duplicated()]
duplicados

Unnamed: 0,review_id,user_id,business_id,rating,text,date,name


In [13]:
# Drop fully duplicated rows, keeping the first occurrence of each.
# The result is bound back to df_treviews, the frame that the following cells
# keep cleaning; previously it was stored only in df_categoria, so df_treviews
# itself was never deduplicated.
df_treviews = df_treviews.drop_duplicates(keep='first')
# Backward-compatible alias for any code that still reads df_categoria.
df_categoria = df_treviews

In [14]:
# Count the missing values in each column.
nulos = df_treviews.isna().sum()
nulos

review_id       0
user_id         0
business_id     0
rating          0
text            0
date            0
name           33
dtype: int64

In [17]:
# Discard the rows whose 'name' value is missing.
df_treviews = df_treviews.dropna(axis=0, how='any', subset=['name'])

In [18]:
# Recount the missing values per column after the rows with nulls were removed.
nulos = df_treviews.isna().sum()
nulos

review_id      0
user_id        0
business_id    0
rating         0
text           0
date           0
name           0
dtype: int64