In [1]:
import pandas as pd
import os
import json
import warnings

In [2]:
def read_all_json_files(folder_path):
    """Load every ``.json`` file of a folder, read as JSON Lines, into one DataFrame.

    Each non-blank line of each file is parsed as an independent JSON
    document. A line that cannot be parsed is reported through
    ``warnings.warn`` and skipped, so one corrupt line does not abort the load.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to scan. The scan is not recursive: only regular files
        directly inside it whose name ends in ``.json`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed line. Empty when nothing was parsed.
    """
    records = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not (file_name.endswith(".json") and os.path.isfile(file_path)):
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which is not UTF-8 everywhere.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as e:
                    warnings.warn(f"Error decoding JSON in {file_path}: {str(e)}")
    return pd.DataFrame(records)

In [4]:
def process_state_reviews(df_combined, folder_path):
    """Clean the raw reviews of one state and tag them with the state code.

    Steps, in order:

    1. Interpret ``time`` as milliseconds since the Unix epoch and derive a
       ``date`` column truncated to midnight.
    2. Keep only rows whose ``date`` falls in 2018 or later (rows whose time
       cannot be converted are dropped by the same comparison).
    3. Replace each ``resp`` dict by its ``"text"`` entry; any other value
       (or a dict without ``"text"``) becomes ``None``.
    4. Rename ``text`` to ``opinion`` and drop ``pics``, ``name``, ``time``.
    5. Drop fully duplicated rows and renumber the index from 0.
    6. Add a ``state`` column with the two-letter code of the first known
       state name found in ``folder_path``.

    The input frame is never modified; a new DataFrame is returned.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Raw reviews. Must contain the columns ``time``, ``resp``, ``pics``
        and ``name``; ``text`` is renamed when present.
    folder_path : str
        Folder name or path that contains the state name, e.g.
        ``'review-California'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned reviews. When no known state name occurs in
        ``folder_path`` a warning is issued and ``state`` is all null.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Insertion order is the lookup priority: the first name found wins.
    state_codes = {
        'California': 'CA',
        'Delaware': 'DE',
        'Illinois': 'IL',
        'Idaho': 'ID',
        'Louisiana': 'LA',
        'Missouri': 'MO',
    }

    # Build the date as a separate Series so the caller's frame is untouched.
    # normalize() truncates to midnight without a round trip through strings.
    dates = pd.to_datetime(df_combined['time'], unit='ms').dt.normalize()

    # Filter by date; NaT compares False and is therefore dropped as well.
    recent_mask = dates.dt.year >= 2018
    recent = df_combined.loc[recent_mask]

    # assign() returns a new frame, so nothing is written into a slice of the
    # input. 'resp' keeps its position and 'date' is appended at the end.
    reviews = recent.assign(
        date=dates.loc[recent_mask],
        resp=recent['resp'].apply(
            lambda x: x.get('text') if isinstance(x, dict) else None
        ),
    )

    reviews = (
        reviews
        .rename(columns={'text': 'opinion'})
        .drop(columns=["pics", "name", "time"])
        .drop_duplicates(ignore_index=True)
    )

    state = next(
        (code for name, code in state_codes.items() if name in folder_path),
        None,
    )
    if state is None:
        # Keep the schema consistent across states instead of silently
        # returning a frame without the 'state' column.
        warnings.warn(f"No known state name found in {folder_path!r}; 'state' left empty")

    return reviews.assign(state=state)

In [3]:
def process_states(states_folders, base_folder='reviews-estados'):
    """Read, clean and stack the reviews of several state folders.

    For every entry of ``states_folders`` the JSON files under
    ``base_folder/<entry>`` are loaded with ``read_all_json_files`` and
    cleaned with ``process_state_reviews`` (which receives the folder name,
    not the full path, to resolve the state code).

    Parameters
    ----------
    states_folders : iterable of str
        Folder names, one per state, e.g. ``'review-California'``.
    base_folder : str, optional
        Directory that contains the state folders. Defaults to
        ``'reviews-estados'``.

    Returns
    -------
    pandas.DataFrame
        All processed states stacked with a fresh 0..n-1 index. An empty
        DataFrame when ``states_folders`` is empty.
    """
    # Collect the per-state frames and concatenate once at the end.
    # Re-concatenating the accumulated frame on every iteration copies all
    # previously loaded rows again each time.
    processed_frames = []
    for state_folder in states_folders:
        state_folder_path = os.path.join(base_folder, state_folder)
        state_df = read_all_json_files(state_folder_path)
        processed_frames.append(process_state_reviews(state_df, state_folder))

    # pd.concat raises ValueError on an empty list; return an empty frame.
    if not processed_frames:
        return pd.DataFrame()

    return pd.concat(processed_frames, ignore_index=True)


In [None]:
# Review folders (one per state) to include in the combined dataset
selected_states = [
    'review-California',
    'review-Delaware',
    'review-Idaho',
    'review-Illinois',
    'review-Louisiana',
    'review-Missouri',
]

# Process every selected state and stack the results into a single DataFrame
combined_dataframe = process_states(selected_states)

In [7]:
# Preview 10 rows drawn at random from the combined frame (no random_state is passed)
combined_dataframe.sample(10)

Unnamed: 0,user_id,rating,opinion,resp,gmap_id,date,state
7322365,113144702125701769443,4,,,0x8626a55c96cf3ad7:0x1ac25809967319ca,2018-04-15,LA
386373,114172995184752944226,4,"Large variety of groceries, just a little high...",,0x8090514e30cc62fd:0x1c705c050b7a8153,2019-06-14,CA
2352127,100934675449763776509,5,"Amazing always! Food is fantastic, and Hannah...",Thanks so much for the kind words and for taki...,0x89b8c642e5b8ea4f:0xe02761f49ae0a112,2018-10-17,DE
3055510,113080597646207060523,5,Great seafood selection. Sushi is hand rolled ...,,0x54ae55d4185f214d:0xc3a8c64c8f2436b9,2018-03-20,ID
200445,105576052844902509927,4,"Extremely good, fresh burgers and unique sodas...",,0x80deacc98047f5cd:0x7a46c0b38507d346,2018-03-14,CA
5206163,106492660619351940670,5,I like the constant changing variety of items,,0x880e32ad4e72edef:0xc78e405246a62ad,2018-05-24,IL
5636326,108336874172464122473,3,Let me start off by saying that the food is pr...,,0x880fcd1daa59e491:0xa76e26d3487f36e5,2018-07-07,IL
4409794,107591931749135766408,5,"What can I say about Biscuit & Hogs, this plac...",Thank you so much. We truly appreciate this fe...,0x54ae514562efffff:0x2aa0406129467f70,2021-01-28,ID
8460058,109678238755965508413,5,Awesome place to go to very nice people to dea...,,0x87d963c6b9e47dad:0x945ac9933e2535f0,2019-06-15,MO
5347376,103608017734888016124,5,Great work highly recommended!,We are thrilled your experience with us was 5-...,0x880f05adbeb21dd3:0xf5de10d5c4e7b42e,2020-03-04,IL


In [8]:
# Save the combined DataFrame to a Parquet file; index=False leaves the index out of the file
combined_dataframe.to_parquet('review_estados_google_maps.parquet', index=False)