In [1]:
import os
import json
import numpy as np
import pandas as pd

In [3]:
# Folder that contains the JSON metadata files.
folder_data = "C:/Users/54280/Documents/GitHub/proyectogrupal/Pruebas/metadata-sitios"

# One DataFrame per JSON file found in the folder.
data_frames = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of combined_data identical on every run and on every platform.
for filename in sorted(os.listdir(folder_data)):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_data, filename)
        # The files are read as JSON Lines (lines=True): one record per line.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = pd.read_json(file, lines=True)
        data_frames.append(data)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; raise the same exception type with a message that names the folder.
if not data_frames:
    raise ValueError(f"No .json files found in {folder_data!r}")

# Stack every per-file frame into a single frame with a fresh 0..n-1 index.
combined_data = pd.concat(data_frames, ignore_index=True)



In [7]:
# Count how many times each category occurs in combined_data.
# The 'category' column is exploded (one row per list element) in batches so
# the fully exploded column is never held in memory at once.
batch_size = 10000  # Rows per batch

# Ceiling division: the exact number of batches needed. The previous
# `len // batch_size + 1` scheduled an extra, empty batch whenever the row
# count was an exact multiple of batch_size.
num_batches = (len(combined_data) + batch_size - 1) // batch_size

# Per-batch count tables, each with columns ['category', 'count'].
result_dfs = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    batch_df = combined_data.iloc[start_idx:end_idx]  # Current batch

    # Only the 'category' column is needed, so explode just that Series
    # instead of duplicating every other column of the batch.
    # value_counts() drops missing values by default.
    batch_counts = batch_df['category'].explode().value_counts()

    # Turn the counts into a two-column frame. Assigning the column names
    # explicitly keeps this independent of how reset_index() labels them.
    batch_counts_df = batch_counts.reset_index()
    batch_counts_df.columns = ['category', 'count']
    result_dfs.append(batch_counts_df)

if result_dfs:
    # A category that occurs in several batches has one partial count per
    # batch; sum them so the result has exactly one row per category, then
    # order from most to least frequent with a clean 0..n-1 index.
    final_result_df = (
        pd.concat(result_dfs, ignore_index=True)
        .groupby('category', as_index=False)['count']
        .sum()
        .sort_values('count', ascending=False, ignore_index=True)
    )
else:
    # combined_data has no rows: return an empty table with the same columns
    # rather than letting pd.concat() fail on an empty list.
    final_result_df = pd.DataFrame(columns=['category', 'count'])

In [9]:
# Persist the category counts as CSV text. The target file is named
# 'LISTADO_CATEGORIAS' (no extension) and is resolved relative to the
# current working directory; the frame's index is written as the first column.
final_result_df.to_csv(path_or_buf='LISTADO_CATEGORIAS')

In [4]:
# Preview the first two rows of the combined metadata.
combined_data.iloc[:2]

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Porter Pharmacy,"Porter Pharmacy, 129 N Second St, Cochran, GA ...",0x88f16e41928ff687:0x883dad4fd048e8f8,,32.3883,-83.3571,[Pharmacy],4.9,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...","{'Service options': ['In-store shopping', 'Sam...",Open ⋅ Closes 6PM,"[0x88f16e41929435cf:0x5b2532a2885e9ef6, 0x88f1...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.21529,[Textile exporter],4.5,6,,,,Open now,"[0x80c2c624136ea88b:0xb0315367ed448771, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...


In [8]:
# Print a structural summary of combined_data: index range, column names,
# dtypes and memory usage.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3025011 entries, 0 to 3025010
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   name              object 
 1   address           object 
 2   gmap_id           object 
 3   description       object 
 4   latitude          float64
 5   longitude         float64
 6   category          object 
 7   avg_rating        float64
 8   num_of_reviews    int64  
 9   price             object 
 10  hours             object 
 11  MISC              object 
 12  state             object 
 13  relative_results  object 
 14  url               object 
dtypes: float64(3), int64(1), object(11)
memory usage: 346.2+ MB


In [9]:
# Number of missing values in each column (isna is the same method as isnull).
combined_data.isna().sum()

name                     37
address               80511
gmap_id                   0
description         2770722
latitude                  0
longitude                 0
category              17419
avg_rating                0
num_of_reviews            0
price               2749808
hours                787405
MISC                 690834
state                746455
relative_results     295058
url                       0
dtype: int64

In [10]:
# Save the combined metadata as a Parquet file in the current working directory.
combined_data.to_parquet(path='metadata_etl_gmap.parquet')