Ejecutado desde VERTEX AI Notebooks Colab Enterprise

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import tempfile
from google.cloud import storage

### Google Reviews

Paths de Entrada y Salida

In [2]:
bucket_name = "json-bucket-datos"
input_folder_name = "Google/reviews-estados-parquet"
output_folder_name = "Google/ETL"
output_file_name = "reviews.parquet"
metadata_file_name = "metadata.parquet"

# Create the Google Cloud Storage client and look up the bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# Reference to the metadata file, which is read from the output (ETL) folder.
# Object names are joined with "/" explicitly: os.path.join would use the
# host OS separator, which is wrong for bucket object names on non-POSIX hosts.
read_blob_name = f"{output_folder_name}/{metadata_file_name}"
read_input_blob = bucket.blob(read_blob_name)

# Create a local temporary directory to work with the files
temp_dir = tempfile.mkdtemp()

# Download the metadata file into the temporary directory
read_input_file_path = os.path.join(temp_dir, metadata_file_name)
read_input_blob.download_to_filename(read_input_file_path)

# Reference to the Parquet object that will receive the transformed reviews
output_blob_name = f"{output_folder_name}/{output_file_name}"
output_blob = bucket.blob(output_blob_name)

# Local path where the transformed data is written before being uploaded
output_file_path = os.path.join(temp_dir, output_file_name)

Obtenemos los ids (`gmap_id`) del archivo de metadata

In [3]:
# Load the metadata Parquet file from its local path in the temporary directory
df_metadata = pd.read_parquet(read_input_file_path)

In [4]:
# Collect the gmap_id values from the metadata; clean_dataframe() uses them
# to decide which review rows to keep
gmap_ids = df_metadata['gmap_id'].tolist()


In [6]:
# Release the metadata DataFrame now that its gmap_id values have been extracted
del df_metadata

Función para limpiar cada archivo

In [19]:
def clean_dataframe(df, valid_ids=None):
    """Keep only the reviews of known places and the columns used downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews. Must contain the columns 'gmap_id', 'user_id', 'time',
        'rating', 'text' and 'resp'; any other column is dropped.
    valid_ids : list-like, optional
        gmap_id values to keep. When omitted, the notebook-level ``gmap_ids``
        list (built from the metadata file) is used, which matches the
        behaviour of calling ``clean_dataframe(df)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows, the six columns above, and a
        fresh 0..n-1 index. The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    if valid_ids is None:
        # Fall back to the ids extracted from the metadata file
        valid_ids = gmap_ids

    keep_columns = ['gmap_id', 'user_id', 'time', 'rating', 'text', 'resp']

    # Select rows and columns in a single .loc call and return a new frame
    # instead of resetting the index in place on an intermediate slice.
    return (
        df.loc[df['gmap_id'].isin(valid_ids), keep_columns]
        .reset_index(drop=True)
    )

Descargar y limpiar cada archivo

In [20]:
# List the objects stored under the input folder
blobs = bucket.list_blobs(prefix=input_folder_name)

# Cleaned DataFrames, one per downloaded Parquet file
parquet_data = []

for blob in blobs:
    if not blob.name.endswith(".parquet"):
        continue

    # Each file gets its own scratch directory under a dedicated name, so the
    # notebook-level `temp_dir` (which holds the metadata and output files) is
    # never rebound here. The context manager removes the directory and its
    # contents even if the download or the parsing raises.
    with tempfile.TemporaryDirectory() as download_dir:
        local_file_path = os.path.join(download_dir, os.path.basename(blob.name))
        blob.download_to_filename(local_file_path)

        # Read the Parquet file into pandas, clean it, and keep the result
        parquet_data.append(clean_dataframe(pd.read_parquet(local_file_path)))



In [21]:
# Number of cleaned DataFrames collected (one per processed .parquet blob)
len(parquet_data)

25

In [92]:
# Concatenate the per-file DataFrames into one frame with a fresh 0..n-1 index
df_reviews = pd.concat(parquet_data, ignore_index=True)

In [93]:
# Inspect dtypes and non-null counts of the combined reviews
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6901 entries, 0 to 6900
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   gmap_id  6901 non-null   object
 1   user_id  6901 non-null   object
 2   time     6901 non-null   int64 
 3   rating   6901 non-null   int64 
 4   text     4781 non-null   object
 5   resp     915 non-null    object
dtypes: int64(2), object(4)
memory usage: 323.6+ KB


In [94]:
# Convert 'time' from integer epoch milliseconds to pandas datetimes
df_reviews['time'] = pd.to_datetime(df_reviews['time'], unit='ms')

In [95]:
# Confirm that 'time' is now a datetime column
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6901 entries, 0 to 6900
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   gmap_id  6901 non-null   object        
 1   user_id  6901 non-null   object        
 2   time     6901 non-null   datetime64[ns]
 3   rating   6901 non-null   int64         
 4   text     4781 non-null   object        
 5   resp     915 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 323.6+ KB


In [96]:
# Keep only reviews dated from 2016 through 2020 (both years included).
# The explicit .copy() makes the result an independent DataFrame, so column
# assignments made on it later do not trigger SettingWithCopyWarning.
df_reviews = df_reviews[df_reviews['time'].dt.year.between(2016, 2020)].copy()

In [100]:
# Check row and non-null counts after the year filter
df_reviews.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6476 entries, 0 to 6900
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   gmap_id  6476 non-null   object        
 1   user_id  6476 non-null   object        
 2   time     6476 non-null   datetime64[ns]
 3   rating   6476 non-null   int64         
 4   text     4458 non-null   object        
 5   resp     845 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 354.2+ KB


In [99]:
def convert_time(resp):
    """Convert the 'time' field of a review response to a pandas timestamp.

    Parameters
    ----------
    resp : dict or missing value
        Response attached to a review. When it is a dict with a 'time' key,
        that value is interpreted as epoch milliseconds.

    Returns
    -------
    A new dict with 'time' converted via ``pd.to_datetime(..., unit='ms')``
    and every other key unchanged. Anything that is not a dict (None, NaN)
    or that has no 'time' key is returned as is.
    """
    # Missing responses can show up as None or as a float NaN; the isinstance
    # check covers both, whereas `'time' in resp` raises TypeError on a float.
    if not isinstance(resp, dict) or 'time' not in resp:
        return resp

    # Work on a shallow copy so the caller's dict is not mutated
    converted = dict(resp)
    converted['time'] = pd.to_datetime(resp['time'], unit='ms')
    return converted

# Apply the conversion to every value in the 'resp' column
df_reviews['resp'] = df_reviews['resp'].apply(convert_time)

In [101]:
# Renumber the rows 0..n-1, discarding the old index labels
df_reviews = df_reviews.reset_index(drop=True)

Guardar

In [None]:
# Write the DataFrame to a local Parquet file
table = pa.Table.from_pandas(df_reviews)
pq.write_table(table, output_file_path)

# Upload the resulting Parquet file to the output folder in the bucket
output_blob.upload_from_filename(output_file_path)

# Clean up the scratch directory. It is derived from output_file_path rather
# than read from the `temp_dir` name, which other cells may rebind. The
# downloaded metadata file lives in the same directory and must be removed
# too, otherwise os.rmdir fails because the directory is not empty.
scratch_dir = os.path.dirname(output_file_path)
for leftover_path in (output_file_path, read_input_file_path):
    if os.path.exists(leftover_path):
        os.remove(leftover_path)
os.rmdir(scratch_dir)