Preprocesamiento para una Muestra Aleatoria de 3,000 Datos

In [31]:
import pandas as pd
import re
from transformers import DistilBertTokenizer

In [32]:
# Load the source reviews DataFrame from the Parquet file named below.
file_path = "reviews_cleaned_2020_2022.parquet"
df = pd.read_parquet(path=file_path)

In [33]:
# Draw a random sample of 3,000 rows; the fixed seed (42) makes the
# selection repeatable across runs.
sample_df = df.sample(
    random_state=42,
    n=3000,
)

In [34]:
# Text preprocessing
def preprocess_text(text):
    """Normalize a raw review string.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed (digits, punctuation and also accented / non-ASCII
    letters), and runs of whitespace are collapsed to a single space.

    Args:
        text: The raw text. Missing or non-string values (None, NaN, ...)
            are accepted and treated as empty text.

    Returns:
        The cleaned string, or "" when `text` is not a string.
    """
    # Missing cells reach this function as None or float NaN, which have no
    # .lower(); treat them as empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    lowered = text.lower()
    # Remove special characters and digits (anything but ASCII letters/whitespace)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    # Collapse repeated whitespace and trim both ends
    return re.sub(r"\s+", " ", letters_only).strip()

# Overwrite the 'text_clean' column with its normalized version,
# applying preprocess_text element by element.
sample_df['text_clean'] = sample_df['text_clean'].map(preprocess_text)

In [35]:
# Convert star ratings into sentiment labels
def map_sentiment(stars):
    """Map a star rating to a sentiment label.

    Args:
        stars: Numeric star rating (int or float). May be missing
            (None / NaN / pd.NA).

    Returns:
        "negative" for ratings up to 2, "neutral" for ratings above 2 and up
        to 3, "positive" for anything higher, or None when the rating is
        missing.
    """
    # A missing rating fails every comparison below and would otherwise fall
    # through to "positive"; report it as an unlabeled row instead.
    if pd.isna(stars):
        return None
    if stars <= 2:
        return "negative"
    # `<= 3` (not `== 3`) keeps the mapping monotonic: a fractional rating
    # such as 2.5 must not be labeled more positively than a 3.
    if stars <= 3:
        return "neutral"
    return "positive"

# Derive the 'sentiment' label column from the 'stars' rating of each row.
sample_df['sentiment'] = sample_df['stars'].map(map_sentiment)


In [39]:
# Initialize the tokenizer from the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased"
)


In [40]:
# Tokenize a text and keep only the token ids and the attention mask as lists
def tokenize_text(text, max_length=128):
    """Tokenize one string with the module-level `tokenizer`.

    The text is truncated and padded to exactly `max_length` tokens.

    Args:
        text: The string to tokenize.
        max_length: Fixed sequence length used for truncation and padding.
            Defaults to 128.

    Returns:
        A dict with two entries, each a plain Python list of ints:
        'input_ids' and 'attention_mask'.
    """
    # No `return_tensors`: for a single string the tokenizer then returns
    # plain Python lists, so there is no need to build a (1, max_length)
    # tensor only to squeeze it and convert it back to a list (which also
    # required torch to be installed).
    encoding = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    return {
        'input_ids': list(encoding['input_ids']),
        'attention_mask': list(encoding['attention_mask']),
    }

In [41]:
# Tokenize every cleaned text; each cell of the new 'tokens' column holds the
# dict returned by tokenize_text for that row.
sample_df['tokens'] = sample_df['text_clean'].map(tokenize_text)

In [42]:
# Final inspection: print the first five rows of the processed sample.
print(sample_df.head(n=5))

                            id                 user_id  stars  \
550654                  855152   109830888913186427251    3.0   
77703   9MJaVPd7_UnaKaENGPGQaA  1DBepcTVu6Vymj8Cq5Qfxw    5.0   
404019                   24418   110434499591530608348    4.0   
163501  CWnjt0cQqYkJQsANOkSkew  SvQVq_-qneq_UCDy5jiaWA    5.0   
88496   VKN9P3PtPQqWxaOEOQQuqw  4lQeJ5-1pPORCvbGKwBDQA    5.0   

                                                     text  \
550654  Be prepared to wait, so you end up drinking a ...   
77703   This was so yummy! Everything I had was so goo...   
404019  Usually the cheapest gas in the area and surro...   
163501  JAMIE & CHIEF TIFFANY are so great at what the...   
88496   Now that's what you call a Po Boy!!! I ordered...   

                          date origin id_business  \
550654 2020-01-26 12:55:06.114      G       31267   
77703  2020-01-17 02:37:50.000      Y       45271   
404019 2021-04-20 05:31:02.877      G       32023   
163501 2021-06-27 19:36:35.000

In [None]:
# Save the preprocessed DataFrame to a Parquet file, without the index.
sample_df.to_parquet(path="sample_preprocessed.parquet", index=False)

In [46]:
# Save the preprocessed DataFrame to a CSV file, without the index.
# NOTE(review): the 'tokens' column holds dicts, which CSV can only store as
# text — presumably they must be parsed back when reloading; confirm.
sample_df.to_csv(path_or_buf="sample_preprocessed.CSV", index=False)