In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import nltk
from nltk.corpus import stopwords
import shutil

from argparse import Namespace
arg = Namespace()

import sys
import os

if os.path.abspath("../src") not in sys.path:
    sys.path.append(os.path.abspath("../src"))

import preprocessing as pp



In [4]:
df = pd.read_csv("../data/Rest-Mex_2025_train.csv")
df.head()

Unnamed: 0,Title,Review,Polarity,Town,Region,Type
0,Mi Lugar Favorito!!!!,Excelente lugar para comer y pasar una buena n...,5.0,Sayulita,Nayarit,Restaurant
1,lugares interesantes para visitar,"andar mucho, así que un poco difícil para pers...",4.0,Tulum,QuintanaRoo,Attractive
2,No es el mismo Dreams,"Es nuestra cuarta visita a Dreams Tulum, elegi...",3.0,Tulum,QuintanaRoo,Hotel
3,un buen panorama cerca de CancÃºn,"Estando en CancÃºn, fuimos al puerto y tomamos...",4.0,Isla_Mujeres,QuintanaRoo,Attractive
4,El mejor,Es un lugar antiguo y por eso me encanto tiene...,5.0,Patzcuaro,Michoacan,Hotel


### Preprocesado

In [5]:
# Arregla los mojibakes
print(df["Title"].iloc[3])
df['Title'] = df['Title'].fillna('').apply(pp.arregla_mojibake)
df['Review'] = df['Review'].fillna('').apply(pp.arregla_mojibake)
print(df["Title"].iloc[3])

un buen panorama cerca de CancÃºn
un buen panorama cerca de Cancún


In [6]:
# Quita stopwords
df['Texto_Limpio'] = (df['Title'].fillna('') + ' ' + df['Review'].fillna('')).apply(pp.quita_stopwords)

In [7]:
df_targetType = pd.concat([df["Texto_Limpio"], df["Type"]], axis = 1)
df_targetType.head()

Unnamed: 0,Texto_Limpio,Type
0,lugar favorito excelente lugar comer pasar bue...,Restaurant
1,lugares interesantes visitar andar así difícil...,Attractive
2,mismo dreams cuarta visita dreams tulum elegim...,Hotel
3,buen panorama cerca cancún cancún puerto tomam...,Attractive
4,mejor lugar antiguo encanto área juegos gigant...,Hotel


In [8]:
# Label Encoder
le = LabelEncoder()
le.fit(df_targetType["Type"])
print(le.classes_)

['Attractive' 'Hotel' 'Restaurant']


In [9]:
# Vectorizer TFIDF
arg.tfidf_max_features = 2500
arg.tfidf_ngram_range = (1, 2)
arg.token_pattern=r'(?u)\b[^\d\W]+\b'

tfidf_vectorizer = TfidfVectorizer(max_features=arg.tfidf_max_features, 
                                   ngram_range=arg.tfidf_ngram_range, 
                                   token_pattern=arg.token_pattern)
tfidf_vectorizer.fit(df_targetType["Texto_Limpio"])
tfidf_vectorizer.get_feature_names_out()

array(['abajo', 'abierta', 'abierto', ..., 'único', 'único inconveniente',
       'únicos'], dtype=object)

### Cargamos en un Dataset

In [11]:
class restaurantDataset(Dataset):
    def __init__(self, data : pd.DataFrame, vectorizer, label_encoder,
                 y : str, use_pca = False, pca = None, n_components = 3):
        """
        Initializes the restaurantDataset class.

        Args:
            data (pd.DataFrame): The input dataframe containing the text and labels.
            vectorizer (TfidfVectorizer): The TF-IDF vectorizer to transform text data.
            label_encoder (LabelEncoder): The label encoder to transform labels into numerical format.
            y (str): The column name in the dataframe representing the target labels.
            use_pca (bool, optional): Whether to apply PCA for dimensionality reduction. Default is False.
            pca (PCA, optional): The PCA object to use for dimensionality reduction. Required if use_pca is True.
            n_components (int, optional): The number of principal components to retain if PCA is applied. Default is 3.

        Attributes:
            data (pd.DataFrame): The input dataframe.
            n_samples (int): The number of samples in the dataset.
            X (torch.Tensor): The transformed feature matrix (TF-IDF or PCA-reduced).
            y (torch.Tensor): The transformed target labels.
            fitted_pca (PCA or None): The fitted PCA object if PCA is applied, otherwise None.

        Returns:
            None
        """
        self.data = data
        self.n_samples = len(data)
        
        # Transform text to TF-IDF vectors
        tfidf_matrix = vectorizer.transform(data["Texto_Limpio"])

        if use_pca:
            pca.fit(tfidf_matrix.toarray())
            self.X = torch.tensor(pca.transform(tfidf_matrix.toarray()), dtype=torch.float32)
            self.fitted_pca = pca
        else:
            self.X = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)
            self.fitted_pca = None
        
        # Transform labels to numbers
        self.y = torch.tensor(label_encoder.transform(data[y]),
                              dtype=torch.long)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
arg.use_pca = True
arg.pca_n_components = 3
arg.pca = PCA(n_components=arg.pca_n_components)

dataset = restaurantDataset(df_targetType, tfidf_vectorizer, le,
                               use_pca=arg.use_pca, pca=arg.pca,
                               n_components=arg.pca_n_components)

### DataLoader

In [40]:
# Separamos en conjuntos de entrenamiento y de prueba.
arg.random_state = 42
X_train, X_test, y_train, y_test = train_test_split(dataset.X, dataset.y, 
                                                    test_size=0.2, 
                                                    random_state=arg.random_state,
                                                    stratify=dataset.y)

train_data = torch.utils.data.TensorDataset(X_train, y_train)
test_data = torch.utils.data.TensorDataset(X_test, y_test)

print("Longitud del train set", len(train_data))
print("Longitud del test set", len(test_data))

# Dataloaders
arg.batch_size = 128
train_dataloader = DataLoader(train_data, batch_size=arg.batch_size)
test_dataloader = DataLoader(test_data, batch_size=arg.batch_size)

for i, (X, y) in enumerate(test_dataloader):
    print(f"Shape of X [N, C]: {X.shape} {X.dtype}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Longitud del train set 166440
Longitud del test set 41611
Shape of X [N, C]: torch.Size([128, 3]) torch.float32
Shape of y: torch.Size([128]) torch.int64


### Primer modelo - Red Neuronal

In [41]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

class PrimerModelo(nn.Module):
    def __init__(self, input_size, output_size=3):
        super(PrimerModelo, self).__init__()
        self.fc1 = nn.Linear(input_size, 8)
        self.fc2 = nn.Linear(8, 6)
        self.fc3 = nn.Linear(6, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return logits
    
    
model = PrimerModelo(input_size=3).to(device)
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

Using cuda device
PrimerModelo(
  (fc1): Linear(in_features=3, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=6, bias=True)
  (fc3): Linear(in_features=6, out_features=3, bias=True)
)


In [42]:
def train(dataloader, model, loss_fn, optimizer):
    """
    Entrena el modelo con los datos del dataloader y actualiza los pesos del modelo.
    
    Inputs:
    - dataloader: DataLoader con los datos de entrenamiento.
    - model: Modelo a entrenar.
    - loss_fn: Función de pérdida.
    - optimizer: Optimizador.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [43]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return 100*correct

In [44]:
def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pth",
                    best_filename="model_best.pth"):
    """
    Save the model checkpoint to the specified path.

    Args:
        state (dict): The state of the model to save, typically includes model weights and optimizer state.
        is_best (bool): If True, saves a copy of the checkpoint as "model_best.pth".
        checkpoint_path (str): The directory where the checkpoint will be saved.
        filename (str): The name of the checkpoint file. Default is "checkpoint.pth".

    Returns:
        None
    """
    filepath = os.path.join(checkpoint_path, filename)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint_path, best_filename))

In [45]:
# Training hyperparameters
arg.lr = 2.3e-1
arg.epochs = 100
arg.patience = 20

# Scheduler hyperparameters
arg.lr_patience = 10
arg.lr_factor = 0.5 # Se reduce el learning rate a la mitad cada 10 epochs
# sin mejorar el desempeño

# Saving directory
arg.savedir = "../model"
os.makedirs(arg.savedir, exist_ok=True)

# Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode = "max",
    patience=arg.lr_patience,
    factor=arg.lr_factor,
    verbose=True
)



In [46]:
epochs = 1000
best_metric = 0
n_no_improve = 0

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, criterion, optimizer)
    tuning_metric = test(test_dataloader, model, criterion)
    
    # Update the scheduler
    scheduler.step(tuning_metric)
    
    # Save model checkpoint
    is_best = tuning_metric > best_metric
    if is_best:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
    
    save_checkpoint({
        "epoch": t + 1,
        "state_dict": model.state_dict(),
        "best_metric": best_metric,
        "optimizer": optimizer.state_dict(),
    }, is_best, arg.savedir, 
                    filename="checkpoint_primerModeloGus.pth", 
                    best_filename="model-best_primerModeloGus.pth")
    
    if n_no_improve >= arg.patience:
        print("No improvement. Breaking out of loop.")
        break
    
print("Ya quedó.")

Epoch 1
-------------------------------
loss: 1.095197  [  128/166440]
loss: 0.573864  [12928/166440]
loss: 0.384915  [25728/166440]
loss: 0.340853  [38528/166440]
loss: 0.378220  [51328/166440]
loss: 0.424934  [64128/166440]
loss: 0.456884  [76928/166440]
loss: 0.367751  [89728/166440]
loss: 0.325119  [102528/166440]
loss: 0.422908  [115328/166440]
loss: 0.361905  [128128/166440]
loss: 0.391873  [140928/166440]
loss: 0.389938  [153728/166440]
loss: 0.238898  [52040/166440]
Test Error: 
 Accuracy: 87.9%, Avg loss: 0.349103 

Epoch 2
-------------------------------
loss: 0.396965  [  128/166440]
loss: 0.399555  [12928/166440]
loss: 0.362228  [25728/166440]
loss: 0.315517  [38528/166440]
loss: 0.360723  [51328/166440]
loss: 0.414383  [64128/166440]
loss: 0.463107  [76928/166440]
loss: 0.366968  [89728/166440]
loss: 0.318717  [102528/166440]
loss: 0.412827  [115328/166440]
loss: 0.353064  [128128/166440]
loss: 0.383943  [140928/166440]
loss: 0.365171  [153728/166440]
loss: 0.241760  [5204