In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import re
import string
import nltk
from nltk.corpus import stopwords
import shutil

from argparse import Namespace
arg = Namespace()

import sys
import os

if os.path.abspath("../src") not in sys.path:
    sys.path.append(os.path.abspath("../src"))

import preprocessing as pp
import config
import importlib

importlib.reload(pp)

<module 'preprocessing' from 'c:\\Users\\Gus\\Documents\\proyectos\\REST-MEX-2025\\src\\preprocessing.py'>

In [2]:
train_set = pd.read_csv(config.TRAIN_FILE)
train_set.head()

Unnamed: 0,Title,Review,Polarity,Town,Region,Type
0,Mi Lugar Favorito!!!!,Excelente lugar para comer y pasar una buena n...,5.0,Sayulita,Nayarit,Restaurant
1,lugares interesantes para visitar,"andar mucho, así que un poco difícil para pers...",4.0,Tulum,QuintanaRoo,Attractive
2,No es el mismo Dreams,"Es nuestra cuarta visita a Dreams Tulum, elegi...",3.0,Tulum,QuintanaRoo,Hotel
3,un buen panorama cerca de CancÃºn,"Estando en CancÃºn, fuimos al puerto y tomamos...",4.0,Isla_Mujeres,QuintanaRoo,Attractive
4,El mejor,Es un lugar antiguo y por eso me encanto tiene...,5.0,Patzcuaro,Michoacan,Hotel


In [37]:
X = train_set.drop(columns=config.TARGETS)
y1 = train_set[config.TARGET1] # Polarity
y2 = train_set[config.TARGET2] # Town
y3 = train_set[config.TARGET3] # Type

X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y3, test_size=0.2, random_state=42)

### Preprocesado

In [27]:
def preproccess_pipeline():
    """
    This function defines a preprocessing pipeline for text data.

    Steps in the pipeline:
    1. 'Arreglar mojibakes': Fixes mojibake issues in the specified text columns.
    2. 'Minúsculas y quitar stopwords': Converts text to lowercase and removes stopwords from the specified text columns.
    3. 'Quitar features no deseadas': Drops unwanted features from the specified text columns.

    Args:
        classifier: The classifier model to be used (not utilized in the pipeline itself).

    Returns:
        loan_pipe: A scikit-learn Pipeline object that applies the preprocessing steps sequentially.
    """
    loan_pipe = Pipeline(
    [
        ('Arreglar mojibakes', pp.ArreglaMojibake(config.TEXT_COLUMNS)),
        ("Minúsculas y quitar stopwords", pp.QuitaStopwords(config.TEXT_COLUMNS)),
        ("Guardar en una columna", pp.JuntarFeatures(config.TEXT_COLUMNS, config.NEW_COLUMN)),
        ("Quitar features no deseadas", pp.DropFeatures(config.TEXT_COLUMNS))
    ]
    )
    return loan_pipe

In [38]:
X1_train = preproccess_pipeline().fit_transform(X1_train)
X2_train = preproccess_pipeline().fit_transform(X2_train)
X3_train = preproccess_pipeline().fit_transform(X3_train)

In [41]:
# Label Encoder
le = LabelEncoder()
y3_train

70541     Attractive
186074    Attractive
124863    Attractive
79954          Hotel
176039    Restaurant
             ...    
119879    Restaurant
103694    Attractive
131932    Attractive
146867         Hotel
121958    Restaurant
Name: Type, Length: 166440, dtype: object

In [7]:
# Vectorizer TFIDF
arg.tfidf_max_features = 2500
arg.tfidf_ngram_range = (1, 2)
arg.token_pattern=r'(?u)\b[^\d\W]+\b'

tfidf_vectorizer = TfidfVectorizer(max_features=arg.tfidf_max_features, 
                                   ngram_range=arg.tfidf_ngram_range, 
                                   token_pattern=arg.token_pattern)
tfidf_vectorizer.fit(df_targetType["Texto_Limpio"])
tfidf_vectorizer.get_feature_names_out()

array(['abajo', 'abierta', 'abierto', ..., 'único', 'único inconveniente',
       'únicos'], dtype=object)

### Cargamos en un Dataset

In [16]:
class restaurantDataset(Dataset):
    def __init__(self, data : pd.DataFrame, vectorizer, label_encoder,
                 y : str, use_pca = False, pca = None, n_components = 3,
                 standardize = False):
        """
        Initializes the restaurantDataset class.

        Args:
            data (pd.DataFrame): The input dataframe containing the text and labels.
            vectorizer (TfidfVectorizer): The TF-IDF vectorizer to transform text data.
            label_encoder (LabelEncoder): The label encoder to transform labels into numerical format.
            y (str): The column name in the dataframe representing the target labels.
            use_pca (bool, optional): Whether to apply PCA for dimensionality reduction. Default is False.
            pca (PCA, optional): The PCA object to use for dimensionality reduction. Required if use_pca is True.
            n_components (int, optional): The number of principal components to retain if PCA is applied. Default is 3.

        Attributes:
            data (pd.DataFrame): The input dataframe.
            n_samples (int): The number of samples in the dataset.
            X (torch.Tensor): The transformed feature matrix (TF-IDF or PCA-reduced).
            y (torch.Tensor): The transformed target labels.
            fitted_pca (PCA or None): The fitted PCA object if PCA is applied, otherwise None.

        Returns:
            None
        """
        self.data = data
        self.n_samples = len(data)
        
        # Transform text to TF-IDF vectors
        tfidf_matrix = vectorizer.transform(data["Texto_Limpio"])

        if use_pca:
            pca.fit(tfidf_matrix.toarray())
            self.X = torch.tensor(pca.transform(tfidf_matrix.toarray()), dtype=torch.float32)
            self.fitted_pca = pca
        else:
            self.X = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)
            self.fitted_pca = None
        
        if standardize:
            scaler = StandardScaler()
            self.X = torch.tensor(scaler.fit_transform(self.X), dtype=torch.float32)
        
        
        # Transform labels to numbers
        self.y = torch.tensor(label_encoder.transform(data[y]),
                              dtype=torch.long)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [18]:
arg.use_pca = True
arg.pca_n_components = 3
arg.pca = PCA(n_components=arg.pca_n_components)
arg.standardize = True
arg.y = "Type"

dataset = restaurantDataset(df_targetType, tfidf_vectorizer, le,
                               use_pca=arg.use_pca, pca=arg.pca,
                               n_components=arg.pca_n_components,
                               standardize=arg.standardize,
                               y=arg.y)

### DataLoader

In [19]:
# Separamos en conjuntos de entrenamiento y de prueba.
arg.random_state = 42
X_train, X_test, y_train, y_test = train_test_split(dataset.X, dataset.y, 
                                                    test_size=0.2, 
                                                    random_state=arg.random_state,
                                                    stratify=dataset.y)

train_data = torch.utils.data.TensorDataset(X_train, y_train)
test_data = torch.utils.data.TensorDataset(X_test, y_test)

print("Longitud del train set", len(train_data))
print("Longitud del test set", len(test_data))

# Dataloaders
arg.batch_size = 128
train_dataloader = DataLoader(train_data, batch_size=arg.batch_size)
test_dataloader = DataLoader(test_data, batch_size=arg.batch_size)

for i, (X, y) in enumerate(test_dataloader):
    print(f"Shape of X [N, C]: {X.shape} {X.dtype}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Longitud del train set 166440
Longitud del test set 41611
Shape of X [N, C]: torch.Size([128, 3]) torch.float32
Shape of y: torch.Size([128]) torch.int64


### Primer modelo - Red Neuronal

In [20]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

class PrimerModelo(nn.Module):
    def __init__(self, input_size, output_size=3):
        super(PrimerModelo, self).__init__()
        self.fc1 = nn.Linear(input_size, 8)
        self.fc2 = nn.Linear(8, 6)
        self.fc3 = nn.Linear(6, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return logits
    
    
model = PrimerModelo(input_size=3).to(device)
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

Using cuda device
PrimerModelo(
  (fc1): Linear(in_features=3, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=6, bias=True)
  (fc3): Linear(in_features=6, out_features=3, bias=True)
)


In [21]:
def train(dataloader, model, loss_fn, optimizer):
    """
    Entrena el modelo con los datos del dataloader y actualiza los pesos del modelo.
    
    Inputs:
    - dataloader: DataLoader con los datos de entrenamiento.
    - model: Modelo a entrenar.
    - loss_fn: Función de pérdida.
    - optimizer: Optimizador.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [22]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return 100*correct

In [26]:
def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pth",
                    best_filename="model_best.pth"):
    """
    Save the model checkpoint to the specified path.

    Args:
        state (dict): The state of the model to save, typically includes model weights and optimizer state.
        is_best (bool): If True, saves a copy of the checkpoint as "model_best.pth".
        checkpoint_path (str): The directory where the checkpoint will be saved.
        filename (str): The name of the checkpoint file. Default is "checkpoint.pth".

    Returns:
        None
    """
    filepath = os.path.join(checkpoint_path, filename)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint_path, best_filename))

In [25]:
# Training hyperparameters
arg.lr = 2.3e-1
arg.epochs = 100
arg.patience = 20

# Scheduler hyperparameters
arg.lr_patience = 10
arg.lr_factor = 0.5 # Se reduce el learning rate a la mitad cada 10 epochs
# sin mejorar el desempeño

# Saving directory
arg.savedir = "../model"
os.makedirs(arg.savedir, exist_ok=True)

# Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode = "max",
    patience=arg.lr_patience,
    factor=arg.lr_factor,
    verbose=True
)



In [27]:
epochs = 1000
best_metric = 0
n_no_improve = 0

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, criterion, optimizer)
    tuning_metric = test(test_dataloader, model, criterion)
    
    # Update the scheduler
    scheduler.step(tuning_metric)
    
    # Save model checkpoint
    is_best = tuning_metric > best_metric
    if is_best:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
    
    save_checkpoint({
        "epoch": t + 1,
        "state_dict": model.state_dict(),
        "best_metric": best_metric,
        "optimizer": optimizer.state_dict(),
    }, is_best, arg.savedir, 
                    filename="checkpoint_primerModeloGus_std.pth", 
                    best_filename="model-best_primerModeloGus_std.pth")
    
    if n_no_improve >= arg.patience:
        print("No improvement. Breaking out of loop.")
        break
    
print("Ya quedó.")

Epoch 1
-------------------------------
loss: 1.099262  [  128/166440]
loss: 0.397366  [12928/166440]
loss: 0.335156  [25728/166440]
loss: 0.309486  [38528/166440]
loss: 0.328082  [51328/166440]
loss: 0.407508  [64128/166440]
loss: 0.437661  [76928/166440]
loss: 0.358074  [89728/166440]
loss: 0.344479  [102528/166440]
loss: 0.373660  [115328/166440]
loss: 0.351226  [128128/166440]
loss: 0.395701  [140928/166440]
loss: 0.343542  [153728/166440]
loss: 0.240045  [52040/166440]
Test Error: 
 Accuracy: 88.2%, Avg loss: 0.335335 

Epoch 2
-------------------------------
loss: 0.398991  [  128/166440]
loss: 0.388903  [12928/166440]
loss: 0.329759  [25728/166440]
loss: 0.301891  [38528/166440]
loss: 0.313728  [51328/166440]
loss: 0.411297  [64128/166440]
loss: 0.437359  [76928/166440]
loss: 0.361498  [89728/166440]
loss: 0.347695  [102528/166440]
loss: 0.374444  [115328/166440]
loss: 0.343224  [128128/166440]
loss: 0.400902  [140928/166440]
loss: 0.347098  [153728/166440]
loss: 0.244317  [5204