In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import tqdm
import re
import string
import nltk
from nltk.corpus import stopwords
import shutil

from argparse import Namespace
arg = Namespace()

import sys
import os

if os.path.abspath("../src") not in sys.path:
    sys.path.append(os.path.abspath("../src"))

import preprocessing as pp
import config
import importlib

importlib.reload(pp)

<module 'preprocessing' from 'c:\\Users\\Gus\\Documents\\proyectos\\REST-MEX-2025\\src\\preprocessing.py'>

In [2]:
train_set = pd.read_csv(config.TRAIN_FILE)
train_set.head()

Unnamed: 0,Title,Review,Polarity,Town,Region,Type
0,Mi Lugar Favorito!!!!,Excelente lugar para comer y pasar una buena n...,5.0,Sayulita,Nayarit,Restaurant
1,lugares interesantes para visitar,"andar mucho, así que un poco difícil para pers...",4.0,Tulum,QuintanaRoo,Attractive
2,No es el mismo Dreams,"Es nuestra cuarta visita a Dreams Tulum, elegi...",3.0,Tulum,QuintanaRoo,Hotel
3,un buen panorama cerca de CancÃºn,"Estando en CancÃºn, fuimos al puerto y tomamos...",4.0,Isla_Mujeres,QuintanaRoo,Attractive
4,El mejor,Es un lugar antiguo y por eso me encanto tiene...,5.0,Patzcuaro,Michoacan,Hotel


In [3]:
X = train_set.drop(columns=config.TARGETS)
y1 = train_set[config.TARGET1] # Polarity
y2 = train_set[config.TARGET2] # Town
y3 = train_set[config.TARGET3] # Type

X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42, stratify=y1)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42, stratify=y2)
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y3, test_size=0.2, random_state=42, stratify=y3)

### Preprocesado

In [4]:
X3_train = pp.pp_pipeline.fit_transform(X3_train)

In [11]:
X3_train

Unnamed: 0,Texto_Limpio
52557,tan maravill jueg disneyland lleg azulik tod e...
706,visit isla janitzi maravill ir descubr hermos ...
75004,desayun estupend frut tortit huev ofrec dol fa...
6629,excelent cad vez visit oblig cad vez vam sayul...
200700,hotel agrad excelent ubic desayun gratis bonit...
...,...
77286,total perd disfrut asi dirig cancun chich itza...
50959,normal ser numer tripadvisor esper cos mal ve ...
7066,mejor mexican cenot simplement increibl hesist...
145830,sueñ realid resort inclu simplement increibl p...


In [6]:
df3 = pd.concat([X3_train, y3_train], axis=1)

In [7]:
# Vectorizer TFIDF
arg.tfidf_max_features = 5000
arg.tfidf_ngram_range = (1, 2)
arg.token_pattern=r'(?u)\b[^\d\W]+\b'

tfidf_vectorizer = TfidfVectorizer(max_features=arg.tfidf_max_features, 
                                   ngram_range=arg.tfidf_ngram_range, 
                                   token_pattern=arg.token_pattern)


### Cargamos en un Dataset

In [8]:
class restaurantDataset(Dataset):
    def __init__(self, data : pd.DataFrame, vectorizer, label_encoder,
                 y : str, standardize = False):
        """
        Initializes the restaurantDataset class.

        Args:
            data (pd.DataFrame): The input dataframe containing the text and labels.
            vectorizer (TfidfVectorizer): The TF-IDF vectorizer to transform text data.
            label_encoder (LabelEncoder): The label encoder to transform labels into numerical format.
            y (str): The column name in the dataframe representing the target labels.
            use_pca (bool, optional): Whether to apply PCA for dimensionality reduction. Default is False.
            pca (PCA, optional): The PCA object to use for dimensionality reduction. Required if use_pca is True.
            n_components (int, optional): The number of principal components to retain if PCA is applied. Default is 3.

        Attributes:
            data (pd.DataFrame): The input dataframe.
            n_samples (int): The number of samples in the dataset.
            X (torch.Tensor): The transformed feature matrix (TF-IDF or PCA-reduced).
            y (torch.Tensor): The transformed target labels.
            fitted_pca (PCA or None): The fitted PCA object if PCA is applied, otherwise None.

        Returns:
            None
        """
        self.data = data
        self.n_samples = len(data)
        
        # Transform text to TF-IDF vectors
        tfidf_matrix = vectorizer.transform(data[config.NEW_COLUMN])
        self.X = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)

        if standardize:
            scaler = StandardScaler()
            self.X_np = self.X.numpy()
            self.X = torch.tensor(scaler.fit_transform(self.X_np), dtype=torch.float32)
        
        # Transform labels to numbers
        self.y = torch.tensor(label_encoder.fit_transform(data[y]),
                              dtype=torch.long)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [9]:
arg.standardize = True
arg.y = "Type"

tfidf_vectorizer.fit(df3[config.NEW_COLUMN])
type_label_encoder = LabelEncoder()

dataset = restaurantDataset(df3, tfidf_vectorizer, type_label_encoder, arg.y,
                            arg.standardize)

### DataLoader

In [12]:
# Separamos en conjuntos de entrenamiento y de prueba.
arg.random_state = 42
X_train, X_test, y_train, y_test = train_test_split(dataset.X, dataset.y, 
                                                    test_size=0.2, 
                                                    random_state=arg.random_state,
                                                    stratify=dataset.y)

train_data = torch.utils.data.TensorDataset(X_train, y_train)
test_data = torch.utils.data.TensorDataset(X_test, y_test)

print("Longitud del train set", len(train_data))
print("Longitud del test set", len(test_data))

# Dataloaders
arg.batch_size = 128
train_dataloader = DataLoader(train_data, batch_size=arg.batch_size)
test_dataloader = DataLoader(test_data, batch_size=arg.batch_size)

for i, (X, y) in enumerate(test_dataloader):
    print(f"Shape of X [N, C]: {X.shape} {X.dtype}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Longitud del train set 133152
Longitud del test set 33288
Shape of X [N, C]: torch.Size([128, 5000]) torch.float32
Shape of y: torch.Size([128]) torch.int64


### Primer modelo - Red Neuronal

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

class TfidfClassifier(nn.Module):
    def __init__(self, input_size, output_size=3, dropout=0.3):
        super(TfidfClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return logits

arg.dropout = 0.2
arg.lr = 0.01
model = TfidfClassifier(input_size=5000, dropout=arg.dropout).to(device)
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=arg.lr)

Using cuda device
TfidfClassifier(
  (fc1): Linear(in_features=5000, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
)


In [14]:
def train(dataloader, model, loss_fn, optimizer):
    """
    Entrena el modelo con los datos del dataloader y actualiza los pesos del modelo.
    
    Inputs:
    - dataloader: DataLoader con los datos de entrenamiento.
    - model: Modelo a entrenar.
    - loss_fn: Función de pérdida.
    - optimizer: Optimizador.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [15]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return 100*correct

In [16]:
def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pth",
                    best_filename="model_best.pth"):
    """
    Save the model checkpoint to the specified path.

    Args:
        state (dict): The state of the model to save, typically includes model weights and optimizer state.
        is_best (bool): If True, saves a copy of the checkpoint as "model_best.pth".
        checkpoint_path (str): The directory where the checkpoint will be saved.
        filename (str): The name of the checkpoint file. Default is "checkpoint.pth".

    Returns:
        None
    """
    filepath = os.path.join(checkpoint_path, filename)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint_path, best_filename))

In [17]:
# Training hyperparameters
arg.lr = 1e-2
arg.epochs = 100
arg.patience = 20

# Scheduler hyperparameters
arg.lr_patience = 10
arg.lr_factor = 0.5 # Se reduce el learning rate a la mitad cada 10 epochs
# sin mejorar el desempeño

# Saving directory
arg.savedir = "../model"
os.makedirs(arg.savedir, exist_ok=True)

# Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode = "max",
    patience=arg.lr_patience,
    factor=arg.lr_factor
)

In [18]:
epochs = 1000
best_metric = 0
n_no_improve = 0

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, criterion, optimizer)
    tuning_metric = test(test_dataloader, model, criterion)
    
    # Update the scheduler
    scheduler.step(tuning_metric)
    
    # Save model checkpoint
    is_best = tuning_metric > best_metric
    if is_best:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
    
    save_checkpoint({
        "epoch": t + 1,
        "state_dict": model.state_dict(),
        "best_metric": best_metric,
        "optimizer": optimizer.state_dict(),
    }, is_best, arg.savedir, 
                    filename="checkpoint_on_stemm_5000_tfidf.pth", 
                    best_filename="model-best_on_stemm_5000_tfidf.pth")
    
    if n_no_improve >= arg.patience:
        print("No improvement. Breaking out of loop.")
        break
    
print("Ya quedó.")

Epoch 1
-------------------------------
loss: 1.098794  [  128/133152]
loss: 0.288518  [12928/133152]
loss: 0.263353  [25728/133152]
loss: 0.145743  [38528/133152]
loss: 0.210210  [51328/133152]
loss: 0.113453  [64128/133152]
loss: 0.277052  [76928/133152]
loss: 0.193696  [89728/133152]
loss: 0.302888  [102528/133152]
loss: 0.275967  [115328/133152]
loss: 0.221592  [128128/133152]
Test Error: 
 Accuracy: 91.9%, Avg loss: 0.229724 

Epoch 2
-------------------------------
loss: 0.290687  [  128/133152]
loss: 0.200542  [12928/133152]
loss: 0.198060  [25728/133152]
loss: 0.054287  [38528/133152]
loss: 0.159539  [51328/133152]
loss: 0.069764  [64128/133152]
loss: 0.449665  [76928/133152]
loss: 0.099574  [89728/133152]
loss: 0.555347  [102528/133152]
loss: 0.409924  [115328/133152]
loss: 0.295210  [128128/133152]
Test Error: 
 Accuracy: 88.6%, Avg loss: 0.773888 

Epoch 3
-------------------------------
loss: 0.660970  [  128/133152]
loss: 0.349860  [12928/133152]
loss: 0.284247  [25728/133

KeyboardInterrupt: 

In [59]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

logreg = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver="liblinear"))
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred, target_names=type_label_encoder.classes_))

              precision    recall  f1-score   support

  Attractive       0.95      0.96      0.95     11187
       Hotel       0.95      0.92      0.93      8226
  Restaurant       0.95      0.96      0.95     13875

    accuracy                           0.95     33288
   macro avg       0.95      0.95      0.95     33288
weighted avg       0.95      0.95      0.95     33288



In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import metrics

# Parámetro de complejidad del SVM, se proponen estos
# y se recorrerán con GridSearch
parameters = {"C": [.05, .12, .25, .5, 1, 2, 4]}   
# Tratar de penalizar con base a la proporción de ejemplos
# en cada clase
svr = svm.LinearSVC(class_weight='balanced', max_iter=10000)
grid = GridSearchCV(estimator=svr, param_grid=parameters,
                     n_jobs=6, scoring="f1_macro", cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(f"F1-score: {f1_score(y_test, y_pred, pos_label='1'):.4f}")

