# Fine-tuning con Transformers
Usaremos un modelo Transformer preentrenado para clasificar sentimiento binario y compararemos su desempeño y costos con la red neuronal liviana desarrollada previamente.

In [None]:
!pip -q install transformers datasets accelerate evaluate

In [None]:
import pathlib
import inspect
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

In [None]:
# Experiment configuration: dataset location, pretrained checkpoint and split settings.
DATA_PATH = pathlib.Path("..", "data", "processed", "cleaned_sentiment_data.csv")
# NOTE(review): the original comment described this checkpoint as a lighter
# model suited to fine-tuning on a small GPU — confirm that the base-size BERT
# checkpoint is the intended choice.
MODEL_NAME = "bert-base-uncased"
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load the cleaned reviews, discard rows that lack the text or the label,
# and store the label column as integers.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["cleaned_review", "sentiment"])
    .astype({"sentiment": int})
)
# Preview the first rows of the two columns used downstream.
df.head()[["cleaned_review", "sentiment"]]

In [None]:
# Stratified hold-out split on the label column; each part gets a fresh
# 0..n-1 index before being wrapped as a `datasets.Dataset`.
train_df, eval_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df[["cleaned_review", "sentiment"]],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=df["sentiment"],
    )
)

train_dataset, eval_dataset = map(Dataset.from_pandas, (train_df, eval_df))

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(batch):
    """Tokenize the ``cleaned_review`` texts of one batch.

    Args:
        batch: Mapping with a ``"cleaned_review"`` entry holding the texts of
            the batch (the function is applied with ``batched=True``).

    Returns:
        The tokenizer output for those texts, truncated to 128 tokens and
        left unpadded.
    """
    texts = batch["cleaned_review"]
    # Padding is skipped here; the sequence length is capped to ease memory use.
    options = {"truncation": True, "padding": False, "max_length": 128}
    return tokenizer(texts, **options)


# Tokenize the training split first, then the evaluation split.
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Show the tokenized training split as this cell's output.
tokenized_train

In [None]:
# Remove unneeded helper columns, but only the ones that are actually present.
cols_to_drop = ["__index_level_0__"]
existing_cols = [
    name for name in cols_to_drop if name in tokenized_train.column_names
]
if existing_cols:
    tokenized_train, tokenized_eval = (
        split.remove_columns(existing_cols)
        for split in (tokenized_train, tokenized_eval)
    )

# Expose the target column under the name "labels".
tokenized_train, tokenized_eval = (
    split.rename_column("sentiment", "labels")
    for split in (tokenized_train, tokenized_eval)
)
# Have both splits return torch-formatted values (set in place).
tokenized_train.set_format("torch")
tokenized_eval.set_format("torch")

# Batches are padded by this collator, since tokenization left them unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    accuracy_result = accuracy_metric.compute(
        predictions=predicted_classes,
        references=labels,
    )
    f1_result = f1_metric.compute(
        predictions=predicted_classes,
        references=labels,
        average="weighted",
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
torch.cuda.empty_cache()

# Two-class classification model on top of the pretrained checkpoint,
# with label names attached to the class ids, placed on the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label={0: "negativo", 1: "positivo"},
    label2id={"negativo": 0, "positivo": 1},
).to(device)

# Parameter names accepted by the installed TrainingArguments; optional
# settings are added only when present so the cell works across releases.
supported_training_args = set(inspect.signature(TrainingArguments.__init__).parameters)
training_args_kwargs = {
    "output_dir": "../models/transformer-sentiment",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "seed": RANDOM_STATE,
}


def set_if_supported(arg: str, value) -> bool:
    """Record ``arg=value`` in ``training_args_kwargs`` if the argument exists.

    Args:
        arg: Name of a TrainingArguments parameter.
        value: Value to store for that parameter.

    Returns:
        True when ``arg`` is accepted by the installed TrainingArguments (and
        was therefore stored), False otherwise.
    """
    if arg in supported_training_args:
        training_args_kwargs[arg] = value
        return True
    return False


set_if_supported("gradient_accumulation_steps", 2)
set_if_supported("warmup_ratio", 0.1)
set_if_supported("logging_steps", 50)
set_if_supported("report_to", "none")
set_if_supported("fp16", torch.cuda.is_available())
set_if_supported("gradient_checkpointing", True)

# Try the current parameter name first and fall back to the legacy one, so
# releases that accept both do not receive the deprecated spelling.
has_eval_epoch = set_if_supported("eval_strategy", "epoch") or set_if_supported(
    "evaluation_strategy", "epoch"
)
has_save_epoch = set_if_supported("save_strategy", "epoch")
if not has_save_epoch:
    set_if_supported("save_steps", 500)

set_if_supported("load_best_model_at_end", True)
set_if_supported("metric_for_best_model", "f1")
set_if_supported("greater_is_better", True)

# Restoring the best checkpoint is only kept when both evaluation and saving
# could be configured per epoch; otherwise it is switched off.
if training_args_kwargs.get("load_best_model_at_end", False) and not (
    has_eval_epoch and has_save_epoch
):
    training_args_kwargs["load_best_model_at_end"] = False

training_args = TrainingArguments(**training_args_kwargs)

# Arguments every supported Trainer release accepts.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_eval,
}
# Optional arguments are added only if the installed Trainer exposes them.
supported_trainer_args = set(inspect.signature(Trainer.__init__).parameters)
# Hand the tokenizer to the Trainer under whichever name this release exposes:
# `processing_class` on newer releases, `tokenizer` on older ones. Checking
# only the legacy name would silently drop the tokenizer on newer releases.
if "processing_class" in supported_trainer_args:
    trainer_kwargs["processing_class"] = tokenizer
elif "tokenizer" in supported_trainer_args:
    trainer_kwargs["tokenizer"] = tokenizer
if "data_collator" in supported_trainer_args:
    trainer_kwargs["data_collator"] = data_collator
if "compute_metrics" in supported_trainer_args:
    trainer_kwargs["compute_metrics"] = compute_metrics

trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the fine-tuning and keep the returned result object;
# the bare name on the last line shows it as the cell output.
train_result = trainer.train()
train_result

In [None]:
# Evaluate with the trainer and keep the returned metrics;
# the bare name on the last line shows them as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Write the model and the tokenizer to the same directory.
BEST_MODEL_DIR = "../models/transformer-sentiment-best"
trainer.save_model(BEST_MODEL_DIR)
tokenizer.save_pretrained(BEST_MODEL_DIR)

In [None]:
# Class id -> label name used in the prediction output.
label_map = {0: "negativo", 1: "positivo"}


def predict_sentiment(review: str, max_length: int = 256) -> dict:
    """Classify a single review with the notebook's model and tokenizer.

    Args:
        review: Text of the review to classify.
        max_length: Maximum number of tokens; longer inputs are truncated.
            Defaults to 256 (the training tokenization in this notebook
            truncates at 128).

    Returns:
        Dict with the original ``review``, the predicted ``label`` name, and
        the class probabilities ``prob_positive`` and ``prob_negative``.
    """
    # Switch to evaluation mode so train-only layers such as dropout are
    # inactive and the result does not depend on the mode the model was left in.
    model.eval()
    encoded = tokenizer(
        review,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    )
    # Move the inputs to the device the model lives on.
    encoded = {k: v.to(model.device) for k, v in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    pred_label = int(np.argmax(probs))
    return {
        "review": review,
        "label": label_map[pred_label],
        "prob_positive": float(probs[1]),
        "prob_negative": float(probs[0]),
    }


sample_prediction = predict_sentiment("the pacing drags but performances shine")
sample_prediction

## Análisis comparativo
1. Registra `eval_metrics['eval_accuracy']` y `eval_metrics['eval_f1']` tras el fine-tuning.
2. Contrasta esos valores contra la exactitud (accuracy) del MLP en [notebooks/model_train_base.ipynb](notebooks/model_train_base.ipynb) para discutir mejoras en recall/F1 vs. costo computacional.
3. Observa el tiempo por época y el uso de GPU (`nvidia-smi`) para cuantificar el sobrecosto del Transformer.
4. Explora ejemplos donde el MLP falló y comprueba si el Transformer corrige esos casos (usa `predict_sentiment`).  
En general, el Transformer debería capturar dependencias largas y matices léxicos que el TF-IDF+MLP ignora, a cambio de un entrenamiento más lento y mayor consumo de memoria.