In [None]:
#!pip install pysentimiento transformers torch scikit-learn pandas numpy 
# Install into a fresh environment to avoid dependency conflicts.
# On Colab, uncomment the line above and run it as-is.

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel  # AutoModel es mejor para RoBERTa
from torch.optim import AdamW # Optimizador nativo
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pysentimiento.preprocessing import preprocess_tweet 

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU plus every CUDA device), and switches cuDNN to its
    deterministic mode.

    Args:
        seed: Value handed to each generator.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels (may lower throughput).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
# ==========================================
# 1. MODELO: RoBERTuito con Mean-Max Pooling
# ==========================================
class HumorRoBERTuito_MM(nn.Module):
    """Transformer encoder with a mean-max pooling head for 2-class output.

    The encoder's last hidden state is pooled twice over the sequence axis
    (masked mean and masked max); both vectors are concatenated and passed
    through an MLP that ends in 2 logits.

    Args:
        freeze_base: When True, every encoder parameter gets
            ``requires_grad = False`` so only the classifier head trains.
        model_name: Checkpoint identifier passed to
            ``AutoModel.from_pretrained``. Defaults to the RoBERTuito
            checkpoint this notebook uses.
    """

    def __init__(self, freeze_base=False, model_name='pysentimiento/robertuito-base-cased'):
        super().__init__()

        print("Cargando RoBERTuito (Base)...")
        self.roberta = AutoModel.from_pretrained(model_name)

        if freeze_base:
            for param in self.roberta.parameters():
                param.requires_grad = False

        # Mean and max pooled vectors are concatenated, hence twice the
        # encoder width (768 * 2 = 1536 for the default base checkpoint).
        self.input_dim = self.roberta.config.hidden_size * 2

        self.classifier = nn.Sequential(
            nn.Linear(self.input_dim, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 2)
        )

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder and reused for
                pooling; positions equal to 0 are excluded from both the
                mean and the max.

        Returns:
            Output of the classifier head applied to the concatenated
            ``[mean_pooled, max_pooled]`` vector of each sequence.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # (batch, sequence, hidden)
        hidden_state = outputs.last_hidden_state

        # Broadcast the mask over the hidden dimension.
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()

        # Mean pooling over unmasked positions; the clamp avoids a division
        # by zero if a row had no unmasked position at all.
        sum_embeddings = torch.sum(hidden_state * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_pooled = sum_embeddings / sum_mask

        # Max pooling: masked positions are pushed to the lowest value the
        # dtype can hold so they never win the max. masked_fill is
        # out-of-place, so the encoder output (part of the autograd graph and
        # of `outputs`) is left untouched.
        lowest = torch.finfo(hidden_state.dtype).min
        masked_hidden = hidden_state.masked_fill(mask_expanded == 0, lowest)
        max_pooled = torch.max(masked_hidden, 1)[0]

        concat_vector = torch.cat((mean_pooled, max_pooled), 1)

        logits = self.classifier(concat_vector)

        return logits

In [None]:
# ==========================================
# 2. DATASET (Con Preprocessing de Pysentimiento)
# ==========================================
class HumorTextDataset(Dataset):
    """Map-style dataset that preprocesses and tokenizes one text per item.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with the keyword
            arguments used in ``__getitem__`` and must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the preprocessed text, its encoding and its label.

        Returns:
            dict with keys ``text`` (preprocessed string), ``input_ids`` and
            ``attention_mask`` (1-D tensors) and ``labels`` (long tensor).
        """
        # pysentimiento's tweet preprocessing, applied with lang="es";
        # str() guards against non-string entries.
        text = preprocess_tweet(str(self.texts[item]), lang="es")

        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which is also what the inference cell uses.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten it away so the DataLoader can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# ==========================================
# 3. PREPARACIÓN Y ENTRENAMIENTO
# ==========================================

# Hyperparameters
MAX_LEN = 100
BATCH_SIZE = 16  # Author's note: RoBERTuito plus the custom head is memory hungry; 16 is a safe choice
EPOCHS = 4
LR = 2e-5

# Load the training data (JSON Lines); make sure the path is correct
print("Leyendo datos...")
df = pd.read_json("./Datasets/dataset_humor_train.json", lines=True)

# Extract texts and labels as NumPy arrays
textos = df['text'].values
etiquetas = df['klass'].values

# Stratified 85/15 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    textos, etiquetas, test_size=0.15, stratify=etiquetas, random_state=42
)
# Tokenizer matching the encoder checkpoint
tokenizer = AutoTokenizer.from_pretrained('pysentimiento/robertuito-base-cased')

# Datasets
train_dataset = HumorTextDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = HumorTextDataset(X_val, y_val, tokenizer, MAX_LEN)

# Dataloaders
# The classifier head contains BatchNorm1d layers, which raise in training
# mode when a batch holds a single sample. Drop the trailing batch only when
# it would contain exactly one example; otherwise keep every sample.
drop_single_sample_batch = len(train_dataset) % BATCH_SIZE == 1
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=drop_single_sample_batch,
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)



In [None]:

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Instantiate the model with a trainable encoder and move it to the device
model = HumorRoBERTuito_MM(freeze_base=False)
model = model.to(device)

# Optimizer and scheduler (no warmup); the scheduler is stepped once per batch
optimizer = AdamW(model.parameters(), lr=LR)  # Author's note: correct_bias=False is not needed here
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss()

# Start below any attainable F1 (F1 >= 0) so the first epoch always writes a
# checkpoint; otherwise a run stuck at F1 == 0.0 would never save and the
# final load would fail or pick up a stale file from a previous run.
best_f1 = -1.0
best_model_path = 'mejor_modelo_fusion.pth'

# --- TRAINING LOOP ---
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    # Training pass
    model.train()
    losses = []

    for step, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    print(f"Train loss: {np.mean(losses)}")

    # Validation pass
    model.eval()
    predictions = []
    real_values = []
    val_losses = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = loss_fn(outputs, targets)
            val_losses.append(loss.item())

            # Predicted class = index of the largest logit
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            real_values.extend(targets.cpu().tolist())

    val_f1 = f1_score(real_values, predictions, average='macro')  # or 'weighted' / 'binary' as needed
    print(f"Val Loss: {np.mean(val_losses):.4f}")
    print(f"Val F1 Macro: {val_f1:.4f}")

    # Keep only the checkpoint with the best validation macro-F1
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print("¡Nuevo Récord! Modelo guardado.")

# Restore the best checkpoint, but only if this run actually wrote one
# (best_f1 stays negative when no epoch ran). map_location keeps the load
# working when the weights were saved from a different device.
if best_f1 >= 0.0:
    model.load_state_dict(torch.load(best_model_path, map_location=device))
print(f"\nEntrenamiento finalizado. Mejor F1 Macro: {best_f1:.4f}")

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from pysentimiento.preprocessing import preprocess_tweet # <--- IMPORTANTE

# --- 0. LOAD THE TEST FILE ---
print("Cargando dataset de prueba...")
dataset_test = pd.read_json("./Datasets/dataset_humor_test.json", lines=True)

# Raw texts
raw_texts = dataset_test['text'].tolist()

# --- 1. PREPROCESSING AND TOKENIZATION ---
print("Preprocesando y tokenizando textos...")

# A) Apply the same preprocessing as the training dataset, including the
#    str() coercion, so non-string entries do not break preprocess_tweet.
textos_test = [preprocess_tweet(str(t), lang="es") for t in raw_texts]

# B) Tokenize
tokens_test = tokenizer(
    textos_test,
    add_special_tokens=True,
    max_length=MAX_LEN,      # Same value as in training
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

input_ids_test = tokens_test['input_ids']
attention_masks_test = tokens_test['attention_mask']

# --- 2. BUILD THE DATALOADER ---
# shuffle=False keeps predictions aligned with the row order of the test file
dataset_torch_test = TensorDataset(input_ids_test, attention_masks_test)
dataloader_test = DataLoader(dataset_torch_test, batch_size=32, shuffle=False)

# --- 3. INFERENCE ---
print("Iniciando inferencia...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
model.eval()

predictions_list = []

with torch.no_grad():
    for batch in dataloader_test:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        logits = model(b_input_ids, attention_mask=b_input_mask)

        # Predicted class index (0 or 1)
        batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions_list.extend(batch_preds)

y_pred_final = np.array(predictions_list)

print("Predicciones generadas.")
unique, counts = np.unique(y_pred_final, return_counts=True)
# Cast to plain ints so the dict prints as {0: n, 1: m} on every NumPy version
print("Conteo de clases predichas:", {int(k): int(v) for k, v in zip(unique, counts)})


In [None]:
def guardar_resultados(datos, archivo):
    """Write predictions to a CSV file with ``id`` and ``klass`` columns.

    Args:
        datos: Sequence of predicted labels; becomes the ``klass`` column.
        archivo: Destination path handed to ``DataFrame.to_csv``.

    The ``id`` column is the 1-based position of each prediction. A
    confirmation message is printed once the file has been written.
    """
    predicciones = pd.DataFrame(datos, columns=['klass'])
    # Ids come from the default row index, shifted to start at 1.
    salida = predicciones.assign(id=predicciones.index + 1)[['id', 'klass']]

    salida.to_csv(archivo, index=False)
    print(f"Archivo guardado exitosamente: {archivo}")

In [None]:
# Output file name records the learning rate and epoch count of this run
nombre_archivo = f"robertuito_predicciones_LR{LR}_EPOCHS{EPOCHS}.csv"
guardar_resultados(datos=y_pred_final, archivo=nombre_archivo)