In [11]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split


In [12]:

# 1. Cargar el dataset
print("Cargando dataset...")
dataset = pd.read_json("../Datasets/dataset_humor_train_embeddings.json", lines=True)

# 2. Extraer y concatenar los embeddings
print("Procesando embeddings...")
we_ft = np.stack(dataset['we_ft'].values) # (N, 300)
we_mx = np.stack(dataset['we_mx'].values) # (N, 300)
we_es = np.stack(dataset['we_es'].values) # (N, 300)


Cargando dataset...
Procesando embeddings...


In [13]:
from torch.utils.data import DataLoader, TensorDataset
def create_minibatches(X, Y, batch_size):
    # Recibe los documentos en X y las etiquetas en Y
    dataset = TensorDataset(X, Y) # Cargar los datos en un dataset de tensores
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # loader = DataLoader(dataset, batch_size=batch_size)
    return loader

In [14]:
def guardar_resultados(datos, archivo):

    df = pd.DataFrame(datos, columns=['klass'])

    df['id'] = df.index + 1

    df = df[['id', 'klass']]

    df.to_csv(archivo, index=False)

    print(f" {datos} guardado exitosamente!")

In [15]:

X_full = np.concatenate([we_ft, we_mx, we_es], axis=1) 

Y_full = dataset['klass'].to_numpy()

print(f"Dimensión final de entrada: {X_full.shape}")


Dimensión final de entrada: (10400, 900)


In [16]:

# 3. Dividir en Train y Validation
# Usamos stratify para mantener la proporción de humor/no humor
X_tr, X_val, Y_train, Y_val = train_test_split(
    X_full, Y_full, 
    test_size=0.15, 
    random_state=42, 
    stratify=Y_full
)

print(f"Train: {X_tr.shape}, Val: {X_val.shape}")

Train: (8840, 900), Val: (1560, 900)


In [68]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size, output_size, arquitecture):
        super(MLP, self).__init__()
        
        input_size_h1 = arquitecture[0]
        input_size_h2 = arquitecture[1]
        input_size_h3 = arquitecture[2]

        # Capa 1: Recibe muchas características
        self.fc1 = nn.Linear(input_size, input_size_h1)
        self.bn1 = nn.BatchNorm1d(input_size_h1)
        self.act1 = nn.ReLU(0.1)
        self.drop1 = nn.Dropout(0.7)
        
        # Capa 2: Compresión
        self.fc2 = nn.Linear(input_size_h1, input_size_h2)
        self.bn2 = nn.BatchNorm1d(input_size_h2)
        self.act2 = nn.LeakyReLU(0.1)
        self.drop2 = nn.Dropout(0.4)

        # Capa 3: Refinamiento
        self.fc3 = nn.Linear(input_size_h2, input_size_h3)
        self.bn3 = nn.BatchNorm1d(input_size_h3)
        self.act3 = nn.ReLU(0.1)
        self.drop3 = nn.Dropout(0.2)

        # Salida
        self.output = nn.Linear(input_size_h3, output_size)
        
    def forward(self, x):
        x = self.drop1(self.act1(self.bn1(self.fc1(x))))
        x = self.drop2(self.act2(self.bn2(self.fc2(x))))
        x = self.drop3(self.act3(self.bn3(self.fc3(x))))
        x = self.output(x)
        return x

In [18]:
# 1. PREPARAR DATOS

import pandas as pd
import numpy as np

# --- 0. CARGAR EL ARCHIVO DE TEST ---
def test(model):
    print("Cargando dataset de prueba...")
    dataset_test = pd.read_json("../Datasets/dataset_humor_test_embeddings.json", lines=True)

    # --- 1. EXTRACCIÓN Y FUSIÓN DE EMBEDDINGS (CRÍTICO) ---

    print("Procesando vectores...")
    we_ft_test = np.stack(dataset_test['we_ft'].values) # FastText General
    we_mx_test = np.stack(dataset_test['we_mx'].values) # FastText México
    we_es_test = np.stack(dataset_test['we_es'].values) # FastText España


    X_t = np.concatenate([we_ft_test, we_mx_test, we_es_test], axis=1)

    print(f"Dimensiones de Test listas: {X_t.shape}") 


    # Convertir a tensor float
    X_testing = torch.from_numpy(X_t).to(torch.float32)


    device = next(model.parameters()).device 
    X_testing = X_testing.to(device)

    # 2. INFERENCIA
    model.eval() 

    with torch.no_grad(): 
        # Predicción de logits
        y_pred_test_logits = model(X_testing)

    # 3. PROCESAR RESULTADOS
    # Obtener la clase (0 o 1) usando argmax
    y_pred_test_indices = torch.argmax(y_pred_test_logits, dim=1)


    y_pred_final = y_pred_test_indices.cpu().numpy()

    print("Predicciones generadas:")
    print(y_pred_final)


    unique, counts = np.unique(y_pred_final, return_counts=True)
    print("\nConteo de clases predichas:")
    print(dict(zip(unique, counts)))
    return y_pred_final

In [19]:

BATCH_SIZE = 48
LEARNING_RATE = 0.1
EPOCHS = 512 
FACTOR_BALANCE = 1.75

In [69]:
import torch.optim as optim
import torch
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def ejecutar_experimento(arquitecture, batch_size, lr, epochs, factor_balance = 1.75, patience = 20, guardar = True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Preparar Tensores
    X_train_t = torch.from_numpy(X_tr).float()
    Y_train_t = torch.from_numpy(Y_train).long()
    X_val_t = torch.from_numpy(X_val).float()
    Y_val_tensor = torch.from_numpy(Y_val).long().to(device)

    # Parámetros de la red
    input_size = X_tr.shape[1]
    output_size = 2   # 2 clases

    # Inicializar Modelo
    model = MLP(input_size=input_size, output_size=output_size, arquitecture=arquitecture)
    model.to(device)

    classes = np.unique(Y_train)
    weights_calc = compute_class_weight('balanced', classes=classes, y=Y_train)


    weights_calc[1] = weights_calc[1] * factor_balance
    weights = torch.tensor(weights_calc).float().to(device)

    criterion = nn.CrossEntropyLoss(weight=weights)


    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=4)

    # Variables Early Stopping
    best_f1 = 0.0
    epochs_no_improve = 0
    best_model_path = 'mejor_modelo_fusion.pth'

    print("--- Iniciando Entrenamiento Fusión (900 inputs) ---")

    for epoch in range(epochs + 1):
        model.train()
        dataloader = create_minibatches(X_train_t, Y_train_t, batch_size=batch_size)
        train_loss = 0
        
        for X_b, y_b in dataloader:
            X_b, y_b = X_b.to(device), y_b.to(device)
            
            optimizer.zero_grad()
            output = model(X_b)
            loss = criterion(output, y_b)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        # --- VALIDACIÓN ---
        model.eval()
        with torch.no_grad():
            X_v = X_val_t.to(device)
            logits = model(X_v)
            val_loss = criterion(logits, Y_val_tensor).item()
            
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            # Métrica Objetivo
            f1_macro = f1_score(Y_val, preds, average='macro')
            acc = accuracy_score(Y_val, preds)
            
            # Métricas por clase para ver si detectamos el humor
            f1_humor = f1_score(Y_val, preds, pos_label=1, average='binary')

        print(f"Ep {epoch+1} | Val Loss: {val_loss:.4f} | F1 Macro: {f1_macro:.4f} (Humor F1: {f1_humor:.4f})")
        
        # Paso del Scheduler
        scheduler.step(val_loss)
        
        # Early Stopping basado en MAXIMIZAR F1 MACRO
        if f1_macro > best_f1:
            best_f1 = f1_macro
            epochs_no_improve = 0
            torch.save(model.state_dict(), best_model_path)
            print(f"  ★ ¡Nuevo Récord! Modelo guardado.")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early Stopping activado.")
                break

    # Cargar el mejor y probar
    model.load_state_dict(torch.load(best_model_path))
    print(f"\nEntrenamiento finalizado. Mejor F1 Macro: {best_f1:.4f}")

    #post entrenamiento
    X_val_t = torch.from_numpy(X_val).to(torch.float32)


    device = next(model.parameters()).device
    X_val_t = X_val_t.to(device)

    # 2. INFERENCIA
    model.eval() 

    with torch.no_grad(): # Ahorra memoria y cálculo
        # Predicción (Logits)
        y_pred_logits = model(X_val_t)
        
        # Obtener la clase con mayor probabilidad (Argmax)
        y_pred_class_tensor = torch.argmax(y_pred_logits, dim=1)

    # 3. POST-PROCESAMIENTO

    y_pred_val_numpy = y_pred_class_tensor.cpu().numpy()

    #print(y_pred_val_numpy)

    #print("--- Matriz de Confusión ---")
    cm = confusion_matrix(Y_val, y_pred_val_numpy)
    #print(cm)

    print("\n--- Reporte de Clasificación ---")

    print(classification_report(Y_val, y_pred_val_numpy, digits=4, zero_division='warn', target_names=['No Humor', 'Humor']))
    y_pred = test(model)
    if(guardar):
        nombre_archivo = f"../Resultados_Leo/neuronas{arquitecture[0]}_{arquitecture[1]}_{arquitecture[2]}, {lr}_900_Embeddings_{batch_size}_balance_{factor_balance}.csv"
        guardar_resultados(y_pred, nombre_archivo)
    



In [70]:
ejecutar_experimento([256, 128, 32], 48, 0.016, 256, patience=40)

--- Iniciando Entrenamiento Fusión (900 inputs) ---
Ep 1 | Val Loss: 0.4307 | F1 Macro: 0.7607 (Humor F1: 0.7230)
  ★ ¡Nuevo Récord! Modelo guardado.
Ep 2 | Val Loss: 0.4291 | F1 Macro: 0.7660 (Humor F1: 0.7314)
  ★ ¡Nuevo Récord! Modelo guardado.
Ep 3 | Val Loss: 0.4719 | F1 Macro: 0.7866 (Humor F1: 0.7271)
  ★ ¡Nuevo Récord! Modelo guardado.
Ep 4 | Val Loss: 0.4089 | F1 Macro: 0.7640 (Humor F1: 0.7360)
Ep 5 | Val Loss: 0.4248 | F1 Macro: 0.7568 (Humor F1: 0.7251)
Ep 6 | Val Loss: 0.4079 | F1 Macro: 0.7682 (Humor F1: 0.7368)
Ep 7 | Val Loss: 0.3996 | F1 Macro: 0.7463 (Humor F1: 0.7250)
Ep 8 | Val Loss: 0.4285 | F1 Macro: 0.8006 (Humor F1: 0.7566)
  ★ ¡Nuevo Récord! Modelo guardado.
Ep 9 | Val Loss: 0.4066 | F1 Macro: 0.7690 (Humor F1: 0.7337)
Ep 10 | Val Loss: 0.4074 | F1 Macro: 0.7974 (Humor F1: 0.7517)
Ep 11 | Val Loss: 0.4050 | F1 Macro: 0.7948 (Humor F1: 0.7534)
Ep 12 | Val Loss: 0.4081 | F1 Macro: 0.7596 (Humor F1: 0.7293)
Ep 13 | Val Loss: 0.4234 | F1 Macro: 0.7850 (Humor F1: 0.

KeyboardInterrupt: 

In [None]:

X_val_t = torch.from_numpy(X_val).to(torch.float32)


device = next(model.parameters()).device
X_val_t = X_val_t.to(device)

# 2. INFERENCIA
model.eval() 

with torch.no_grad(): # Ahorra memoria y cálculo
    # Predicción (Logits)
    y_pred_logits = model(X_val_t)
    
    # Obtener la clase con mayor probabilidad (Argmax)
    y_pred_class_tensor = torch.argmax(y_pred_logits, dim=1)

# 3. POST-PROCESAMIENTO

y_pred_val_numpy = y_pred_class_tensor.cpu().numpy()

print(y_pred_val_numpy)


[0 0 1 ... 0 0 1]


In [82]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print("--- Matriz de Confusión ---")
cm = confusion_matrix(Y_val, y_pred_val_numpy)
print(cm)

print("\n--- Reporte de Clasificación ---")

print(classification_report(Y_val, y_pred_val_numpy, digits=4, zero_division='warn', target_names=['No Humor', 'Humor']))


--- Matriz de Confusión ---
[[849 139]
 [126 446]]

--- Reporte de Clasificación ---
              precision    recall  f1-score   support

    No Humor     0.8708    0.8593    0.8650       988
       Humor     0.7624    0.7797    0.7710       572

    accuracy                         0.8301      1560
   macro avg     0.8166    0.8195    0.8180      1560
weighted avg     0.8310    0.8301    0.8305      1560



In [83]:
import pandas as pd
dataset = pd.read_json("../Datasets/dataset_humor_test.json", lines=True)
#conteo de clases
print("Total de ejemplos de prueba")
print(dataset.klass.value_counts())
# Extracción de los textos en arreglos de numpy
X_test = dataset['text'].to_numpy()
# Extracción de las etiquetas o clases de entrenamiento
Y_test = dataset['klass'].to_numpy()

Total de ejemplos de prueba
klass
-1    5600
Name: count, dtype: int64


In [85]:
def guardar_resultados(datos, archivo):

    df = pd.DataFrame(datos, columns=['klass'])

    df['id'] = df.index + 1

    df = df[['id', 'klass']]

    df.to_csv(archivo, index=False)

    print(f" {datos} guardado exitosamente!")

 [0 1 0 ... 1 0 1] guardado exitosamente!
