In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FusionBiLSTMAttention(nn.Module):
    """Binary classifier: three fused embeddings -> BiLSTM -> attention pooling -> linear.

    Each token index is looked up in three pretrained embedding tables; the
    three vectors are concatenated (early fusion), fed to a bidirectional
    LSTM, pooled with a learned attention over time steps and projected to a
    single logit.

    Args:
        vocab_size: Unused by the model (the embedding tables take their size
            from the matrices); kept so existing callers keep working.
        hidden_dim: Hidden size of the LSTM per direction.
        n_layers: Number of stacked LSTM layers.
        embedding_matrix_mx: 2-D float tensor of pretrained vectors.
        embedding_matrix_es: 2-D float tensor of pretrained vectors.
        embedding_matrix_neu: 2-D float tensor of pretrained vectors.
    """

    def __init__(self, vocab_size, hidden_dim, n_layers,
                 embedding_matrix_mx, embedding_matrix_es, embedding_matrix_neu):
        super().__init__()

        # 1. Embedding layers initialised from the pretrained matrices.
        #    freeze=False means they are fine-tuned during training.
        self.embed_mx = nn.Embedding.from_pretrained(embedding_matrix_mx, freeze=False)
        self.embed_es = nn.Embedding.from_pretrained(embedding_matrix_es, freeze=False)
        self.embed_neu = nn.Embedding.from_pretrained(embedding_matrix_neu, freeze=False)

        # Fused width is taken from the actual matrices instead of assuming
        # 300 + 300 + 300, so embeddings of any width can be combined.
        fusion_dim = (self.embed_mx.embedding_dim
                      + self.embed_es.embedding_dim
                      + self.embed_neu.embedding_dim)

        # 2. Bidirectional LSTM. nn.LSTM only applies dropout between stacked
        #    layers; with a single layer it has no effect and emits a warning,
        #    so it is disabled in that case.
        lstm_dropout = 0.5 if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(fusion_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=True, batch_first=True, dropout=lstm_dropout)

        # 3. Attention mechanism: one score per time step.
        self.attention_linear = nn.Linear(hidden_dim * 2, 1)

        # 4. Final classifier (single logit).
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def attention(self, lstm_output, mask):
        """Pool the LSTM outputs over time with learned attention weights.

        Args:
            lstm_output: Tensor [batch, seq_len, hidden_dim * 2].
            mask: Optional tensor [batch, seq_len]; positions equal to 0
                (or False) are treated as padding and get zero weight.

        Returns:
            Context tensor [batch, hidden_dim * 2].
        """
        # scores: [batch, seq_len, 1]
        scores = torch.tanh(self.attention_linear(lstm_output))

        if mask is not None:
            # Use the most negative value representable in the score dtype so
            # the fill also works under half precision (a literal -1e9 does
            # not fit in float16).
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask.unsqueeze(-1) == 0, fill_value)

        # Normalise over the sequence dimension.
        attn_weights = F.softmax(scores, dim=1)

        # Context vector: attention-weighted sum over time steps.
        context = torch.sum(attn_weights * lstm_output, dim=1)
        return context

    def forward(self, x, mask=None):
        """Compute one logit per sequence.

        Args:
            x: Integer tensor [batch, seq_len] of word indices.
            mask: Optional tensor [batch, seq_len], 0/False at padding.

        Returns:
            Logits tensor [batch, 1] (no sigmoid applied).
        """
        # Early fusion: look up each table and concatenate on the feature axis.
        v_mx = self.embed_mx(x)
        v_es = self.embed_es(x)
        v_neu = self.embed_neu(x)
        fused_embed = torch.cat((v_mx, v_es, v_neu), dim=2)

        # lstm_out: [batch, seq_len, hidden_dim * 2]
        lstm_out, _ = self.lstm(fused_embed)

        context_vector = self.attention(lstm_out, mask)

        out = self.dropout(context_vector)
        out = self.fc(out)
        return out

In [7]:
import pandas as pd
# Load the training set; lines=True parses the file as JSON Lines (one JSON object per line).
dataset = pd.read_json("../Datasets/dataset_humor_train_embeddings.json", lines=True)

In [8]:
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re

# Limpieza básica (ajusta según tus necesidades)
def tokenizer(text):
    """Lowercase ``text``, mask URLs and user mentions, and split on whitespace.

    Anything starting with ``http`` up to the next whitespace becomes
    ``<URL>`` and every ``@`` followed by word characters becomes ``<USER>``.
    Returns the list of whitespace-separated tokens.
    """
    # Order matters: URLs are masked before mentions.
    substitutions = (
        (r'http\S+', '<URL>'),
        (r'@\w+', '<USER>'),
    )
    cleaned = text.lower()
    for pattern, placeholder in substitutions:
        cleaned = re.sub(pattern, placeholder, cleaned)
    return cleaned.split()

# 1. Build the vocabulary (word -> index mapping)
print("Construyendo vocabulario desde la columna 'text'...")
all_words = []
for text in dataset['text']:
    all_words += tokenizer(text)

# Count every token; no rare-word filtering is applied here.
word_counts = Counter(all_words)
# Most frequent first; the sort is stable, so ties keep first-seen order.
sorted_vocab = sorted(word_counts.keys(), key=lambda token: word_counts[token], reverse=True)

# Index 0 is reserved for padding and 1 for unknown words; real words follow.
word_to_idx = {'<PAD>': 0, '<UNK>': 1}
for idx, word in enumerate(sorted_vocab, start=len(word_to_idx)):
    word_to_idx[word] = idx

vocab_size = len(word_to_idx)
print(f"Tamaño del vocabulario (vocab_size): {vocab_size}")

Construyendo vocabulario desde la columna 'text'...
Tamaño del vocabulario (vocab_size): 30731


In [None]:
import torch.optim as optim
from sklearn.metrics import f1_score
import numpy as np

# --- CONFIGURATION ---
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): we_mx, we_es and we_ft are not defined in the visible cells.
# They are presumably float tensors of shape [vocab_size, 300] built from the
# pretrained vectors and indexed by word_to_idx — confirm before running.

model = FusionBiLSTMAttention(
    vocab_size=vocab_size,       # defined in the vocabulary cell (the model itself ignores it)
    hidden_dim=128,              # hidden units per LSTM direction
    n_layers=2,
    embedding_matrix_mx=we_mx,
    embedding_matrix_es=we_es,
    embedding_matrix_neu=we_ft
).to(DEVICE)

# --- CLASS IMBALANCE HANDLING ---
# Extra weight for the positive class (label 1) in the loss: 6588 / 3812 ~= 1.73.
# NOTE(review): the two counts look like per-class sample totals — confirm against the data.
pos_weight = torch.tensor([6588 / 3812], device=DEVICE)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Adam with a small L2 penalty (weight_decay) on all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-5)

# --- BUCLE DE ENTRENAMIENTO (Simplificado) ---
def _unpack_batch(batch):
    """Return ``(text, labels)`` from an attribute-style, dict-style or 2-item batch."""
    if hasattr(batch, 'text') and hasattr(batch, 'label'):
        return batch.text, batch.label
    if isinstance(batch, dict):
        return batch['text'], batch['label']
    text, labels = batch
    return text, labels


def train_epoch(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and report mean loss and F1.

    Args:
        model: Module called as ``model(text, mask)`` returning logits [batch, 1].
        iterator: Sized iterable of batches. A batch may expose ``.text`` /
            ``.label`` attributes, be a dict with ``'text'`` / ``'label'``
            keys, or be a ``(text, labels)`` pair as produced by a standard
            ``DataLoader``.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss taking ``(logits, float_labels)``.

    Returns:
        Tuple ``(mean loss per batch, F1 over every sample seen in the epoch)``.
    """
    model.train()
    epoch_loss = 0
    all_preds = []
    all_labels = []

    for batch in iterator:
        text, labels = _unpack_batch(batch)
        text, labels = text.to(DEVICE), labels.to(DEVICE)
        # True where there is a real token, False at padding (index 0).
        # The comparison already yields a bool tensor, so no cast is needed.
        mask = text != 0

        optimizer.zero_grad()

        predictions = model(text, mask).squeeze(1)
        loss = criterion(predictions, labels.float())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Turn logits into hard 0/1 predictions through a sigmoid.
        preds_binary = torch.round(torch.sigmoid(predictions)).detach().cpu().numpy()
        all_preds.extend(preds_binary)
        all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds)
    return epoch_loss / len(iterator), f1

# Ejemplo de uso:
# for epoch in range(10):
#     train_loss, train_f1 = train_epoch(model, train_loader, optimizer, criterion)
#     print(f'Epoch {epoch+1}: F1 Score Train = {train_f1:.4f}')
#     # Aquí validarías con tu set de prueba