In [1]:
!pip install gensim



In [2]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import random

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!ls "/content/drive/MyDrive/Colab Notebooks/Sistemas Recomendadores/Proyecto"


 behaviors.tsv		  GoogleNews-vectors-negative300.bin
 FastFormerNRMS_2.ipynb   news.tsv
 FastFormerNRMS.ipynb	 'NRMSFastFormer new batches copy.ipynb'


In [5]:
behavior_path = "/content/drive/MyDrive/Colab Notebooks/Sistemas Recomendadores/Proyecto/behaviors.tsv"
news_path = "/content/drive/MyDrive/Colab Notebooks/Sistemas Recomendadores/Proyecto/news.tsv"

In [6]:
df_behaviors = pd.read_csv(behavior_path, sep="\t", names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
df_news = pd.read_csv(news_path, sep="\t", names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities'])

In [7]:
df_behaviors.shape

(2232748, 5)

In [8]:
df_behaviors["Time"] = pd.to_datetime(df_behaviors["Time"])
cutoff = pd.to_datetime("2019-11-14")

behavior_train = df_behaviors[df_behaviors["Time"] < cutoff].copy()
behavior_val   = df_behaviors[df_behaviors["Time"] >= cutoff].copy()

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Usando dispositivo: {device}')

Usando dispositivo: cuda


In [10]:
def tokenize(text):
    tokens = re.findall(r"[\w']+", text.lower())
    return tokens

In [11]:
longitudes = df_news["Title"].dropna().apply(lambda x: len(x.split()))
cantidad_menor_20 = (longitudes < 20).sum()
total = len(longitudes)

print(f"Títulos con menos de 20 palabras: {cantidad_menor_20} de {total} ({cantidad_menor_20 / total:.2%})")

Títulos con menos de 20 palabras: 100303 de 101527 (98.79%)


In [12]:
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx = 2 # Por <UNK> y <PAD>
news2idx = {}  # Mapeo: news_id -> lista de índices de palabras (padded/trunc)
max_size_title = 20

In [13]:
for _, row in tqdm(df_news.iterrows(), total=df_news.shape[0]):
    news_id = row["NewsID"]
    title = row["Title"]
    tokens = [] if pd.isna(title) else tokenize(title)
    token_idxs = []
    for w in tokens[:max_size_title]:  # truncar título largo
        if w not in word2idx:
            word2idx[w] = idx
            idx += 1
        token_idxs.append(word2idx.get(w, word2idx['<UNK>']))
    # Rellenar con PAD si es más corto que title_max
    if len(token_idxs) < max_size_title:
        token_idxs += [word2idx['<PAD>']] * (max_size_title - len(token_idxs))
    news2idx[news_id] = token_idxs

100%|██████████| 101527/101527 [00:05<00:00, 17648.54it/s]


In [14]:
vocab_size = len(word2idx)
print(f'Vocabulario: {vocab_size} palabras')

Vocabulario: 50587 palabras


In [16]:
behavior_train

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
0,1,U87243,2019-11-10 11:30:54,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,2019-11-12 13:45:29,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,2019-11-13 11:23:03,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,2019-11-12 12:24:09,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
5,6,U521853,2019-11-11 10:47:31,N8668 N29136 N128643 N9740 N9375 N52911 N12090...,N32154-0 N67747-0 N47257-0 N98178-1
...,...,...,...,...,...
2232743,2232744,U316192,2019-11-13 18:50:02,N122359 N37069 N95876 N28787 N73408 N11266 N61321,N113723-0 N123683-1 N5287-0 N76677-0 N53474-0
2232744,2232745,U451238,2019-11-12 08:54:06,N12575 N93816 N71643 N87236 N87236,N18861-0 N20990-0 N43085-0 N7937-1
2232745,2232746,U151246,2019-11-13 12:42:51,N27587 N49668,N39887-1 N22811-0 N110709-1 N1923-0 N24001-1 N...
2232746,2232747,U330725,2019-11-12 13:22:57,N121944 N91510 N42280 N60061 N63032 N125223 N4...,N18947-0 N88808-1 N10012-0 N38902-0 N33078-0 N...


In [17]:
from collections import defaultdict

# TRAIN
sessions_train = defaultdict(list)

for _, row in tqdm(behavior_train.iterrows(), total=behavior_train.shape[0]):
    hist_str = row['History']
    hist_ids = [] if pd.isna(hist_str) else hist_str.split()
    impr = row['ImpressionID']
    imps = row['Impressions'].split() if not pd.isna(row['Impressions']) else []
    for imp in imps:
        parts = imp.split('-')
        if len(parts) == 2:
            news_id, click = parts
            sessions_train[impr].append((hist_ids, news_id, int(click)))

100%|██████████| 1801231/1801231 [03:27<00:00, 8662.85it/s] 


In [19]:
print(f'Total impresiones entrenamiento: {len(sessions_train)}')

Total impresiones entrenamiento: 1801231


In [20]:
# VALIDATION
sessions_val = defaultdict(list)

for _, row in tqdm(behavior_val.iterrows(), total=behavior_val.shape[0]):
    hist_str = row['History']
    hist_ids = [] if pd.isna(hist_str) else hist_str.split()
    impr = row['ImpressionID']
    imps = row['Impressions'].split() if not pd.isna(row['Impressions']) else []
    for imp in imps:
        parts = imp.split('-')
        if len(parts) == 2:
            news_id, click = parts
            sessions_val[impr].append((hist_ids, news_id, int(click)))

100%|██████████| 431517/431517 [00:50<00:00, 8487.24it/s] 


In [21]:
print(f'Total impresiones validación: {len(sessions_val)}')

Total impresiones validación: 431517


In [22]:
class MINDListDataset(Dataset):
    def __init__(self, sessions, news2idx, word2idx, hist_max, title_max):
        self.news2idx  = news2idx
        self.word2idx  = word2idx
        self.hist_max  = hist_max
        self.title_max = title_max

        self.sessions = sessions  # dict[impr] = list[(hist, cand, label)]
        self.impr_ids = list(sessions.keys())

    def __len__(self):
        return len(self.impr_ids)

    def __getitem__(self, idx):
        impr = self.impr_ids[idx]
        triples = self.sessions[impr]

        # Historial (compartido para todos los candidatos)
        hist_ids = triples[0][0][-self.hist_max:]
        hist_seq = [self.news2idx.get(nid, [self.word2idx['<PAD>']] * self.title_max) for nid in hist_ids]
        while len(hist_seq) < self.hist_max:
            hist_seq.insert(0, [self.word2idx['<PAD>']] * self.title_max)

        # Candidatos y etiquetas
        cand_seqs = []
        labels = []
        for _, cand_id, lbl in triples:
            cand_seqs.append(self.news2idx.get(cand_id, [self.word2idx['<PAD>']] * self.title_max))
            labels.append(lbl)

        return (torch.tensor(hist_seq,  dtype=torch.long),        # [H,L]
                torch.tensor(cand_seqs, dtype=torch.long),        # [C,L]
                torch.tensor(labels,   dtype=torch.float),        # [C]
                impr)

In [23]:
def collate_fn_list(batch):
    """
    Devuelve:
        hist_batch  : [B, H, L]
        cand_batch  : [B, C_max, L]
        label_batch : [B, C_max]  (0/1, padded con -1)
        mask_batch  : [B, C_max]  (True donde existe candidato)
        impr_batch  : list[str]
    """
    hist_list, cand_list, label_list, impr_list = zip(*batch)

    # Historial: tamaño fijo
    hist_batch = torch.stack(hist_list)             # [B,H,L]

    # Candidatos: pad al máximo C del batch
    C_max = max(x.size(0) for x in cand_list)
    L     = cand_list[0].size(1)
    pad_val = 0  # token PAD

    cand_pad   = torch.full((len(batch), C_max, L), pad_val, dtype=torch.long)
    label_pad  = torch.full((len(batch), C_max),    -1,      dtype=torch.float)
    mask_pad   = torch.zeros(len(batch), C_max,     dtype=torch.bool)

    for i,(cands, labels) in enumerate(zip(cand_list, label_list)):
        C = cands.size(0)
        cand_pad[i,:C]  = cands
        label_pad[i,:C] = labels
        mask_pad[i,:C]  = 1

    return (hist_batch.to(device),
            cand_pad.to(device),
            label_pad.to(device),
            mask_pad.to(device),
            list(impr_list))


In [24]:
max_hist_title = 50
batch_size = 32

In [25]:
train_dataset = MINDListDataset(sessions_train, news2idx, word2idx, max_hist_title, max_size_title)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_list)

val_dataset = MINDListDataset(sessions_val, news2idx, word2idx, max_hist_title, max_size_title)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_list)

In [26]:
embed_dim = 300
num_heads = 20
lr = 0.001

In [27]:
from gensim.models import KeyedVectors

model_path = "/content/drive/MyDrive/Colab Notebooks/Sistemas Recomendadores/Proyecto/GoogleNews-vectors-negative300.bin"  # descomprime el .gz
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [28]:
embedding_matrix = np.random.normal(scale=0.6, size=(vocab_size, embed_dim))
found = 0

for word, idx in word2idx.items():
    if word in word2vec:
        embedding_matrix[idx] = word2vec[word]
        found += 1

print(f"Palabras encontradas en Word2Vec: {found}/{vocab_size}")
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)

Palabras encontradas en Word2Vec: 27886/50587


In [29]:
class FastformerAttention(nn.Module):
    """
    Atención Fastformer (Atención aditiva global) que reemplaza nn.MultiheadAttention.
    Opera con entradas de forma (L, B, E) o (B, L, E), realizando la proyección Q, K, V
    por separado, obteniendo vectores globales y propagando interacciones por producto
    elemento a elemento, según Fastformer (Fastformer: Additive Attention Can Be All You Need).
    """
    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super(FastformerAttention, self).__init__()
        assert embed_dim % num_heads == 0, "embed_dim debe ser divisible por num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # Proyecciones lineales para Q, K, V (similar a multi-cabeza estándar)
        self.W_q = nn.Linear(embed_dim, embed_dim, bias=True)
        self.W_k = nn.Linear(embed_dim, embed_dim, bias=True)
        self.W_v = nn.Linear(embed_dim, embed_dim, bias=True)
        # Parámetros de atención aditiva por cabeza (vectores de peso para Q y K)
        # Formato (num_heads, head_dim) para aplicar dot-product con cada vector de dimensión head_dim
        self.attn_wq = nn.Parameter(torch.Tensor(num_heads, self.head_dim))
        self.attn_wk = nn.Parameter(torch.Tensor(num_heads, self.head_dim))
        # Capa de salida tras concatenar cabezas
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.dropout = nn.Dropout(dropout)
        # Inicialización
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.xavier_uniform_(self.W_k.weight)
        nn.init.xavier_uniform_(self.W_v.weight)
        nn.init.xavier_uniform_(self.out_proj.weight)
        nn.init.zeros_(self.W_q.bias)
        nn.init.zeros_(self.W_k.bias)
        nn.init.zeros_(self.W_v.bias)
        nn.init.zeros_(self.out_proj.bias)
        nn.init.xavier_uniform_(self.attn_wq)
        nn.init.xavier_uniform_(self.attn_wk)

    def forward(self, query, key, value):
        """
        query, key, value: tensores de forma (L, B, E) ó (S, N, E) donde
        L=longitud de secuencia, B=batch, E=embed_dim.
        Fastformer es simétrico en q=k=v, pero aceptamos tres argumentos para compatibilidad.
        """
        # Permutar para batch-first: [B, L, E]
        transpose = False
        if query.dim() == 3 and query.shape[0] != query.shape[1]:
            # Asumimos forma (L, B, E)
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)
            transpose = True
        # Proyectar Q, K, V
        # Ahora shapes: [B, L, E]
        Q = self.W_q(query)   # [B, L, E]
        K = self.W_k(key)     # [B, L, E]
        V = self.W_v(value)   # [B, L, E]
        B, L, E = Q.size()
        H = self.num_heads
        D = self.head_dim
        # Dividir en cabezas: [B, L, H, D]
        Q = Q.view(B, L, H, D)
        K = K.view(B, L, H, D)
        V = V.view(B, L, H, D)
        # Reordenar para [B, H, L, D]
        Q = Q.permute(0, 2, 1, 3)
        K = K.permute(0, 2, 1, 3)
        V = V.permute(0, 2, 1, 3)
        # =========== Fastformer Steps ===========
        # 1) Atención aditiva sobre Q para obtener q_global [B, H, D]
        # Calculamos puntuaciones: sum_{j}( w_q[h,j] * Q[...,j] )
        # w_q: [H, D], Q: [B, H, L, D]
        # Producto elemento a elemento y sumar sobre dimensión D:
        # scores_q: [B, H, L]
        scores_q = (Q * self.attn_wq.unsqueeze(0).unsqueeze(2)).sum(dim=-1)  # [B, H, L]
        alpha = torch.softmax(scores_q, dim=-1)  # [B, H, L]
        # Obtener vector q_global: suma ponderada de Q sobre L
        # alpha: [B, H, L], Q: [B, H, L, D] -> q_global: [B, H, D]
        q_global = torch.einsum('bhl,bhld->bhd', alpha, Q)
        # 2) Interactuar q_global con cada K por producto elemento a elemento -> K'
        # Extendemos q_global para cada posición L: [B, H, 1, D] * [B, H, L, D] -> [B, H, L, D]
        K_prime = q_global.unsqueeze(2) * K  # [B, H, L, D]
        # 3) Atención aditiva sobre K_prime para obtener k_global [B, H, D]
        scores_k = (K_prime * self.attn_wk.unsqueeze(0).unsqueeze(2)).sum(dim=-1)  # [B, H, L]
        beta = torch.softmax(scores_k, dim=-1)  # [B, H, L]
        k_global = torch.einsum('bhl,bhld->bhd', beta, K_prime)  # [B, H, D]
        # 4) Interactuar k_global con cada V -> V'
        V_prime = k_global.unsqueeze(2) * V  # [B, H, L, D]
        # Rearmar V' combinando cabezas: [B, H, L, D] -> [B, L, H*D]
        V_prime = V_prime.permute(0, 2, 1, 3).contiguous().view(B, L, H * D)  # [B, L, E]
        # Capa lineal de salida y agregar Q (residuo)
        out = self.out_proj(V_prime)  # [B, L, E]
        # Capa residual: sumamos la proyección original de Q antes de dividir cabezas
        # Primero reconstruir Q original (bidimensional por cada posición)
        Q_orig = Q.permute(0, 2, 1, 3).contiguous().view(B, L, H * D)  # [B, L, E]
        out = out + Q_orig
        # Opcional: aplicar dropout
        out = self.dropout(out)
        # Devolver en forma (L, B, E)
        if transpose:
            out = out.transpose(0, 1).contiguous()
        return out

In [30]:
class NewsEncoder(nn.Module):
    """
    Codificador de noticias: procesa títulos de noticias (secuencias de tokens)
    y produce vectores de noticia. Reemplaza la atención multi-cabeza por FastformerAttention.
    """
    def __init__(self, vocab_size, embed_dim, num_heads, title_max, pretrained_emb=None):
        super(NewsEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.title_max = title_max
        # Capa de embedding de palabras
        self.word_embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        if pretrained_emb is not None:
            self.word_embedding.weight.data.copy_(pretrained_emb)
            self.word_embedding.weight.requires_grad = True  # o False si no quieres fine-tune

        # Capa convolucional 1D para extraer características locales de palabras (opcional, similar a arquitectura original)
        # Usamos múltiples filtros 1xD para captar n-gramas de tamaño 3 por ejemplo
        self.conv = nn.Conv1d(in_channels=embed_dim, out_channels=embed_dim, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        # Atención Fastformer sobre la secuencia de características de palabras
        self.self_attn = FastformerAttention(embed_dim, num_heads, dropout=0.1)
        # Atención aditiva para agregar las palabras importantes en el título
        self.attn_vector = nn.Linear(embed_dim, 1)  # para puntuación de cada palabra
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """
        x: tensor de tokens de noticias con forma [B, title_max].
        Devuelve: vectores de noticias de forma [B, embed_dim].
        """
        # Embedding y extracción de características locales
        # Palabras: [B, title_max, E] -> conv espera [B, E, title_max]
        emb = self.word_embedding(x)            # [B, L, E]
        emb = emb.transpose(1, 2)               # [B, E, L]
        conv_out = self.relu(self.conv(emb))    # [B, E, L]
        conv_out = conv_out.transpose(1, 2)     # [B, L, E]
        # Atención Fastformer (auto-atención) entre las posiciones de palabras
        # FastformerAttention espera (L, B, E) o (B, L, E); adaptamos:
        conv_out_trans = conv_out.transpose(0, 1).contiguous()  # [L, B, E]
        attn_out = self.self_attn(conv_out_trans, conv_out_trans, conv_out_trans)  # [L, B, E]
        attn_out = attn_out.transpose(0, 1)  # [B, L, E]
        # Atención aditiva para obtener vector final de noticia
        # Calcular puntuación de importancia para cada palabra
        scores = self.attn_vector(attn_out).squeeze(-1)  # [B, L]
        weights = self.softmax(scores)                    # [B, L]
        news_vector = torch.bmm(weights.unsqueeze(1), attn_out).squeeze(1)  # [B, E]
        return news_vector  # [B, embed_dim]

class UserEncoder(nn.Module):
    """
    Codificador de usuario: agrega vectores de noticias historiales usando Fastformer.
    Toma un historial de noticias y devuelve un vector de usuario.
    """
    def __init__(self, news_encoder, embed_dim, num_heads, hist_max):
        super(UserEncoder, self).__init__()
        self.news_encoder = news_encoder  # instancia de NewsEncoder para codificar cada noticia
        self.hist_max = hist_max
        # Atención Fastformer sobre la secuencia de vectores de noticia del historial
        self.self_attn = FastformerAttention(embed_dim, num_heads, dropout=0.1)
        # Atención aditiva para agregar las noticias más relevantes del historial
        self.attn_vector = nn.Linear(embed_dim, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, hist_x):
        """
        hist_x: tensor de tokens de noticias de historial con forma [B, hist_max, title_max].
        Devuelve: vector de usuario de forma [B, embed_dim].
        """
        B, H, L = hist_x.size()
        # Codificar cada noticia en el historial
        hist_x_flat = hist_x.view(B * H, L)                  # [B*H, title_max]
        news_vectors = self.news_encoder(hist_x_flat)        # [B*H, embed_dim]
        news_vectors = news_vectors.view(B, H, -1)           # [B, hist_max, embed_dim]
        # Atención Fastformer sobre las noticias del historial
        nv_trans = news_vectors.transpose(0, 1).contiguous() # [H, B, E]
        attn_out = self.self_attn(nv_trans, nv_trans, nv_trans)  # [H, B, E]
        attn_out = attn_out.transpose(0, 1)                  # [B, H, E]
        # Atención aditiva para agregar vectores de noticias importantes
        scores = self.attn_vector(attn_out).squeeze(-1)      # [B, H]
        weights = self.softmax(scores)                       # [B, H]
        user_vector = torch.bmm(weights.unsqueeze(1), attn_out).squeeze(1)  # [B, E]
        return user_vector  # [B, embed_dim]

In [31]:
class FastformerNRMS(nn.Module):
    """
    Modelo NRMS modificado con Fastformer.
    Mantiene la misma interfaz: forward(hist_tensor, cand_tensor).
    hist_tensor: [B, hist_max, title_max], cand_tensor: [B, cand_count, title_max].
    """
    def __init__(self, vocab_size, embed_dim, num_heads, title_max, hist_max, pretrained_emb=None):
        super(FastformerNRMS, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.title_max = title_max
        self.hist_max = hist_max
        # News encoder y user encoder con Fastformer
        self.news_encoder = NewsEncoder(vocab_size, embed_dim, num_heads, title_max, pretrained_emb)
        self.user_encoder = UserEncoder(self.news_encoder, embed_dim, num_heads, hist_max)
        # (Opcional) proyección final o dropout
        self.dropout = nn.Dropout(0.1)

    def forward(self, hist_tensor, cand_tensor, mask=None):
        """
        hist_tensor : [B, hist_max, title_max]
        cand_tensor : [B, C,       title_max]
        mask        : [B, C]  (bool) – True donde el candidato existe
        """
        B, H, L = hist_tensor.size()
        _, C, _ = cand_tensor.size()
        # Codificar usuario
        user_vector = self.user_encoder(hist_tensor)            # [B, E]
        # Codificar candidatos (aplicar NewsEncoder a cada candidato)
        cand_flat = cand_tensor.view(B * C, L)                  # [B*C, title_max]
        cand_vecs = self.news_encoder(cand_flat)               # [B*C, E]
        cand_vecs = cand_vecs.view(B, C, -1)                   # [B, cand_count, E]
        # Calcular similaridad (producto punto usuario con cada candidato)
        # Expandir user_vector para combinar con candidatos

        cand_vecs = self.dropout(cand_vecs)
        user_vector = self.dropout(user_vector)

        user_exp = user_vector.unsqueeze(1)                    # [B, 1, E]
        logits = torch.sum(cand_vecs * user_exp, dim=-1)       # [B, cand_count]

        if mask is not None:
            logits = logits.masked_fill(~mask, -1e9)             # -∞ donde no hay candidato

        return logits                                            # [B,C]

In [32]:
model = FastformerNRMS(vocab_size, embed_dim, num_heads, max_size_title, max_hist_title,
             pretrained_emb=embedding_matrix.to(device) if embedding_matrix is not None else None)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [33]:
from sklearn.metrics import roc_auc_score

def ndcg_score(labels, scores, k=5):
    order = np.argsort(scores)[::-1]
    labels = np.array(labels)
    dcg = 0.0
    for i in range(min(k, len(labels))):
        rel = labels[order[i]]
        dcg += (2**rel - 1) / np.log2(i+2)
    ideal = np.sort(labels)[::-1]
    idcg = 0.0
    for i in range(min(k, int(np.sum(labels)))):
        idcg += 1.0 / np.log2(i+2)
    return dcg / idcg if idcg > 0 else 0.0

def mrr_score(labels, scores):
    order = np.argsort(scores)[::-1]
    labels = np.array(labels)[order]
    for rank, label in enumerate(labels, start=1):
        if label == 1:
            return 1.0 / rank
    return 0.0

In [34]:
epochs = 1

In [35]:
best_ndcg5 = 0.0
best_model_state = None

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0.0
    for hist_batch, cand_batch, label_batch, mask_batch, _ in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}"):
        optimizer.zero_grad()
        logits = model(hist_batch, cand_batch, mask_batch)
        target = label_batch.argmax(dim=1)        # [B]
        loss = criterion(logits, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} - Pérdida promedio: {avg_loss:.4f}")

    # Evaluación en validación
    if val_loader is None:
        continue

    model.eval()
    ndcg5_list, ndcg10_list, mrr_list, auc_list = [], [], [], []

    with torch.no_grad():
        for hist_batch, cand_batch, label_batch, mask_batch, impr_batch in val_loader:
            logits  = model(hist_batch, cand_batch, mask_batch)  # [B,C]
            scores  = logits.cpu().numpy()
            labels  = label_batch.cpu().numpy()
            masks   = mask_batch.cpu().numpy()

            for s, l, m in zip(scores, labels, masks):
                # recortar a candidatos reales
                s = s[m]        # (C_real,)
                l = l[m]        # (C_real,)

                ndcg5_list .append(ndcg_score(l, s, k=5))
                ndcg10_list.append(ndcg_score(l, s, k=10))
                mrr_list  .append(mrr_score(l, s))
                if l.max() > 0 and l.min() == 0:     # al menos 1 pos y 1 neg
                    auc_list.append(roc_auc_score(l, s))

    ndcg5  = np.mean(ndcg5_list)
    ndcg10 = np.mean(ndcg10_list)
    mrr    = np.mean(mrr_list)
    auc    = np.mean(auc_list) if auc_list else 0.0

    if ndcg5 > best_ndcg5:
        best_ndcg5 = ndcg5
        best_model_state = model.state_dict()
        print(f"» Nuevo mejor modelo guardado (nDCG@5 = {ndcg5:.4f})")

    print(f"Validación – AUC: {auc:.4f} | MRR: {mrr:.4f} | "
          f"nDCG@5: {ndcg5:.4f} | nDCG@10: {ndcg10:.4f}")

if best_model_state is not None:
    torch.save(best_model_state, "nrms_fastformer_best_word2vec.pt")
    print("Modelo con mejor nDCG@5 guardado en nrms_fastformer_best_word2vec.pt")

Epoch 1/1: 100%|██████████| 56289/56289 [3:57:17<00:00,  3.95it/s]


Epoch 1 - Pérdida promedio: 924497.4263
» Nuevo mejor modelo guardado (nDCG@5 = 0.3269)
Validación – AUC: 0.6642 | MRR: 0.3506 | nDCG@5: 0.3269 | nDCG@10: 0.3882
Modelo con mejor nDCG@5 guardado en nrms_fastformer_best_word2vec.pt
