In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
df_behaviors = pd.read_csv("behaviors.tsv", sep="\t", names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
df_news = pd.read_csv("news.tsv", sep="\t", names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities'])

In [3]:
df_behaviors.shape

(156965, 5)

In [4]:
df_behaviors["Time"] = pd.to_datetime(df_behaviors["Time"])
cutoff = pd.to_datetime("2019-11-14")

behavior_train = df_behaviors[df_behaviors["Time"] < cutoff].copy()
behavior_val   = df_behaviors[df_behaviors["Time"] >= cutoff].copy()

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Usando dispositivo: {device}')

Usando dispositivo: cuda


In [6]:
def tokenize(text):
    tokens = re.findall(r"[\w']+", text.lower())
    return tokens

In [7]:
longitudes = df_news["Title"].dropna().apply(lambda x: len(x.split()))
cantidad_menor_20 = (longitudes < 20).sum()
total = len(longitudes)

print(f"Títulos con menos de 20 palabras: {cantidad_menor_20} de {total} ({cantidad_menor_20 / total:.2%})")

Títulos con menos de 20 palabras: 50633 de 51282 (98.73%)


In [8]:
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx = 2 # Por <UNK> y <PAD>
news2idx = {}  # Mapeo: news_id -> lista de índices de palabras (padded/trunc)
max_size_title = 20

In [9]:
for _, row in tqdm(df_news.iterrows(), total=df_news.shape[0]):
    news_id = row["NewsID"]
    title = row["Title"]
    tokens = [] if pd.isna(title) else tokenize(title)
    token_idxs = []
    for w in tokens[:max_size_title]:  # truncar título largo
        if w not in word2idx:
            word2idx[w] = idx
            idx += 1
        token_idxs.append(word2idx.get(w, word2idx['<UNK>']))
    # Rellenar con PAD si es más corto que title_max
    if len(token_idxs) < max_size_title:
        token_idxs += [word2idx['<PAD>']] * (max_size_title - len(token_idxs))
    news2idx[news_id] = token_idxs

100%|██████████| 51282/51282 [00:03<00:00, 17020.89it/s]


In [10]:
vocab_size = len(word2idx)
print(f'Vocabulario: {vocab_size} palabras')

Vocabulario: 37272 palabras


In [11]:
data = []
for _, row in tqdm(behavior_train.iterrows(), total=behavior_train.shape[0]):
    hist_str = row['History']
    hist_ids = [] if pd.isna(hist_str) else [nid for nid in hist_str.split() if nid]
    impr = row['ImpressionID']
    imps = [] if pd.isna(row['Impressions']) else row['Impressions'].split()
    for imp in imps:
        if len(imp) == 0:
            continue
        parts = imp.split('-')
        if len(parts) != 2:
            continue
        news_id, click = parts[0], parts[1]
        label = int(click)
        data.append((impr, hist_ids, news_id, label))

100%|██████████| 126695/126695 [00:18<00:00, 6773.08it/s] 


In [12]:
data = data[:460000]

In [13]:
print(f'Total de ejemplos de interacción: {len(data)}')

Total de ejemplos de interacción: 460000


In [14]:
val_data = []

for _, row in tqdm(behavior_val.iterrows(), total=behavior_val.shape[0]):
    hist_str = row['History']
    hist_ids = [] if pd.isna(hist_str) else [nid for nid in hist_str.split() if nid]
    impr = row['ImpressionID']
    imps = row['Impressions'].split()
    for imp in imps:
        if len(imp) == 0:
            continue
        parts = imp.split('-')
        if len(parts) != 2:
            continue
        news_id, click = parts[0], parts[1]
        val_data.append((impr, hist_ids, news_id, int(click)))

100%|██████████| 30270/30270 [00:04<00:00, 6078.06it/s]


In [15]:
val_data = val_data[:120000]

In [16]:
print(f'Total ejemplos validación: {len(val_data)}')

Total ejemplos validación: 120000


In [17]:
class MINDDataset(Dataset):
    def __init__(self, interactions, news2idx, word2idx, hist_max, title_max):
        self.interactions = interactions
        self.news2idx = news2idx
        self.word2idx = word2idx
        self.hist_max = hist_max
        self.title_max = title_max
    def __len__(self):
        return len(self.interactions)
    def __getitem__(self, idx):
        impr, hist_ids, cand_id, label = self.interactions[idx]
        # Truncar o pad historial
        if len(hist_ids) > self.hist_max:
            hist_ids = hist_ids[-self.hist_max:]
        hist_seq = []
        for nid in hist_ids:
            seq = self.news2idx.get(nid, [self.word2idx['<PAD>']] * self.title_max)
            hist_seq.append(seq)
        if len(hist_seq) < self.hist_max:
            pad_seq = [self.word2idx['<PAD>']] * self.title_max
            for _ in range(self.hist_max - len(hist_seq)):
                hist_seq.insert(0, pad_seq)
        # Noticia candidata
        cand_seq = self.news2idx.get(cand_id, [self.word2idx['<PAD>']] * self.title_max)
        # Convertir a tensores
        hist_tensor = torch.tensor(hist_seq, dtype=torch.long)
        cand_tensor = torch.tensor(cand_seq, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.float)
        return hist_tensor, cand_tensor, label_tensor, impr

In [18]:
def collate_fn(batch):
    hist_list, cand_list, label_list, impr_list = zip(*batch)
    hist_batch = torch.stack(hist_list)        # (batch, hist_max, title_max)
    cand_batch = torch.stack(cand_list)        # (batch, title_max)
    label_batch = torch.stack(label_list).view(-1,1)
    impr_batch = list(impr_list)
    return hist_batch.to(device), cand_batch.to(device), label_batch.to(device), impr_batch

In [19]:
max_hist_title = 50
batch_size = 128

In [20]:
train_dataset = MINDDataset(data, news2idx, word2idx, max_hist_title, max_size_title)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

val_dataset = MINDDataset(val_data, news2idx, word2idx, max_hist_title, max_size_title)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [21]:
embedding_matrix = None

Fastformer

In [36]:
class AttentionPooling(nn.Module):
    def __init__(self, hidden_size):
        super(AttentionPooling, self).__init__()
        self.att_fc1 = nn.Linear(hidden_size, hidden_size)
        self.att_fc2 = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(self, x, attn_mask=None):
        bz = x.shape[0]
        e = self.att_fc1(x)
        e = self.tanh(e)
        alpha = self.att_fc2(e)
        alpha = torch.exp(alpha)
        if attn_mask is not None:
            alpha = alpha * attn_mask.unsqueeze(2)
        alpha = alpha / (torch.sum(alpha, dim=1, keepdim=True) + 1e-8)
        x = torch.bmm(x.permute(0, 2, 1), alpha)
        x = torch.reshape(x, (bz, -1))
        return x

class FastSelfAttention(nn.Module):
  def __init__(self, hidden_size, num_attention_heads):
      super(FastSelfAttention, self).__init__()

      if hidden_size % num_attention_heads != 0:
          raise ValueError(
              "The hidden size (%d) is not a multiple of the number of attention "
              "heads (%d)" %
              (hidden_size, num_attention_heads))
      self.attention_head_size = int(hidden_size /num_attention_heads)
      self.num_attention_heads = num_attention_heads
      self.all_head_size = self.num_attention_heads * self.attention_head_size
      self.input_dim= hidden_size

      self.query = nn.Linear(self.input_dim, self.all_head_size)
      self.query_att = nn.Linear(self.all_head_size, self.num_attention_heads)
      self.key = nn.Linear(self.input_dim, self.all_head_size)
      self.key_att = nn.Linear(self.all_head_size, self.num_attention_heads)
      self.transform = nn.Linear(self.all_head_size, self.all_head_size)

      self.softmax = nn.Softmax(dim=-1)

  def transpose_for_scores(self, x):
      new_x_shape = x.size()[:-1] + (self.num_attention_heads,
                                      self.attention_head_size)
      x = x.view(*new_x_shape)
      return x.permute(0, 2, 1, 3)

  def forward(self, hidden_states, attention_mask):
      # batch_size, seq_len, num_head * head_dim, batch_size, seq_len
      batch_size, seq_len, _ = hidden_states.shape
      mixed_query_layer = self.query(hidden_states)
      mixed_key_layer = self.key(hidden_states)
      # batch_size, num_head, seq_len
      query_for_score = self.query_att(mixed_query_layer).transpose(1, 2) / self.attention_head_size**0.5
      # add attention mask
      if attention_mask.dim() == 2:
        attention_mask = attention_mask.unsqueeze(1).repeat(1, self.num_attention_heads, 1)
      query_for_score += attention_mask

      # batch_size, num_head, 1, seq_len
      query_weight = self.softmax(query_for_score).unsqueeze(2)

      # batch_size, num_head, seq_len, head_dim
      query_layer = self.transpose_for_scores(mixed_query_layer)

      # batch_size, num_head, head_dim, 1
      pooled_query = torch.matmul(query_weight, query_layer).transpose(1, 2).view(-1,1,self.num_attention_heads*self.attention_head_size)
      pooled_query_repeat= pooled_query.repeat(1, seq_len,1)
      # batch_size, num_head, seq_len, head_dim

      # batch_size, num_head, seq_len
      mixed_query_key_layer=mixed_key_layer* pooled_query_repeat

      query_key_score=(self.key_att(mixed_query_key_layer)/ self.attention_head_size**0.5).transpose(1, 2)

      # add attention mask
      query_key_score +=attention_mask

      # batch_size, num_head, 1, seq_len
      query_key_weight = self.softmax(query_key_score).unsqueeze(2)

      key_layer = self.transpose_for_scores(mixed_query_key_layer)
      pooled_key = torch.matmul(query_key_weight, key_layer)

      #query = value
      weighted_value =(pooled_key * query_layer).transpose(1, 2)
      weighted_value = weighted_value.reshape(
          weighted_value.size()[:-2] + (self.num_attention_heads * self.attention_head_size,))
      weighted_value = self.transform(weighted_value) + mixed_query_layer

      return weighted_value

In [31]:
class NewsEncoderFastformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, pretrained_emb=None):
        super(NewsEncoderFastformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx['<PAD>'])
        if pretrained_emb is not None:
            self.embed.weight.data.copy_(pretrained_emb)
            self.embed.weight.requires_grad = True
        self.fastformer = FastSelfAttention(embed_dim, num_heads)
        self.pooler = AttentionPooling(embed_dim)

    def forward(self, x):
        mask = x.bool().float()
        emb = self.embed(x)                           # (batch, seq_len, embed_dim)
        attn_out = self.fastformer(emb, mask)         # (batch, seq_len, embed_dim)
        news_vec = self.pooler(attn_out, mask)        # (batch, embed_dim)
        return news_vec

class UserEncoderFastformer(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(UserEncoderFastformer, self).__init__()
        self.fastformer = FastSelfAttention(embed_dim, num_heads)
        self.pooler = AttentionPooling(embed_dim)

    def forward(self, news_vecs):
        # news_vecs: (batch, hist_size, embed_dim)
        mask = news_vecs.abs().sum(dim=-1).bool().float()  # detect padding
        attn_out = self.fastformer(news_vecs, mask)
        user_vec = self.pooler(attn_out, mask)
        return user_vec

class FastformerNRMS(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, title_max, hist_max, pretrained_emb=None):
        super(FastformerNRMS, self).__init__()
        self.news_encoder = NewsEncoderFastformer(vocab_size, embed_dim, num_heads, pretrained_emb)
        self.user_encoder = UserEncoderFastformer(embed_dim, num_heads)

    def forward(self, hist, cand):
        batch_size = hist.size(0)
        hist_flat = hist.view(-1, hist.size(2))           # (batch*hist_max, title_max)
        hist_vecs = self.news_encoder(hist_flat)          # (batch*hist_max, embed_dim)
        hist_vecs = hist_vecs.view(batch_size, -1, hist_vecs.size(-1))  # (batch, hist_max, embed_dim)
        user_vec = self.user_encoder(hist_vecs)           # (batch, embed_dim)
        cand_vec = self.news_encoder(cand)                # (batch, embed_dim)
        score = torch.sum(user_vec * cand_vec, dim=1)     # (batch,)
        return score

In [37]:
embed_dim = 300
num_heads = 10
lr = 0.001

In [38]:
model = FastformerNRMS(vocab_size, embed_dim, num_heads, max_size_title, max_hist_title,
             pretrained_emb=embedding_matrix.to(device) if embedding_matrix is not None else None)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss()

In [39]:
epochs = 3

In [41]:
def ndcg_score(labels, scores, k=5):
    order = np.argsort(scores)[::-1]
    labels = np.array(labels)
    dcg = 0.0
    for i in range(min(k, len(labels))):
        rel = labels[order[i]]
        dcg += (2**rel - 1) / np.log2(i+2)
    ideal = np.sort(labels)[::-1]
    idcg = 0.0
    for i in range(min(k, int(np.sum(labels)))):
        idcg += 1.0 / np.log2(i+2)
    return dcg / idcg if idcg > 0 else 0.0

In [40]:
for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0.0
    for hist_batch, cand_batch, label_batch, _ in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}"):
        optimizer.zero_grad()
        scores = model(hist_batch, cand_batch)
        loss = criterion(scores, label_batch.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} - Pérdida promedio: {avg_loss:.4f}")
    # Evaluación en validación
    if val_loader is not None:
        model.eval()
        all_preds = {}
        with torch.no_grad():
            for hist_batch, cand_batch, label_batch, impr_batch in val_loader:
                scores = model(hist_batch, cand_batch).cpu().numpy()
                labels = label_batch.cpu().numpy().flatten()
                for impr_id, s, l in zip(impr_batch, scores, labels):
                    all_preds.setdefault(impr_id, []).append((s,l))
        # Calcular métricas en validación
        correct, total = 0, 0
        ndcg5_list = []
        for impr_id, recs in all_preds.items():
            scores = [s for (s,l) in recs]
            labels = [l for (s,l) in recs]
            preds_bin = [1 if s>=0.0 else 0 for s in scores]
            correct += sum(int(p==l) for p,l in zip(preds_bin, labels))
            total += len(labels)
            ndcg5_list.append(ndcg_score(labels, scores, k=5))
        acc = correct / total if total>0 else 0
        ndcg5 = np.mean(ndcg5_list) if ndcg5_list else 0
        print(f"Validación - Accuracy: {acc:.4f}, nDCG@5: {ndcg5:.4f}")

Epoch 1/3: 100%|██████████| 3594/3594 [10:09<00:00,  5.90it/s]


Epoch 1 - Pérdida promedio: 0.1810
Validación - Accuracy: 0.9402, nDCG@5: 0.2065


Epoch 2/3: 100%|██████████| 3594/3594 [10:06<00:00,  5.93it/s]


Epoch 2 - Pérdida promedio: 0.1736
Validación - Accuracy: 0.9398, nDCG@5: 0.2339


Epoch 3/3: 100%|██████████| 3594/3594 [10:04<00:00,  5.95it/s]


Epoch 3 - Pérdida promedio: 0.1688
Validación - Accuracy: 0.9390, nDCG@5: 0.2289
