In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

KeyboardInterrupt: 

In [None]:
# Neither TSV carries a header row, so the column names are supplied explicitly.
behaviors_columns = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
news_columns = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']

df_behaviors = pd.read_csv("behaviors_sample.tsv", sep="\t", names=behaviors_columns)
df_news = pd.read_csv("news_sample.tsv", sep="\t", names=news_columns)

In [None]:
# Sanity check: (rows, columns) of the raw behaviors table.
df_behaviors.shape

In [None]:
# Chronological split: impressions strictly before the cutoff are used for
# training, impressions on or after it for validation.
df_behaviors["Time"] = pd.to_datetime(df_behaviors["Time"])
cutoff = pd.to_datetime("2019-11-14")

before_cutoff = df_behaviors["Time"] < cutoff
from_cutoff = df_behaviors["Time"] >= cutoff

# .copy() detaches the two splits from df_behaviors.
behavior_train = df_behaviors.loc[before_cutoff].copy()
behavior_val = df_behaviors.loc[from_cutoff].copy()

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Usando dispositivo: {device}')

In [None]:
_TOKEN_PATTERN = re.compile(r"[\w']+")


def tokenize(text):
    """Lower-case ``text`` and split it into tokens.

    A token is a maximal run of ``\\w`` characters and apostrophes; everything
    else (spaces, punctuation) acts as a separator.

    Args:
        text: String to tokenize.

    Returns:
        List of lower-cased tokens, empty when ``text`` contains none.
    """
    return _TOKEN_PATTERN.findall(text.lower())

In [None]:
# How many non-missing titles have fewer than 20 whitespace-separated words?
longitudes = df_news["Title"].dropna().map(lambda titulo: len(titulo.split()))
total = len(longitudes)
cantidad_menor_20 = longitudes.lt(20).sum()

print(f"Títulos con menos de 20 palabras: {cantidad_menor_20} de {total} ({cantidad_menor_20 / total:.2%})")

In [None]:
# Token -> id table; ids 0 and 1 are reserved for padding and unknown words.
word2idx = dict([('<PAD>', 0), ('<UNK>', 1)])
# Next free token id, i.e. right after the reserved <PAD> and <UNK> entries.
idx = len(word2idx)
# news_id -> list of token ids for its title (padded / truncated).
news2idx = dict()
# Fixed number of tokens kept per title.
max_size_title = 20

In [None]:
# Encode every news title as exactly max_size_title token ids, growing the
# vocabulary with each previously unseen word along the way.
for news_id, title in tqdm(zip(df_news["NewsID"], df_news["Title"]), total=df_news.shape[0]):
    # Missing titles become an all-PAD sequence; long titles are truncated.
    words = [] if pd.isna(title) else tokenize(title)[:max_size_title]
    encoded = []
    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
        encoded.append(word2idx[word])
    # Right-pad short titles up to the fixed length.
    n_missing = max_size_title - len(encoded)
    if n_missing > 0:
        encoded.extend([word2idx['<PAD>']] * n_missing)
    news2idx[news_id] = encoded

In [None]:
# Number of entries in word2idx, including the reserved <PAD> and <UNK> ids.
vocab_size = len(word2idx)
print(f'Vocabulario: {vocab_size} palabras')

In [None]:
# Flatten every training impression into one
# (impression_id, history_ids, candidate_news_id, label) tuple per candidate.
data = []
for _, row in tqdm(behavior_train.iterrows(), total=behavior_train.shape[0]):
    raw_history, raw_impressions = row['History'], row['Impressions']
    # Missing history / impressions are treated as empty lists.
    hist_ids = [] if pd.isna(raw_history) else raw_history.split()
    impression_id = row['ImpressionID']
    candidates = [] if pd.isna(raw_impressions) else raw_impressions.split()
    for pieces in (candidate.split('-') for candidate in candidates):
        # Only well-formed "<news_id>-<click>" entries are kept.
        if len(pieces) == 2:
            news_id, click = pieces
            data.append((impression_id, hist_ids, news_id, int(click)))

In [None]:
# data = data[:460000]

In [None]:
# Number of (impression, candidate) training examples built above.
print(f'Total de ejemplos de interacción: {len(data)}')

In [None]:
# Flatten every validation impression into one
# (impression_id, history_ids, candidate_news_id, label) tuple per candidate.
val_data = []

for _, row in tqdm(behavior_val.iterrows(), total=behavior_val.shape[0]):
    hist_str = row['History']
    # A missing history is treated as an empty click history.
    hist_ids = [] if pd.isna(hist_str) else hist_str.split()
    impr = row['ImpressionID']
    # Fix: guard against a missing Impressions value, as the training loop
    # does; calling .split() on NaN would raise AttributeError.
    imps = [] if pd.isna(row['Impressions']) else row['Impressions'].split()
    for imp in imps:
        parts = imp.split('-')
        # Only well-formed "<news_id>-<click>" entries are kept.
        if len(parts) != 2:
            continue
        news_id, click = parts
        val_data.append((impr, hist_ids, news_id, int(click)))

In [None]:
# val_data = val_data[:120000]

In [None]:
# Number of (impression, candidate) validation examples built above.
print(f'Total ejemplos validación: {len(val_data)}')

In [None]:
class MINDDataset(Dataset):
    """Map-style dataset over (impression, history, candidate, label) tuples.

    Args:
        interactions: Sequence of ``(impr, hist_ids, cand_id, label)`` tuples.
        news2idx: Mapping from news id to its list of title token ids.
        word2idx: Token -> id mapping; only the ``'<PAD>'`` entry is read here.
        hist_max: Number of history slots per example.
        title_max: Length of the all-PAD title used for unknown news ids and
            for empty history slots.
    """

    def __init__(self, interactions, news2idx, word2idx, hist_max, title_max):
        self.interactions = interactions
        self.news2idx = news2idx
        self.word2idx = word2idx
        self.hist_max = hist_max
        self.title_max = title_max

    def __len__(self):
        return len(self.interactions)

    def _encode(self, news_id):
        """Return the token ids of ``news_id``, or an all-PAD title if unknown."""
        return self.news2idx.get(news_id, [self.word2idx['<PAD>']] * self.title_max)

    def __getitem__(self, idx):
        """Return ``(history, candidate, label, impression_id)`` for one example.

        ``history`` is a long tensor with one row of token ids per history
        slot, ``candidate`` a long tensor of token ids and ``label`` a float
        scalar tensor.
        """
        impr, hist_ids, cand_id, label = self.interactions[idx]
        # Keep only the most recent hist_max clicks.
        recent = hist_ids[-self.hist_max:] if len(hist_ids) > self.hist_max else hist_ids
        encoded = [self._encode(nid) for nid in recent]
        # Left-pad with blank titles so real clicks sit at the end.
        n_missing = self.hist_max - len(encoded)
        if n_missing > 0:
            blank = [self.word2idx['<PAD>']] * self.title_max
            encoded = [blank] * n_missing + encoded
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(self._encode(cand_id), dtype=torch.long),
            torch.tensor(label, dtype=torch.float),
            impr,
        )

In [None]:
def collate_fn(batch):
    """Stack per-example tensors into batch tensors on the global ``device``.

    Args:
        batch: Iterable of ``(hist, cand, label, impr)`` items.

    Returns:
        Tuple ``(hist_batch, cand_batch, label_batch, impr_batch)``. The three
        tensors are stacked along a new first dimension and moved to
        ``device``; ``label_batch`` is reshaped to a single column and
        ``impr_batch`` is a plain list of impression ids.
    """
    histories, candidates, labels, impression_ids = zip(*batch)
    hist_batch = torch.stack(histories)              # (batch, hist_max, title_max)
    cand_batch = torch.stack(candidates)             # (batch, title_max)
    label_batch = torch.stack(labels).view(-1, 1)    # (batch, 1)
    return (
        hist_batch.to(device),
        cand_batch.to(device),
        label_batch.to(device),
        list(impression_ids),
    )

In [None]:
# Number of clicked-news slots kept per user (passed to MINDDataset as hist_max).
max_hist_title = 50
# Examples per mini-batch for both data loaders.
batch_size = 128

In [None]:
# Both splits share the same encoding tables and fixed history/title sizes.
dataset_args = (news2idx, word2idx, max_hist_title, max_size_title)
train_dataset = MINDDataset(data, *dataset_args)
val_dataset = MINDDataset(val_data, *dataset_args)

# Only the training loader shuffles; validation keeps the original order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Word-embedding width; also the model dimension of both attention layers.
embed_dim = 300
# Attention heads; nn.MultiheadAttention requires embed_dim % num_heads == 0.
num_heads = 10
# Learning rate handed to the Adam optimizer.
lr = 0.001

In [None]:
# Set to True to initialise the embedding table from GloVe vectors
# (requires "glove.6B.300d.txt" in the working directory).
glove = False

if glove:
    # Words missing from the GloVe file keep this random initialisation.
    embedding_matrix = np.random.normal(scale=0.6, size=(vocab_size, embed_dim))
    # Fix: keep the padding row at zero. Copying a random row into
    # nn.Embedding would overwrite its zero padding vector, and because
    # padding_idx rows receive no gradient it would stay random forever.
    embedding_matrix[word2idx['<PAD>']] = 0.0
    found = 0
    with open("glove.6B.300d.txt", 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is "<word> <v1> <v2> ... <vN>".
            parts = line.rstrip().split(' ')
            word = parts[0]
            if word in word2idx:
                vec = np.array(parts[1:], dtype=np.float32)
                # Skip vectors whose width does not match the model.
                if vec.shape[0] == embed_dim:
                    embedding_matrix[word2idx[word]] = vec
                    found += 1
    print(f'Palabras encontradas en GloVe: {found}/{vocab_size}')
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)
else:
    # No pretrained table: the model keeps its default embedding initialisation.
    embedding_matrix = None

In [None]:
class NewsEncoder(nn.Module):
    """Encode a padded title (token ids) into a single vector.

    Pipeline: word embedding -> multi-head self-attention over the words ->
    additive attention pooling over the words.

    Padding positions are masked in both attention steps, so the result does
    not depend on how many PAD tokens follow the real words. A title made
    only of PAD tokens is encoded as the zero vector.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_dim: Embedding width and attention model dimension.
        num_heads: Number of self-attention heads.
        title_max: Accepted for interface compatibility; not used here.
        pretrained_emb: Optional ``(vocab_size, embed_dim)`` tensor copied
            into the embedding table; the table stays trainable.
        pad_idx: Token id used for padding. Defaults to 0, the id of
            ``'<PAD>'`` in this notebook's vocabulary; passing it explicitly
            removes the class's dependency on the global ``word2idx``.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, title_max, pretrained_emb=None, pad_idx=0):
        super(NewsEncoder, self).__init__()
        self.pad_idx = pad_idx
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        if pretrained_emb is not None:
            self.embed.weight.data.copy_(pretrained_emb)
            self.embed.weight.requires_grad = True
        self.word_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.Ww = nn.Linear(embed_dim, embed_dim)
        self.qw = nn.Linear(embed_dim, 1, bias=False)

    def forward(self, x):
        """Return one vector per title.

        Args:
            x: Long tensor of token ids, shape ``(batch, title_len)``.

        Returns:
            Float tensor of shape ``(batch, embed_dim)``.
        """
        pad_mask = x.eq(self.pad_idx)                      # (batch, title_len)
        # Masking every key of a row would make the softmax produce NaN, so
        # all-PAD titles are left unmasked here and zeroed out at the end.
        empty_title = pad_mask.all(dim=1, keepdim=True)    # (batch, 1)
        ignore = pad_mask & ~empty_title

        emb = self.embed(x)                                # (batch, title_len, embed_dim)
        attn_out, _ = self.word_attn(emb, emb, emb, key_padding_mask=ignore)
        M = torch.tanh(self.Ww(attn_out))
        # PAD positions get -inf so their pooling weight is exactly zero.
        word_scores = self.qw(M).squeeze(-1).masked_fill(ignore, float('-inf'))
        alpha = torch.softmax(word_scores, dim=1)
        r = torch.sum(attn_out * alpha.unsqueeze(-1), dim=1)  # (batch, embed_dim)
        return r.masked_fill(empty_title, 0.0)

class UserEncoder(nn.Module):
    """Pool a sequence of news vectors into a single user vector.

    Multi-head self-attention relates the news vectors to each other, then an
    additive attention layer weights and sums them.

    Args:
        embed_dim: Width of the incoming news vectors.
        num_heads: Number of self-attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.news_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.Wn = nn.Linear(embed_dim, embed_dim)
        self.qn = nn.Linear(embed_dim, 1, bias=False)

    def forward(self, news_vecs):
        """Map ``(batch, n_news, embed_dim)`` news vectors to ``(batch, embed_dim)``."""
        # Self-attention across the news items of each user.
        context, _ = self.news_attn(news_vecs, news_vecs, news_vecs)
        # Additive attention: one normalised weight per news item.
        weights = self.qn(self.Wn(context).tanh()).squeeze(-1).softmax(dim=1)
        return (context * weights.unsqueeze(-1)).sum(dim=1)

class NRMS(nn.Module):
    """Score a candidate news item against a user's click history.

    The same news encoder embeds the history titles and the candidate title;
    the user encoder pools the history vectors, and the score is the dot
    product between the user vector and the candidate vector.

    Args:
        vocab_size, embed_dim, num_heads, title_max, pretrained_emb:
            Forwarded to ``NewsEncoder``.
        hist_max: Accepted for interface compatibility; not used here.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, title_max, hist_max, pretrained_emb=None):
        super().__init__()
        self.news_encoder = NewsEncoder(vocab_size, embed_dim, num_heads, title_max, pretrained_emb)
        self.user_encoder = UserEncoder(embed_dim, num_heads)

    def forward(self, hist, cand):
        """Return one logit per example.

        Args:
            hist: Token ids of the history titles, ``(batch, hist_max, title_max)``.
            cand: Token ids of the candidate title, ``(batch, title_max)``.

        Returns:
            Tensor of shape ``(batch,)`` with unnormalised click scores.
        """
        n_users, title_len = hist.size(0), hist.size(2)
        # Encode all history titles in one pass, then restore the per-user axis.
        clicked = self.news_encoder(hist.view(-1, title_len))      # (batch*hist_max, embed_dim)
        clicked = clicked.view(n_users, -1, clicked.size(-1))      # (batch, hist_max, embed_dim)
        user_vec = self.user_encoder(clicked)                      # (batch, embed_dim)
        cand_vec = self.news_encoder(cand)                         # (batch, embed_dim)
        return (user_vec * cand_vec).sum(dim=1)                    # logits, (batch,)

In [None]:
# Move the optional pretrained table to the target device before it is
# copied into the model's embedding layer.
pretrained_emb = None if embedding_matrix is None else embedding_matrix.to(device)

model = NRMS(
    vocab_size,
    embed_dim,
    num_heads,
    max_size_title,
    max_hist_title,
    pretrained_emb=pretrained_emb,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The model outputs raw logits, hence the sigmoid-fused BCE loss.
criterion = nn.BCEWithLogitsLoss()

In [None]:
from sklearn.metrics import roc_auc_score

def ndcg_score(labels, scores, k=5):
    """Normalised discounted cumulative gain at rank ``k``.

    Uses the exponential gain ``2**rel - 1`` and a ``log2(rank + 1)``
    discount. The ideal DCG is computed from the labels sorted in descending
    order with the same gain, so DCG and IDCG are consistent for graded
    labels too; for binary labels the result is the same as counting one unit
    of gain per positive.

    Args:
        labels: Relevance of each item (binary or graded).
        scores: Predicted score of each item, same length as ``labels``.
        k: Cut-off rank.

    Returns:
        nDCG@k as a float, or 0.0 when there is no relevant item (or no item
        at all, or ``k <= 0``).
    """
    labels = np.asarray(labels, dtype=float)
    top = max(0, min(k, labels.size))
    # Discount for ranks 1..top: log2(rank + 1).
    discounts = np.log2(np.arange(2, top + 2))

    order = np.argsort(scores)[::-1][:top]
    dcg = np.sum((2.0 ** labels[order] - 1.0) / discounts)

    ideal = np.sort(labels)[::-1][:top]
    idcg = np.sum((2.0 ** ideal - 1.0) / discounts)
    return float(dcg / idcg) if idcg > 0 else 0.0

def mrr_score(labels, scores):
    """Reciprocal rank of the best-ranked item whose label equals 1.

    Items are ranked by descending ``scores``.

    Args:
        labels: Label of each item.
        scores: Predicted score of each item, same length as ``labels``.

    Returns:
        ``1 / rank`` of the first item labelled 1, or 0.0 if there is none.
    """
    ranked_labels = np.array(labels)[np.argsort(scores)[::-1]]
    hit_positions = np.flatnonzero(ranked_labels == 1)
    if hit_positions.size == 0:
        return 0.0
    # Positions are 0-based, ranks are 1-based.
    return 1.0 / (int(hit_positions[0]) + 1)


In [None]:
# Number of full passes over the training loader.
epochs = 3

In [None]:
# Train for `epochs` epochs, evaluate on the validation loader after each one
# and keep the weights of the epoch with the highest nDCG@5.
best_ndcg5 = 0.0
best_model_state = None

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0.0
    for hist_batch, cand_batch, label_batch, _ in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}"):
        optimizer.zero_grad()
        scores = model(hist_batch, cand_batch)
        # Labels arrive as (batch, 1); the model returns (batch,) logits.
        loss = criterion(scores, label_batch.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} - Pérdida promedio: {avg_loss:.4f}")

    # Validation
    if val_loader is not None:
        model.eval()
        # impression id -> list of (score, label) for its candidates
        all_preds = {}
        with torch.no_grad():
            for hist_batch, cand_batch, label_batch, impr_batch in val_loader:
                scores = model(hist_batch, cand_batch).cpu().numpy()
                labels = label_batch.cpu().numpy().flatten()
                for impr_id, s, l in zip(impr_batch, scores, labels):
                    all_preds.setdefault(impr_id, []).append((s, l))

        correct, total = 0, 0
        ndcg5_list, ndcg10_list, mrr_list, auc_scores = [], [], [], []

        for impr_id, recs in all_preds.items():
            scores = [s for (s, l) in recs]
            labels = [l for (s, l) in recs]

            # Binary accuracy: a logit >= 0 corresponds to probability >= 0.5.
            preds_bin = [1 if s >= 0.0 else 0 for s in scores]
            correct += sum(int(p == l) for p, l in zip(preds_bin, labels))
            total += len(labels)

            # Ranking metrics, computed per impression
            ndcg5_list.append(ndcg_score(labels, scores, k=5))
            ndcg10_list.append(ndcg_score(labels, scores, k=10))
            mrr_list.append(mrr_score(labels, scores))

            # Per-impression AUC (needs at least one positive and one negative)
            if len(set(labels)) > 1:
                auc_scores.append(roc_auc_score(labels, scores))

        acc = correct / total if total > 0 else 0
        ndcg5 = np.mean(ndcg5_list)
        ndcg10 = np.mean(ndcg10_list)
        mrr = np.mean(mrr_list)
        auc = np.mean(auc_scores) if auc_scores else 0.0

        if ndcg5 > best_ndcg5:
            best_ndcg5 = ndcg5
            # Fix: state_dict() returns references to the live parameters, so
            # later epochs would silently overwrite the "best" snapshot.
            # Cloning each tensor freezes the weights of this epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"Nuevo mejor modelo guardado (nDCG@5 = {ndcg5:.4f})")


        print(f"Validación - Accuracy: {acc:.4f}, AUC: {auc:.4f}, MRR: {mrr:.4f}, nDCG@5: {ndcg5:.4f}, nDCG@10: {ndcg10:.4f}, ")

# Persist the best snapshot (if any epoch improved on the initial 0.0).
if best_model_state is not None:
    torch.save(best_model_state, "nrms_best.pt")
    print(f"Modelo con mejor nDCG@5 guardado en nrms_best.pt")