In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [None]:
# Column layouts for the two header-less, tab-separated input files.
behaviors_columns = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
news_columns = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL',
                'TitleEntities', 'AbstractEntities']

df_behaviors = pd.read_csv("beh.tsv", sep="\t", names=behaviors_columns)
df_news = pd.read_csv("news.tsv", sep="\t", names=news_columns)

# Parse timestamps so the log can be split chronologically.
df_behaviors["Time"] = pd.to_datetime(df_behaviors["Time"])
cutoff = pd.to_datetime("2019-11-14")

# Rows strictly before the cutoff are used for training, rows at/after it for
# validation. Both masks are spelled out so rows with an unparseable time (NaT)
# end up in neither split.
before_cutoff = df_behaviors["Time"] < cutoff
from_cutoff = df_behaviors["Time"] >= cutoff
behavior_train = df_behaviors.loc[before_cutoff].copy()
behavior_val = df_behaviors.loc[from_cutoff].copy()


In [None]:
def tokenize(text):
    """Lower-case ``text`` and return its tokens as a list of strings.

    A token is a maximal run of word characters and apostrophes; everything
    else (spaces, punctuation) acts as a separator and is dropped.
    """
    lowered = text.lower()
    return re.compile(r"[\w']+").findall(lowered)

# Whitespace-delimited word count of every non-missing title.
longitudes = df_news["Title"].dropna().map(lambda title: len(title.split()))
total = len(longitudes)
# Number of titles shorter than 20 words.
cantidad_menor_20 = (longitudes < 20).sum()
print(f"Titles with less than 20 words: {cantidad_menor_20} of {total} ({cantidad_menor_20 / total:.2%})")

Titles with less than 20 words: 23339 of 23701 (98.47%)


In [None]:
# Titles are truncated / padded to this many tokens.
max_size_title = 20

# Word vocabulary: id 0 is reserved for padding, id 1 for unknown words.
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx = len(word2idx)  # next free word id

# NewsID -> encoded title / category / subcategory, filled in later.
news2idx = {}

# Category and subcategory vocabularies; id 0 stands for "unknown".
category2idx = {'<UNK>': 0}
subcategory2idx = {'<UNK>': 0}
cat_idx = len(category2idx)        # next free category id
subcat_idx = len(subcategory2idx)  # next free subcategory id

# User vocabulary; id 0 stands for "unknown user".
user2idx = {'<UNK>': 0}
user_idx = len(user2idx)  # next free user id

In [None]:
# Encode every news article: title -> fixed-length list of word ids,
# category / subcategory -> integer ids. Vocabularies grow as new values appear.
pad_id = word2idx['<PAD>']
for _, row in tqdm(df_news.iterrows(), total=df_news.shape[0]):
    news_id = row["NewsID"]
    title = row["Title"]

    # Missing titles become an empty token list; long titles are truncated.
    words = tokenize(title)[:max_size_title] if not pd.isna(title) else []

    token_idxs = []
    for w in words:
        if w not in word2idx:
            # First occurrence of this word: assign the next free id.
            word2idx[w] = idx
            idx += 1
        token_idxs.append(word2idx[w])

    # Right-pad short titles up to max_size_title.
    token_idxs.extend([pad_id] * (max_size_title - len(token_idxs)))

    # Missing category / subcategory values are mapped to the "<UNK>" entry.
    category = "<UNK>" if pd.isna(row["Category"]) else row["Category"]
    subcategory = "<UNK>" if pd.isna(row["SubCategory"]) else row["SubCategory"]

    if category not in category2idx:
        category2idx[category] = cat_idx
        cat_idx += 1
    if subcategory not in subcategory2idx:
        subcategory2idx[subcategory] = subcat_idx
        subcat_idx += 1

    news2idx[news_id] = {
        'title': token_idxs,
        'category': category2idx[category],
        'subcategory': subcategory2idx[subcategory]
    }


100%|██████████| 23701/23701 [00:02<00:00, 9644.96it/s] 


In [None]:
# Assign an integer id to every user, scanning the training split first and the
# validation split second so ids are handed out in first-seen order.
for behaviors in (behavior_train, behavior_val):
    for user_id in tqdm(behaviors['UserID'], total=behaviors.shape[0]):
        if user_id in user2idx:
            continue
        user2idx[user_id] = user_idx
        user_idx += 1

100%|██████████| 12664/12664 [00:00<00:00, 15358.73it/s]
100%|██████████| 3047/3047 [00:00<00:00, 9610.30it/s] 


In [None]:
# Sizes of the lookup tables built above; used to size the embedding layers.
vocab_size, num_categories, num_subcategories, num_users = (
    len(mapping) for mapping in (word2idx, category2idx, subcategory2idx, user2idx)
)

print(f'Vocabulary: {vocab_size} words')
print(f'Categories: {num_categories}')
print(f'Subcategories: {num_subcategories}')
print(f'Users: {num_users}')

# Expand every training impression log into one example per shown candidate:
# (impression id, user id, history news ids, candidate news id, click label).
data = []
for _, row in tqdm(behavior_train.iterrows(), total=behavior_train.shape[0]):
    user_id = row['UserID']
    impr = row['ImpressionID']

    history = row['History']
    hist_ids = [nid for nid in history.split() if nid] if not pd.isna(history) else []

    raw_impressions = row['Impressions']
    imps = raw_impressions.split() if not pd.isna(raw_impressions) else []

    for imp in imps:
        if not imp:
            continue
        # Each entry is expected to look like "<news id>-<0|1>"; anything else is skipped.
        parts = imp.split('-')
        if len(parts) == 2:
            news_id, click = parts
            label = int(click)
            data.append((impr, user_id, hist_ids, news_id, label))

# Cap the number of training examples.
data = data[:460000]
print(f'Total interaction examples: {len(data)}')

Vocabulary: 25906 words
Categories: 18
Subcategories: 239
Users: 5001


100%|██████████| 12664/12664 [00:03<00:00, 3173.63it/s]

Total interaction examples: 452619





In [None]:
# Expand every validation impression log into one example per shown candidate:
# (impression id, user id, history news ids, candidate news id, click label).
val_data = []
for _, row in tqdm(behavior_val.iterrows(), total=behavior_val.shape[0]):
    user_id = row['UserID']
    impr = row['ImpressionID']

    # A missing history means the user has no recorded clicks.
    hist_str = row['History']
    hist_ids = [] if pd.isna(hist_str) else hist_str.split()

    # Guard against a missing Impressions field, exactly as the training
    # expansion does; calling .split() on NaN would raise AttributeError.
    imps_str = row['Impressions']
    imps = [] if pd.isna(imps_str) else imps_str.split()

    for imp in imps:
        # Each entry is expected to look like "<news id>-<0|1>"; anything else is skipped.
        parts = imp.split('-')
        if len(parts) != 2:
            continue
        news_id, click = parts
        val_data.append((impr, user_id, hist_ids, news_id, int(click)))

# Cap the number of validation examples.
# NOTE(review): the cut is made by example count, so the last impression kept
# may be incomplete - confirm this is acceptable for the per-impression metrics.
val_data = val_data[:120000]
print(f'Total validation examples: {len(val_data)}')

100%|██████████| 3047/3047 [00:00<00:00, 3594.66it/s]

Total validation examples: 120000





In [None]:
class MINDDatasetLSTUR(Dataset):
    """Map-style dataset turning interaction tuples into model-ready tensors.

    Each interaction is ``(impression id, user id, history news ids,
    candidate news id, label)``. ``news2idx`` maps a news id to a dict with the
    keys ``'title'`` (list of word ids), ``'category'`` and ``'subcategory'``.
    ``word2idx`` must contain ``'<PAD>'`` and ``user2idx`` must contain
    ``'<UNK>'``.
    """

    def __init__(self, interactions, news2idx, word2idx, user2idx, hist_max, title_max):
        self.interactions = interactions
        self.news2idx = news2idx
        self.word2idx = word2idx
        self.user2idx = user2idx
        self.hist_max = hist_max    # number of history slots per example
        self.title_max = title_max  # length of the all-padding placeholder title

    def __len__(self):
        """Return the number of interaction examples."""
        return len(self.interactions)

    def __getitem__(self, idx):
        """Return ``(history titles, candidate title, candidate category,
        candidate subcategory, user index, label, impression id)``.

        The first six items are tensors; the impression id is passed through
        unchanged.
        """
        impr, user_id, hist_ids, cand_id, label = self.interactions[idx]

        # Placeholder title used for unknown news and for empty history slots.
        pad_title = [self.word2idx['<PAD>']] * self.title_max

        # Users missing from the vocabulary share the '<UNK>' index.
        user_idx = self.user2idx.get(user_id, self.user2idx['<UNK>'])

        # Keep only the most recent hist_max history entries.
        recent_ids = hist_ids[-self.hist_max:] if len(hist_ids) > self.hist_max else hist_ids
        titles = [
            self.news2idx[nid]['title'] if nid in self.news2idx else pad_title
            for nid in recent_ids
        ]
        # Left-pad so that real entries sit at the end of the sequence.
        hist_seq = [pad_title] * (self.hist_max - len(titles)) + titles

        if cand_id in self.news2idx:
            cand_entry = self.news2idx[cand_id]
            cand_seq = cand_entry['title']
            cand_cat = cand_entry['category']
            cand_subcat = cand_entry['subcategory']
        else:
            # Unknown candidate: all-padding title and id 0 for both categories.
            cand_seq, cand_cat, cand_subcat = pad_title, 0, 0

        return (
            torch.tensor(hist_seq, dtype=torch.long),
            torch.tensor(cand_seq, dtype=torch.long),
            torch.tensor(cand_cat, dtype=torch.long),
            torch.tensor(cand_subcat, dtype=torch.long),
            torch.tensor(user_idx, dtype=torch.long),
            torch.tensor(label, dtype=torch.float),
            impr,
        )

def collate_fn_lstur(batch):
    """Collate dataset items into batched tensors placed on the global ``device``.

    ``batch`` is a sequence of 7-tuples whose first six entries are tensors and
    whose last entry is the impression id. Returns the six stacked tensors
    (labels reshaped to a column) followed by the impression ids as a list.
    """
    *tensor_columns, impr_ids = zip(*batch)
    hist_b, cand_b, cat_b, subcat_b, user_b, label_b = (
        torch.stack(column) for column in tensor_columns
    )
    # Labels are handed out as a column vector.
    label_b = label_b.view(-1, 1)

    on_device = tuple(
        t.to(device) for t in (hist_b, cand_b, cat_b, subcat_b, user_b, label_b)
    )
    return on_device + (list(impr_ids),)


In [None]:
class AttentionPooling(nn.Module):
    """Additive attention that pools along dim 1 of a 3-D input.

    A two-layer scorer (linear -> tanh -> linear) yields one score per
    position; the exponentiated, optionally masked and normalised scores weight
    a sum over positions.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.att_fc1 = nn.Linear(hidden_size, hidden_size)
        self.att_fc2 = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(self, x, attn_mask=None):
        """Pool ``x`` over dim 1.

        ``attn_mask``, when given, is multiplied into the attention weights
        after being unsqueezed on dim 2, so zero entries remove positions.
        Returns a tensor reshaped to ``(x.shape[0], -1)``.
        """
        batch = x.shape[0]
        weights = torch.exp(self.att_fc2(self.tanh(self.att_fc1(x))))
        if attn_mask is not None:
            weights = weights * attn_mask.unsqueeze(2)
        # The epsilon keeps the division finite when every position is masked.
        weights = weights / (torch.sum(weights, dim=1, keepdim=True) + 1e-8)
        pooled = torch.bmm(x.permute(0, 2, 1), weights)
        return torch.reshape(pooled, (batch, -1))

class NewsEncoder(nn.Module):
    """Encode a news item from its title word ids, category id and subcategory id.

    The title is embedded, passed through a 1-D convolution + ReLU and pooled
    with ``AttentionPooling`` (padding positions masked out). The pooled title
    vector is concatenated with the category and subcategory embeddings, so
    the output width is ``final_dim = num_filters + 2 * embed_dim``.

    Args:
        vocab_size: number of rows in the word embedding table.
        num_categories: number of rows in the category embedding table.
        num_subcategories: number of rows in the subcategory embedding table.
        embed_dim: width of the word, category and subcategory embeddings.
        num_filters: number of convolution output channels.
        window_size: convolution kernel width; must be a positive odd number so
            the convolution output keeps the title length.
        pad_idx: word id used for title padding (masked in the attention and
            used as the embedding's ``padding_idx``).

    Raises:
        ValueError: if ``window_size`` is not a positive odd integer.
    """

    def __init__(self, vocab_size, num_categories, num_subcategories, embed_dim,
                 num_filters=100, window_size=3, pad_idx=0):
        super(NewsEncoder, self).__init__()
        if window_size < 1 or window_size % 2 == 0:
            raise ValueError(
                f"window_size must be a positive odd integer, got {window_size}"
            )

        # The padding id is an explicit argument instead of being read from a
        # notebook-level vocabulary dict, so the class is self-contained.
        self.pad_idx = pad_idx
        self.word_embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.category_embed = nn.Embedding(num_categories, embed_dim)
        self.subcategory_embed = nn.Embedding(num_subcategories, embed_dim)

        # "Same" padding: for an odd kernel, window_size // 2 on each side keeps
        # one output position per token, which the attention mask relies on.
        self.cnn = nn.Conv1d(embed_dim, num_filters, window_size, padding=window_size // 2)
        self.relu = nn.ReLU()

        self.attention = AttentionPooling(num_filters)

        # Width of the vector returned by forward().
        self.final_dim = num_filters + embed_dim + embed_dim

    def forward(self, title, category, subcategory):
        """Return the news vectors for a batch.

        ``title`` holds word ids with one row per news item; ``category`` and
        ``subcategory`` hold one id per news item. The result has
        ``final_dim`` columns.
        """
        # Title encoding: Conv1d expects channels on dim 1, hence the transposes.
        title_emb = self.word_embed(title)
        title_cnn = self.relu(self.cnn(title_emb.transpose(1, 2)))
        title_cnn_t = title_cnn.transpose(1, 2)

        # 1.0 for real tokens, 0.0 for padding.
        mask = (title != self.pad_idx).float()
        title_vec = self.attention(title_cnn_t, mask)

        cat_vec = self.category_embed(category)
        subcat_vec = self.subcategory_embed(subcategory)

        return torch.cat([title_vec, cat_vec, subcat_vec], dim=1)

In [None]:
class LongTermUserRepresentation(nn.Module):
    """Long-term user representation: one learned embedding per user index."""

    def __init__(self, num_users, embed_dim):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, embed_dim)

    def forward(self, user_ids, mask_prob=0.0):
        """Look up the embeddings of ``user_ids``.

        In training mode with ``mask_prob > 0`` each user's whole vector is
        zeroed with probability ``mask_prob``; otherwise the embeddings are
        returned unchanged.
        """
        user_vec = self.user_embed(user_ids)

        if not (self.training and mask_prob > 0):
            return user_vec

        # One Bernoulli draw per leading index, broadcast over the embedding dim.
        keep_prob = torch.full_like(user_vec[:, 0:1], 1 - mask_prob)
        return user_vec * torch.bernoulli(keep_prob)

class ShortTermUserRepresentation(nn.Module):
    """Short-term user representation: final hidden state of a batch-first GRU."""

    def __init__(self, news_dim, hidden_dim):
        super().__init__()
        self.gru = nn.GRU(news_dim, hidden_dim, batch_first=True)

    def forward(self, news_sequence, init_hidden=None):
        """Run the GRU over ``news_sequence`` and return its last hidden state.

        ``init_hidden`` is passed to the GRU as the initial hidden state. The
        leading dimension of the returned hidden state is squeezed away.
        """
        _, last_hidden = self.gru(news_sequence, init_hidden)
        return last_hidden.squeeze(0)

In [None]:
class LSTUR(nn.Module):
    """Click-score model combining long- and short-term user representations.

    History and candidate news are encoded with a shared ``NewsEncoder``. The
    long-term representation is a per-user embedding; the short-term one is the
    final GRU state over the encoded history. They are combined by:

    * ``'concatenation'``: user vector = [long-term, short-term]
      (``final_dim = 2 * hidden_dim``);
    * ``'initialization'``: the long-term vector initialises the GRU and the
      user vector is the GRU's final state (``final_dim = hidden_dim``).

    The score is the dot product of the user vector with the first
    ``final_dim`` components of the candidate news vector.

    Raises:
        ValueError: if ``combination_method`` is not one of the two supported
            values, or if ``final_dim`` exceeds the news vector width (both
            cases would otherwise fail with a shape error inside ``forward``).
    """

    _COMBINATION_METHODS = ('concatenation', 'initialization')

    def __init__(self, vocab_size, num_categories, num_subcategories, num_users,
                 embed_dim, hidden_dim, mask_prob=0.5, combination_method='concatenation'):
        super(LSTUR, self).__init__()
        if combination_method not in self._COMBINATION_METHODS:
            raise ValueError(
                f"combination_method must be one of {self._COMBINATION_METHODS}, "
                f"got {combination_method!r}"
            )

        self.news_encoder = NewsEncoder(vocab_size, num_categories, num_subcategories, embed_dim)
        self.long_term = LongTermUserRepresentation(num_users, hidden_dim)
        self.short_term = ShortTermUserRepresentation(self.news_encoder.final_dim, hidden_dim)
        self.mask_prob = mask_prob
        self.combination_method = combination_method

        if combination_method == 'concatenation':
            self.final_dim = hidden_dim * 2
        else:
            self.final_dim = hidden_dim

        # forward() slices the candidate vector down to final_dim columns; that
        # only lines up with the user vector if the news vector is wide enough.
        if self.final_dim > self.news_encoder.final_dim:
            raise ValueError(
                f"user vector width ({self.final_dim}) exceeds news vector width "
                f"({self.news_encoder.final_dim}); lower hidden_dim or raise embed_dim"
            )

    @staticmethod
    def _flatten_side_info(values, count):
        """Return ``values`` as a 1-D tensor with ``count`` entries.

        Accepts either one id per history slot (any shape with ``count``
        elements) or a single id that is broadcast to every slot.
        """
        if values.numel() == count:
            return values.reshape(-1)
        if values.numel() == 1:
            return values.reshape(1).expand(count)
        raise ValueError(
            f"expected {count} history ids (or a single id), got {values.numel()}"
        )

    def forward(self, hist_titles, hist_categories, hist_subcategories,
                cand_title, cand_category, cand_subcategory, user_ids):
        """Return one click logit per example (1-D tensor).

        ``hist_titles`` is 3-D (unpacked as batch, history length, title
        length). ``hist_categories`` / ``hist_subcategories`` supply one id per
        history slot, or a single id to broadcast. The candidate arguments and
        ``user_ids`` hold one entry per example.
        """
        batch_size, hist_len, title_len = hist_titles.shape
        num_slots = batch_size * hist_len

        # Encode all history items in one pass by folding history into the batch.
        hist_flat_titles = hist_titles.reshape(-1, title_len)
        # Use the ids supplied by the caller rather than discarding them;
        # all-zero inputs reproduce the previous hard-coded zeros exactly.
        hist_flat_cats = self._flatten_side_info(hist_categories, num_slots)
        hist_flat_subcats = self._flatten_side_info(hist_subcategories, num_slots)

        hist_news_vecs = self.news_encoder(hist_flat_titles, hist_flat_cats, hist_flat_subcats)
        hist_news_vecs = hist_news_vecs.view(batch_size, hist_len, -1)

        # Long-term representation (randomly masked during training).
        long_term_vec = self.long_term(user_ids, self.mask_prob)

        # Short-term user representation
        if self.combination_method == 'initialization':
            # The GRU expects its initial state with a leading layer dimension.
            init_hidden = long_term_vec.unsqueeze(0)
            user_vec = self.short_term(hist_news_vecs, init_hidden)
        else:
            short_term_vec = self.short_term(hist_news_vecs)
            user_vec = torch.cat([long_term_vec, short_term_vec], dim=1)

        # Encode candidate news
        cand_vec = self.news_encoder(cand_title, cand_category, cand_subcategory)

        # Compute click score.
        # NOTE(review): the candidate vector is truncated to final_dim columns
        # to match the user vector, so its trailing components never reach the
        # score - confirm this is intended rather than a projection layer.
        score = torch.sum(user_vec * cand_vec[:, :self.final_dim], dim=1)
        return score

In [None]:
# ---- Hyper-parameters ----
max_hist_title = 50   # history slots kept per example
batch_size = 128
embed_dim = 300
hidden_dim = 300
lr = 0.001
epochs = 3
mask_prob = 0.5       # probability of masking the long-term user vector in training
seed = 42

# Seed torch's global RNG before any stochastic step (weight initialisation,
# DataLoader shuffling, user masking) so runs are repeatable.
torch.manual_seed(seed)

train_dataset = MINDDatasetLSTUR(data, news2idx, word2idx, user2idx, max_hist_title, max_size_title)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_lstur)

val_dataset = MINDDatasetLSTUR(val_data, news2idx, word2idx, user2idx, max_hist_title, max_size_title)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_lstur)

# Initialize model
model = LSTUR(vocab_size, num_categories, num_subcategories, num_users,
              embed_dim, hidden_dim, mask_prob, combination_method='concatenation')
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The model outputs raw logits, hence the with-logits loss.
criterion = nn.BCEWithLogitsLoss()

In [None]:

from sklearn.metrics import roc_auc_score

def ndcg_score(labels, scores, k=5):
    """Normalised discounted cumulative gain at cut-off ``k``.

    Args:
        labels: relevance labels (binary or graded, non-negative), one per item.
        scores: predicted scores, one per item; higher means ranked earlier.
        k: number of top-ranked positions to evaluate.

    Returns:
        DCG@k of the predicted ranking divided by DCG@k of the ideal ranking,
        using the gain ``2**label - 1`` and the discount ``1 / log2(rank + 1)``.
        Returns 0.0 when there are no items or no relevant items.
    """
    labels = np.asarray(labels, dtype=float)
    top = max(0, min(k, labels.size))
    if top == 0:
        return 0.0

    # Discount for ranks 1..top.
    discounts = 1.0 / np.log2(np.arange(2, top + 2))

    ranked = labels[np.argsort(scores)[::-1][:top]]
    # The ideal ranking sorts the labels themselves, so graded labels are
    # normalised by the same gain function used for the DCG.
    ideal = np.sort(labels)[::-1][:top]

    dcg = float(np.sum((2.0 ** ranked - 1.0) * discounts))
    idcg = float(np.sum((2.0 ** ideal - 1.0) * discounts))
    return dcg / idcg if idcg > 0 else 0.0

def mrr_score(labels, scores):
    """Reciprocal rank of the highest-scored item whose label equals 1.

    Items are ranked by descending score; returns 0.0 when no label equals 1.
    """
    ranked_labels = np.array(labels)[np.argsort(scores)[::-1]]
    hit_positions = np.flatnonzero(ranked_labels == 1)
    if hit_positions.size == 0:
        return 0.0
    return 1.0 / (int(hit_positions[0]) + 1)

In [None]:
from sklearn.metrics import roc_auc_score

def ndcg_score(labels, scores, k=5):
    """Normalised discounted cumulative gain at cut-off ``k``.

    Args:
        labels: relevance labels (binary or graded, non-negative), one per item.
        scores: predicted scores, one per item; higher means ranked earlier.
        k: number of top-ranked positions to evaluate.

    Returns:
        DCG@k of the predicted ranking divided by DCG@k of the ideal ranking,
        using the gain ``2**label - 1`` and the discount ``1 / log2(rank + 1)``.
        Returns 0.0 when there are no items or no relevant items.
    """
    labels = np.asarray(labels, dtype=float)
    top = max(0, min(k, labels.size))
    if top == 0:
        return 0.0

    # Discount for ranks 1..top.
    discounts = 1.0 / np.log2(np.arange(2, top + 2))

    ranked = labels[np.argsort(scores)[::-1][:top]]
    # The ideal ranking sorts the labels themselves, so graded labels are
    # normalised by the same gain function used for the DCG.
    ideal = np.sort(labels)[::-1][:top]

    dcg = float(np.sum((2.0 ** ranked - 1.0) * discounts))
    idcg = float(np.sum((2.0 ** ideal - 1.0) * discounts))
    return dcg / idcg if idcg > 0 else 0.0

def mrr_score(labels, scores):
    """Reciprocal rank of the highest-scored item whose label equals 1.

    Items are ranked by descending score; returns 0.0 when no label equals 1.
    """
    ranked_labels = np.array(labels)[np.argsort(scores)[::-1]]
    hit_positions = np.flatnonzero(ranked_labels == 1)
    if hit_positions.size == 0:
        return 0.0
    return 1.0 / (int(hit_positions[0]) + 1)

# Training loop
for epoch in range(1, epochs + 1):
    # ---- Optimisation pass over the training set ----
    model.train()
    total_loss = 0.0

    for (hist_batch, cand_batch, cand_cat_batch, cand_subcat_batch,
         user_batch, label_batch, _) in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}"):
        # History news carry no category information in the dataset, so
        # all-zero placeholder ids are passed for them.
        placeholder_shape = (hist_batch.shape[0], hist_batch.shape[1])
        hist_cat_batch = torch.zeros(placeholder_shape, dtype=torch.long, device=device)
        hist_subcat_batch = torch.zeros(placeholder_shape, dtype=torch.long, device=device)

        optimizer.zero_grad()
        scores = model(hist_batch, hist_cat_batch, hist_subcat_batch,
                       cand_batch, cand_cat_batch, cand_subcat_batch, user_batch)
        # The model returns one logit per example; flatten the labels to match.
        loss = criterion(scores, label_batch.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} - Average Loss: {avg_loss:.4f}")

    # ---- Validation ----
    if val_loader is None:
        continue

    model.eval()
    all_preds = {}          # impression id -> list of (score, label)
    all_scores_flat = []
    all_labels_flat = []

    with torch.no_grad():
        for (hist_batch, cand_batch, cand_cat_batch, cand_subcat_batch,
             user_batch, label_batch, impr_batch) in val_loader:
            placeholder_shape = (hist_batch.shape[0], hist_batch.shape[1])
            hist_cat_batch = torch.zeros(placeholder_shape, dtype=torch.long, device=device)
            hist_subcat_batch = torch.zeros(placeholder_shape, dtype=torch.long, device=device)

            logits = model(hist_batch, hist_cat_batch, hist_subcat_batch,
                           cand_batch, cand_cat_batch, cand_subcat_batch, user_batch)
            scores = logits.cpu().numpy()
            labels = label_batch.cpu().numpy().flatten()

            all_scores_flat.extend(scores)
            all_labels_flat.extend(labels)

            # Group predictions by impression for the ranking metrics.
            for impr_id, s, l in zip(impr_batch, scores, labels):
                all_preds.setdefault(impr_id, []).append((s, l))

    # Calculate validation metrics
    correct, total = 0, 0
    ndcg5_list = []
    ndcg10_list = []
    mrr_list = []

    for recs in all_preds.values():
        scores, labels = zip(*recs)
        # A logit of at least 0 counts as a predicted click.
        correct += sum(int(int(s >= 0.0) == l) for s, l in recs)
        total += len(labels)
        ndcg5_list.append(ndcg_score(labels, scores, k=5))
        ndcg10_list.append(ndcg_score(labels, scores, k=10))
        mrr_list.append(mrr_score(labels, scores))

    # Calculate overall metrics
    acc = correct / total if total > 0 else 0
    # AUC is undefined when only one class is present.
    auc = roc_auc_score(all_labels_flat, all_scores_flat) if len(set(all_labels_flat)) > 1 else 0
    ndcg5 = np.mean(ndcg5_list) if ndcg5_list else 0
    ndcg10 = np.mean(ndcg10_list) if ndcg10_list else 0
    mrr = np.mean(mrr_list) if mrr_list else 0

    print(f"Validation - Accuracy: {acc:.4f}, AUC: {auc:.4f}, MRR: {mrr:.4f}, nDCG@5: {ndcg5:.4f}, nDCG@10: {ndcg10:.4f}")

print("Training completed!")