# MIND News Recommendation System

## Architecture Overview:
1. **News Encoder:** BERT/RoBERTa + Attention Pooling (Title → Vector)
2. **User Encoder:** Attention Pooling over history (History → User Vector)
3. **Prediction:** Dot product similarity + Softmax
4. **Training:** Negative Sampling with Cross-Entropy Loss

## Pipeline:
```
User History → News Encoder → User Encoder → User Vector
                                                  ↓
Candidate News → News Encoder → Candidate Vectors → Dot Product → Click Probability
```


## Import Libraries


In [None]:
%pip install --quiet -r requirements.txt

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from src import *

# Plot appearance: use seaborn's "whitegrid" theme and set matplotlib's
# default figure resolution to 100 dpi.
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

# Reaching this line means none of the imports above raised.
print("All libraries imported successfully!")


## Configuration


In [None]:
# Print the run configuration. print_config is not defined in this notebook;
# presumably it is provided by the wildcard `from src import *` — confirm in src.
print_config()

## Download dataset from Kaggle

In [None]:
# download_dataset()

## News


### Load and Explore News Data


In [None]:
# Load the news articles table. load_news_data is not defined in this
# notebook; presumably it comes from `from src import *` and returns a
# pandas DataFrame (the result is used via .head() below) — confirm in src.
news_df = load_news_data()

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\nNews Articles Samples:")
# Keep this as the final expression of the cell so the notebook renders
# the first 10 rows as the cell output.
news_df.head(10)

### Tokenize News Articles


In [None]:
# Tokenize the news table once up front. tokenize_news is not defined in this
# notebook (presumably from `from src import *`). It returns two lookups that
# NewsDataset later indexes by news id:
#   news_features   -> per-article entry exposing 'input_ids' and 'attention_mask'
#   news_categories -> per-article category value
# news_features is expected to also hold a '<PAD>' entry (NewsDataset reads
# news_features['<PAD>'] as its fallback) — TODO confirm in src.
print(f"Initializing tokenizer: {config['MODEL_NAME']}...")
news_features, news_categories = tokenize_news(news_df)
print(f"Tokenized {len(news_features):,} news articles (including <PAD>)")


## Create Training Samples


In [None]:
# Announce the sampling strategy; the number of negatives per positive is
# read from config['NEG_SAMPLES'].
print("Creating training samples...")
print(f"Strategy: 1 positive + {config['NEG_SAMPLES']} negative samples per training example")

# load_behaviors_data / create_behavior_samples are not defined in this
# notebook (presumably from `from src import *`). The loader returns the
# train and validation behavior tables as a pair.
train_behaviors_df, val_behaviors_df = load_behaviors_data()

# The second argument selects the split ('train' or 'val'). How the two
# splits differ in sample layout is decided inside create_behavior_samples
# — TODO confirm in src.
train_samples = create_behavior_samples(train_behaviors_df, 'train')
val_samples = create_behavior_samples(val_behaviors_df, 'val')

# Report how many samples each split produced and which file it came from.
print(f"\nCreated {len(train_samples):,} training samples from {config['BEHAVIORS_TRAIN_PATH']}")
print(f"Created {len(val_samples):,} validation samples from {config['BEHAVIORS_VAL_PATH']}")

## Create PyTorch Dataset and DataLoader


In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that turns behavior samples into model-ready tensors.

    Every sample is a dict with 'history' and 'candidates' (lists of news
    ids) plus a 'label'. News ids are resolved through ``news_features``;
    ids that are absent there fall back to the '<PAD>' entry. Category
    tensors are added only when ``config['USE_CATEGORY_ATTENTION']`` is
    truthy.
    """

    def __init__(self, samples, news_features, news_categories, config):
        """Keep references to the samples, lookup tables and configuration.

        Args:
            samples: Indexable collection of sample dicts.
            news_features: Mapping of news id to a dict holding 'input_ids'
                and 'attention_mask' tensors; must contain a '<PAD>' key.
            news_categories: Mapping of news id to a category index.
            config: Mapping providing 'MAX_HISTORY_LEN' and, optionally,
                'USE_CATEGORY_ATTENTION' (treated as False when absent).
        """
        self.samples = samples
        self.news_features = news_features
        self.news_categories = news_categories
        self.config = config
        self.use_category = config.get('USE_CATEGORY_ATTENTION', False)

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def _gather_feature(self, news_ids, field):
        """Concatenate the ``field`` tensor of every id along dimension 0.

        Unknown ids are replaced by the '<PAD>' entry of ``news_features``.
        """
        pieces = []
        for news_id in news_ids:
            entry = self.news_features.get(news_id, self.news_features['<PAD>'])
            pieces.append(entry[field])
        return torch.cat(pieces)

    def _gather_categories(self, news_ids):
        """Return a long tensor of category indices (0 for unknown ids)."""
        indices = [self.news_categories.get(news_id, 0) for news_id in news_ids]
        return torch.tensor(indices, dtype=torch.long)

    def __getitem__(self, idx):
        """Build the tensor dict for the sample at position ``idx``.

        Returns:
            Dict with 'history_input_ids', 'history_attn_mask',
            'candidate_input_ids', 'candidate_attn_mask' and 'label';
            'history_categories' and 'candidate_categories' are appended
            when category support is enabled.
        """
        record = self.samples[idx]

        # Right-pad short histories with '<PAD>' up to MAX_HISTORY_LEN. A new
        # list is built, so the stored sample is never mutated. Histories
        # longer than the limit pass through unchanged (no truncation here).
        clicked = record['history']
        shortfall = self.config['MAX_HISTORY_LEN'] - len(clicked)
        if shortfall > 0:
            clicked = clicked + ['<PAD>'] * shortfall

        history_tokens = self._gather_feature(clicked, 'input_ids')
        history_mask = self._gather_feature(clicked, 'attention_mask')

        # Candidates are used as-is; their count may vary between samples.
        shown = record['candidates']
        candidate_tokens = self._gather_feature(shown, 'input_ids')
        candidate_mask = self._gather_feature(shown, 'attention_mask')

        item = {
            'history_input_ids': history_tokens,
            'history_attn_mask': history_mask,
            'candidate_input_ids': candidate_tokens,
            'candidate_attn_mask': candidate_mask,
            'label': torch.tensor(record['label'], dtype=torch.long),
        }

        if not self.use_category:
            return item

        # Category ids are aligned one-to-one with the (padded) history and
        # with the candidate list.
        item['history_categories'] = self._gather_categories(clicked)
        item['candidate_categories'] = self._gather_categories(shown)
        return item

# Wrap both sample lists in NewsDataset; the two splits share the same
# tokenized features, category lookup and configuration.
print("Creating PyTorch datasets...")
shared_dataset_args = (news_features, news_categories, config)
train_dataset = NewsDataset(train_samples, *shared_dataset_args)
val_dataset = NewsDataset(val_samples, *shared_dataset_args)

print(f"Train dataset: {len(train_dataset):,} samples")
print(f"Validation dataset: {len(val_dataset):,} samples")

# Load in the main process (num_workers=0) to avoid multiprocessing issues.
shared_loader_kwargs = {'num_workers': 0}

# Training: shuffled mini-batches of config['BATCH_SIZE'] samples.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=config['BATCH_SIZE'],
    **shared_loader_kwargs,
)

# Validation: one sample per batch, in order, because the number of
# candidates varies from sample to sample.
val_loader = DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=1,
    **shared_loader_kwargs,
)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")


## Initialize Model


In [None]:
# Build the model selected by config['MODEL_TYPE'] and move it to the
# configured device. get_model is not defined in this notebook (presumably
# from `from src import *`).
model = get_model(config['MODEL_TYPE'], config).to(config['DEVICE'])

# Freeze every parameter of the news encoder's `embedding` submodule so it
# receives no gradient updates. The commented-out code below refers to it as
# pretrained RoBERTa — presumably the transformer backbone; confirm in src.
for param in model.news_encoder.embedding.parameters():
    param.requires_grad = False

# Optional experiment (disabled): re-enable gradients for the last few
# transformer layers after the blanket freeze above.
# # Unfreeze only the last N transformer layers
# num_layers_to_unfreeze = 2  # Adjust this (try 1-3)
# for layer in model.news_encoder.embedding.encoder.layer[-num_layers_to_unfreeze:]:
#     for param in layer.parameters():
#         param.requires_grad = True

# One learning rate (config['LR']) for everything. The frozen parameters are
# still passed to AdamW; they never get a gradient, so the optimizer leaves
# them untouched.
optimizer = torch.optim.AdamW(model.parameters(), lr=config['LR'])
# Optional experiment (disabled): per-group learning rates.
# optimizer = torch.optim.AdamW([
#     # Pretrained RoBERTa - lower learning rate
#     {'params': model.news_encoder.embedding.parameters(), 'lr': 1e-5},
#     # Custom attention layers - higher learning rate
#     {'params': model.news_encoder.attention_pooling.parameters(), 'lr': 1e-4},
#     {'params': model.user_attention.parameters(), 'lr': 1e-4}
# ], weight_decay=0.01)
# Cross-entropy loss; presumably applied to the per-candidate scores with the
# clicked candidate's index as the target — confirm in train_model.
criterion = nn.CrossEntropyLoss()

# Report model size; "trainable" counts only parameters with requires_grad=True,
# so it reflects the freeze applied above.
print(f"Model initialized on {config['DEVICE']}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


## Train the Model


In [None]:
# Run training. train_model is not defined in this notebook (presumably from
# `from src import *`). The results cell reads 'mind_news_rec.pth', which
# suggests this call writes a checkpoint there — confirm in src.
train_model(model, train_loader, val_loader, optimizer, criterion, config['DEVICE'])

## Training Results

In [None]:
# Load the checkpoint to get the training history.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you
# trust. The path is hard-coded and must match whatever the training step
# wrote — confirm against train_model.
checkpoint = torch.load('mind_news_rec.pth', map_location=config['DEVICE'])
history = checkpoint['history']

# Final / best metrics. The loss is the last recorded epoch; every validation
# metric is the best value over all recorded epochs. These lookups assume the
# history lists are non-empty (at least one epoch was recorded).
print("\nFinal Training Results:")
print(f"Final Training Loss: {history['train_loss'][-1]:.4f}")
print(f"Best Validation AUC: {max(history['val_auc']):.4f}")
print(f"Best Validation MRR: {max(history['val_mrr']):.4f}")
print(f"Best Validation NDCG@5: {max(history['val_ndcg@5']):.4f}")
print(f"Best Validation NDCG@10: {max(history['val_ndcg@10']):.4f}")

# Per-epoch training loss. The epoch number comes from the history itself,
# so no separate counter is needed.
print("\nLoss per Epoch:")
print("=" * 30)
for epoch, loss in zip(history['epoch'], history['train_loss']):
    print(f"Epoch {epoch:2d}: {loss:.6f}")
print("=" * 30)

# Configuration stored inside the checkpoint, printed for reference with the
# keys left-aligned in a 25-character column.
print("\nModel Configuration:")
print("=" * 50)
for key, value in checkpoint['config'].items():
    print(f"{key:25s} : {value}")
print("=" * 50)