# Neural Collaborative Filtering Tutorial

This notebook demonstrates how to train and use Neural Collaborative Filtering models for movie recommendations.

## Overview
- **Neural Collaborative Filtering (NCF)**: Deep learning approach combining Generalized Matrix Factorization (GMF) and a Multi-Layer Perceptron (MLP)
- **Dataset**: MovieLens 32M with 32M ratings of ~87K movies from ~200K users
- **Goal**: Learn complex user-item interaction patterns

In [None]:
# Install requirements
!pip install torch pandas scikit-learn numpy matplotlib seaborn tqdm

In [None]:
import sys
import os
sys.path.append('../src')

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from model import NeuralCollaborativeFiltering, SimpleNCF
from data_loader import NCFDataLoader
from trainer import NCFTrainer
from inference import NCFInference

# Reproducibility: seed the global NumPy and PyTorch generators so that
# anything downstream drawing from them (e.g. weight initialisation) is
# repeatable after "Restart Kernel -> Run All".
# NOTE: this does not by itself guarantee bit-identical results on CUDA.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Data Loading and Exploration

In [None]:
# Load MovieLens data (adjust paths as needed)
ratings_path = '../../ml-32m/ratings.csv'
movies_path = '../../ml-32m/movies.csv'

# Fail fast when the ratings file is missing: nothing below can run without
# it, and raising here keeps the download instructions in the error itself
# instead of printing them and then crashing inside pd.read_csv.
if not os.path.exists(ratings_path):
    raise FileNotFoundError(
        f"Please download MovieLens 32M dataset and place ratings.csv at {ratings_path}\n"
        "Download from: https://files.grouplens.org/datasets/movielens/ml-32m.zip"
    )
print(f"Found ratings file: {ratings_path}")

# Load data
ratings_df = pd.read_csv(ratings_path)
# movies.csv is optional: later cells check `movies_df is not None` and fall
# back to printing raw movie IDs when titles are unavailable.
movies_df = pd.read_csv(movies_path) if os.path.exists(movies_path) else None

print(f"Ratings shape: {ratings_df.shape}")
print(f"Movies shape: {movies_df.shape if movies_df is not None else 'Not loaded'}")

# Display sample data
ratings_df.head()

In [None]:
# Data exploration: headline statistics, then three distribution plots
print("Dataset Statistics:")
print(f"Total ratings: {len(ratings_df):,}")
print(f"Unique users: {ratings_df['userId'].nunique():,}")
print(f"Unique movies: {ratings_df['movieId'].nunique():,}")
print(f"Rating range: {ratings_df['rating'].min()} - {ratings_df['rating'].max()}")
print(f"Average rating: {ratings_df['rating'].mean():.2f}")

# One row of three panels, each addressed through its own Axes object
fig, (ax_rating, ax_user, ax_movie) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: how often each rating value occurs
ratings_df['rating'].hist(bins=20, alpha=0.7, ax=ax_rating)
ax_rating.set_title('Rating Distribution')
ax_rating.set_xlabel('Rating')
ax_rating.set_ylabel('Count')

# Panels 2 and 3: ratings per user / per movie, on a log-scaled count axis
user_counts = ratings_df['userId'].value_counts()
movie_counts = ratings_df['movieId'].value_counts()
count_panels = (
    (ax_user, user_counts, 'User Activity Distribution', 'Number of Ratings per User'),
    (ax_movie, movie_counts, 'Movie Popularity Distribution', 'Number of Ratings per Movie'),
)
for panel_ax, counts, panel_title, panel_xlabel in count_panels:
    panel_ax.hist(counts, bins=50, alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Count')
    panel_ax.set_yscale('log')

fig.tight_layout()
plt.show()

## 2. Data Preprocessing

In [None]:
# Minimum-activity thresholds handed to the loader.
# Presumably users/movies with fewer ratings than this are dropped during
# preprocessing -- confirm against NCFDataLoader.
MIN_RATINGS_PER_USER = 20
MIN_RATINGS_PER_ITEM = 20

# Create data loader with preprocessing
data_loader = NCFDataLoader(
    ratings_path=ratings_path,
    movies_path=movies_path,
    min_ratings_per_user=MIN_RATINGS_PER_USER,
    min_ratings_per_item=MIN_RATINGS_PER_ITEM,
)

# Report how many ratings the loader holds after construction
filtered_rating_count = len(data_loader.ratings_df)
print("Data preprocessing completed!")
print(f"Filtered to {filtered_rating_count} ratings")

In [None]:
# Create train/val/test splits (the loader returns them in that order)
split_loaders = data_loader.get_data_loaders(
    batch_size=1024,
    num_workers=2,  # Adjust based on your system
)
train_loader, val_loader, test_loader = split_loaders

# Report the number of batches in each split
split_labels = ("Training", "Validation", "Test")
for split_label, split_loader in zip(split_labels, split_loaders):
    print(f"{split_label} batches: {len(split_loader)}")

# Get model configuration (read below via the 'num_users' / 'num_items' keys)
model_config = data_loader.get_model_config()
print(f"\nModel configuration: {model_config}")

## 3. Model Training

In [None]:
# Pick the compute device: GPU when CUDA is usable, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Start with SimpleNCF for faster training; the user/item counts come from
# the configuration produced by the data loader.
model = SimpleNCF(
    num_users=model_config['num_users'],
    num_items=model_config['num_items'],
    embedding_dim=64,
    hidden_dim=128,
    dropout=0.2,
)

# Total number of scalar parameters across all tensors of the model
simple_param_count = sum(tensor.numel() for tensor in model.parameters())
print(f"Model parameters: {simple_param_count:,}")
print(model)

In [None]:
# Training hyperparameters, gathered in one place for easy tuning
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5
EPOCHS = 10       # Increase for better performance
PATIENCE = 5      # presumably early-stopping patience -- confirm in NCFTrainer.train
MODEL_DIR = '../models'

# Create trainer
trainer = NCFTrainer(
    model=model,
    device=device,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Train model (adjust EPOCHS based on your time constraints)
print("Starting training...")
history = trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    patience=PATIENCE,
    save_dir=MODEL_DIR,
)

best_val_loss = trainer.best_val_loss
print(f"Training completed! Best validation loss: {best_val_loss:.4f}")

In [None]:
# Plot training history: loss curves, validation RMSE, validation MAE
fig, (ax_loss, ax_rmse, ax_mae) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: train vs. validation loss on the same axes
ax_loss.plot(history['train_losses'], label='Train Loss')
ax_loss.plot(history['val_losses'], label='Val Loss')
ax_loss.set_title('Training and Validation Loss')
ax_loss.set_ylabel('Loss')

# Panels 2 and 3: one validation metric each
metric_panels = (
    (ax_rmse, 'val_rmses', 'RMSE', 'orange'),
    (ax_mae, 'val_maes', 'MAE', 'green'),
)
for metric_ax, history_key, metric_label, line_color in metric_panels:
    metric_ax.plot(history[history_key], label=metric_label, color=line_color)
    metric_ax.set_title(f'Validation {metric_label}')
    metric_ax.set_ylabel(metric_label)

# Decoration shared by all three panels
for panel_ax in (ax_loss, ax_rmse, ax_mae):
    panel_ax.set_xlabel('Epoch')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 4. Model Evaluation

In [None]:
# Evaluate on test set; the result is iterated as a metric-name -> value mapping
test_metrics = trainer.evaluate(test_loader)

print("Test Set Results:")
for metric_name in test_metrics:
    metric_value = test_metrics[metric_name]
    print(f"  {metric_name}: {metric_value:.4f}")

# Persist the encoders next to the model checkpoint.
# NOTE(review): this cell only writes the encoders; the model file is
# presumably written by trainer.train into its save_dir -- confirm.
encoders_file = '../models/encoders.pkl'
data_loader.save_encoders(encoders_file)
print("\nModel and encoders saved!")

## 5. Making Recommendations

In [None]:
# Create inference object from the artifacts on disk.
# - model_path: checkpoint expected under ../models (the save_dir passed to
#   trainer.train); presumably written there during training -- confirm the
#   file name 'best_model.pt' against NCFTrainer.
# - encoders_path: the file written by data_loader.save_encoders in the
#   evaluation cell.
# - device: the same 'cuda'/'cpu' string chosen when the model was created.
inference = NCFInference(
    model_path='../models/best_model.pt',
    encoders_path='../models/encoders.pkl',
    device=device
)

print("Inference object created!")

In [None]:
# Get a sample user for demonstration: the user of the first row in the
# loader's ratings frame
loader_ratings = data_loader.ratings_df
sample_user = loader_ratings['userId'].iloc[0]
print(f"Getting recommendations for user {sample_user}")

# Get user's rating history
is_sample_user = loader_ratings['userId'] == sample_user
user_history = loader_ratings[is_sample_user]
print(f"User has {len(user_history)} ratings")

# Show some of user's highly rated movies (rating of 4.0 or more, best first)
is_high_rating = user_history['rating'] >= 4.0
high_rated = user_history[is_high_rating].sort_values('rating', ascending=False)
if movies_df is not None:
    # Attach titles by joining on movieId
    high_rated_with_titles = high_rated.merge(movies_df, on='movieId')
    print("\nUser's highly rated movies:")
    top_five = high_rated_with_titles.head(5)
    for _, row in top_five.iterrows():
        print(f"  {row['title']}: {row['rating']}/5.0")

# Get seen items to exclude from recommendations
seen_items = user_history['movieId'].tolist()

In [None]:
# Get recommendations, excluding movies the user has already rated
recommendations = inference.get_user_recommendations(
    user_id=sample_user,
    top_k=10,
    exclude_seen=True,
    seen_items=seen_items,
)

print(f"\nTop 10 recommendations for user {sample_user}:")
for rank, (movie_id, score) in enumerate(recommendations, start=1):
    # Line used whenever no title can be resolved for this movie
    fallback_line = f"  {rank}. Movie ID {movie_id} (Score: {score:.3f})"

    if movies_df is None:
        print(fallback_line)
        continue

    movie_info = movies_df[movies_df['movieId'] == movie_id]
    if movie_info.empty:
        print(fallback_line)
        continue

    # Use the first matching metadata row for title and genres
    first_match = movie_info.iloc[0]
    title, genres = first_match['title'], first_match['genres']
    print(f"  {rank}. {title} (Score: {score:.3f})")
    print(f"     Genres: {genres}")

In [None]:
# Test rating prediction on the first three recommended movies
test_pairs = []
for rec in recommendations[:3]:
    test_pairs.append((sample_user, rec[0]))

print(f"\nRating predictions for user {sample_user}:")
for user_id, movie_id in test_pairs:
    predicted_rating = inference.predict_rating(user_id, movie_id)

    # Default label; replaced by the title when movie metadata is available
    display_name = f"Movie {movie_id}"
    if movies_df is not None:
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        if not movie_info.empty:
            display_name = movie_info.iloc[0]['title']

    print(f"  {display_name}: {predicted_rating:.2f}/5.0")

## 6. Model Analysis

In [None]:
# Analyze a popular movie - find similar items (needs movie metadata for titles)
if movies_df is not None:
    # Most-rated movie: value_counts sorts descending, so take the first index
    movie_popularity = data_loader.ratings_df['movieId'].value_counts()
    popular_movie = movie_popularity.index[0]

    movie_info = movies_df.loc[movies_df['movieId'] == popular_movie]
    if not movie_info.empty:
        title = movie_info.iloc[0]['title']
        print(f"Finding movies similar to: {title}")

        similar_items = inference.find_similar_items(popular_movie, top_k=5)

        print("\nSimilar movies:")
        for movie_id, similarity in similar_items:
            similar_info = movies_df.loc[movies_df['movieId'] == movie_id]
            # Entries without metadata are skipped silently
            if similar_info.empty:
                continue
            similar_row = similar_info.iloc[0]
            similar_title = similar_row['title']
            genres = similar_row['genres']
            print(f"  {similar_title} (Similarity: {similarity:.3f})")
            print(f"    Genres: {genres}")

## 7. Comparison with Full NCF Model

In [None]:
def count_parameters(module):
    """Return the total number of scalar parameters held by `module`."""
    return sum(tensor.numel() for tensor in module.parameters())


# For comparison, let's create a full NCF model
# (You can train this for better performance)
full_ncf_model = NeuralCollaborativeFiltering(
    num_users=model_config['num_users'],
    num_items=model_config['num_items'],
    embedding_dim=64,
    hidden_layers=[128, 64],
    dropout=0.2,
)

# Count each model's parameters once and reuse the totals for the ratio
full_param_total = count_parameters(full_ncf_model)
simple_param_total = count_parameters(model)

print(f"Full NCF model parameters: {full_param_total:,}")
print(f"Simple NCF model parameters: {simple_param_total:,}")
print(f"\nFull NCF has {full_param_total / simple_param_total:.1f}x more parameters")

## 8. Next Steps

To improve the model further:

1. **Train longer**: Increase epochs to 50+ for better convergence
2. **Try different architectures**: 
   - Full NCF model with GMF + MLP
   - DeepNCF with genre information
3. **Hyperparameter tuning**: Experiment with embedding dimensions, learning rates
4. **Data augmentation**: Add negative sampling, use implicit feedback
5. **Ensemble methods**: Combine multiple models for better performance

## Summary

This tutorial showed:
- ✅ Data loading and preprocessing for NCF
- ✅ Training a Simple NCF model
- ✅ Evaluating model performance
- ✅ Making personalized recommendations
- ✅ Finding similar items
- ✅ Model analysis and comparison

The Neural Collaborative Filtering approach provides a powerful foundation for learning complex user-item interaction patterns that traditional collaborative filtering might miss.