# MovieLens Recommendation System Experiments

This notebook compares sequential (Surprise SVD) and distributed (Spark ALS) recommendation approaches on the MovieLens dataset.

## Objectives
1. Load and preprocess the MovieLens dataset
2. Implement a sequential baseline using Surprise SVD
3. Implement a distributed model using Spark ALS
4. Compare performance metrics and scalability
5. Visualize results and generate insights


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
import os
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Surprise library for collaborative filtering
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise import accuracy

# PySpark for distributed computing
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, mean, stddev
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# Set up plotting style: apply matplotlib's 'seaborn-v0_8' style sheet, then
# switch seaborn's colour palette to "husl".
# NOTE(review): the palette is set after the style sheet, presumably so the
# style sheet does not override it - keep this order unless confirmed otherwise.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Reaching this line means every import above succeeded.
print("All libraries imported successfully!")


## 1. Dataset Loading and Exploration


In [None]:
def load_movielens_data(data_path='./Dataset/'):
    """
    Read the four MovieLens CSV files located under ``data_path``.

    The files ``ratings.csv``, ``movies.csv``, ``tags.csv`` and ``links.csv``
    are read in that order with ``pd.read_csv``.

    Returns:
        Tuple ``(ratings, movies, tags, links)`` of pandas DataFrames.
    """
    print("Loading MovieLens dataset...")

    # One reader call per file; unpacking keeps the documented return order.
    file_names = ('ratings.csv', 'movies.csv', 'tags.csv', 'links.csv')
    ratings, movies, tags, links = (
        pd.read_csv(os.path.join(data_path, file_name))
        for file_name in file_names
    )

    return ratings, movies, tags, links

def print_dataset_statistics(ratings, movies, tags, links):
    """
    Print a summary of the four MovieLens tables to stdout.

    Reports row counts for every table, the number of distinct users, movies
    and tags, the range / mean / standard deviation of the ratings, the average
    number of ratings per user and per movie, and how often each rating value
    occurs. Nothing is returned.
    """
    print("\n=== DATASET STATISTICS ===")
    for label, table in (("Ratings", ratings), ("Movies", movies),
                         ("Tags", tags), ("Links", links)):
        print(f"{label}: {len(table):,} rows")

    n_users = ratings['userId'].nunique()
    print(f"\nUnique users: {n_users:,}")
    n_movies = ratings['movieId'].nunique()
    print(f"Unique movies: {n_movies:,}")
    # The 'tag' column is only touched when the tags table has rows.
    if len(tags) > 0:
        n_tags = tags['tag'].nunique()
    else:
        n_tags = 0
    print(f"Unique tags: {n_tags:,}")

    lowest = ratings['rating'].min()
    highest = ratings['rating'].max()
    print(f"\nRating range: {lowest:.1f} - {highest:.1f}")
    print(f"Average rating: {ratings['rating'].mean():.2f}")
    print(f"Rating std: {ratings['rating'].std():.2f}")

    n_ratings = len(ratings)
    print(f"\nAverage ratings per user: {n_ratings / n_users:.1f}")
    print(f"Average ratings per movie: {n_ratings / n_movies:.1f}")

    # Frequency of each rating value, listed from lowest to highest value
    print("\nRating distribution:")
    distribution = ratings['rating'].value_counts().sort_index()
    for value, occurrences in distribution.items():
        share = occurrences / n_ratings * 100
        print(f"  {value}: {occurrences:,} ({share:.1f}%)")

# Load the data from the default './Dataset/' folder and print its summary
# statistics. The four DataFrames stay bound at module level; `ratings` is
# the input of the train/test split performed further down.
ratings, movies, tags, links = load_movielens_data()
print_dataset_statistics(ratings, movies, tags, links)


## 2. Data Preprocessing and Splitting


In [None]:
def stratified_train_test_split(ratings, test_size=0.2, min_ratings=5):
    """
    Split ratings into train/test sets with a per-user chronological hold-out.

    Users with fewer than ``min_ratings`` ratings are dropped. For each
    remaining user the ratings are ordered by ``timestamp`` and the most recent
    ``max(1, int(n * test_size))`` of them go to the test set; the earlier ones
    go to the train set. Despite the function name, the split is temporal per
    user and is not stratified on the rating value.

    Args:
        ratings: DataFrame with at least ``userId`` and ``timestamp`` columns.
        test_size: Fraction of each user's ratings held out for testing.
        min_ratings: Minimum number of ratings a user needs to be kept.

    Returns:
        Tuple ``(train_df, test_df)``, each with a fresh 0..n-1 index. When no
        user has at least ``min_ratings`` ratings, both are empty DataFrames
        with the same columns as ``ratings``.
    """
    # round() instead of int(): 0.29 * 100 == 28.999999999999996 would be
    # truncated to 28 and the banner would misreport the requested ratio.
    train_pct = round((1 - test_size) * 100)
    test_pct = round(test_size * 100)
    print(f"\nPerforming stratified {train_pct}/{test_pct} train/test split...")

    # Filter users with sufficient ratings
    user_counts = ratings['userId'].value_counts()
    valid_users = user_counts[user_counts >= min_ratings].index
    filtered_ratings = ratings[ratings['userId'].isin(valid_users)].copy()

    print(f"Filtered to {len(valid_users):,} users with >= {min_ratings} ratings")
    print(f"Remaining ratings: {len(filtered_ratings):,}")

    train_data = []
    test_data = []

    if not filtered_ratings.empty:
        # Group once so each user's rows are fetched by lookup instead of
        # re-scanning the whole frame with a boolean mask for every user.
        ratings_by_user = filtered_ratings.groupby('userId', sort=False)

        # Iterate valid_users (not the groupby) so the output keeps the same
        # user order as before.
        for user_id in tqdm(valid_users, desc="Splitting users"):
            # Sort by timestamp so the newest ratings end up in the test set
            user_ratings = ratings_by_user.get_group(user_id).sort_values('timestamp')

            # Always hold out at least one rating per user
            n_test = max(1, int(len(user_ratings) * test_size))

            train_data.append(user_ratings[:-n_test])
            test_data.append(user_ratings[-n_test:])

    if train_data:
        train_df = pd.concat(train_data, ignore_index=True)
        test_df = pd.concat(test_data, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; return empty frames
        # that still carry the input columns instead of crashing.
        train_df = filtered_ratings.reset_index(drop=True)
        test_df = train_df.copy()

    total = len(train_df) + len(test_df)
    print(f"\nTrain set: {len(train_df):,} ratings")
    print(f"Test set: {len(test_df):,} ratings")
    if total:
        # Skipped for an empty result to avoid dividing by zero
        print(f"Split ratio: {len(train_df)/total*100:.1f}% / {len(test_df)/total*100:.1f}%")

    return train_df, test_df

def create_sample_split(train_df, test_df, sample_size=10000):
    """
    Build a reduced train/test pair for quick experiments.

    Draws up to ``sample_size`` rows from ``train_df`` with a fixed seed (42)
    and keeps the rows of ``test_df`` whose ``userId`` occurs in that draw.

    Returns:
        Tuple ``(sample_train, sample_test)`` of DataFrames.
    """
    print(f"\nCreating sample split with {sample_size:,} training ratings...")

    # Individual ratings (rows) are drawn, capped at the size of the train set
    available = len(train_df)
    n_rows = available if available < sample_size else sample_size
    sample_train = train_df.sample(n=n_rows, random_state=42)

    # Restrict the test set to users that appear in the sampled train rows
    sample_users = sample_train['userId'].unique()
    in_sample = test_df['userId'].isin(sample_users)
    sample_test = test_df[in_sample]

    print(f"Sample train: {len(sample_train):,} ratings, {len(sample_users):,} users")
    print(f"Sample test: {len(sample_test):,} ratings")

    return sample_train, sample_test

# Perform the splits: the full per-user chronological split first, then a
# small sample of it for quick tests.
train_df, test_df = stratified_train_test_split(ratings)
sample_train, sample_test = create_sample_split(train_df, test_df)

# Save splits to CSV files
print("\nSaving splits to CSV files...")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('./raw', exist_ok=True)
train_df.to_csv('./raw/train.csv', index=False)
test_df.to_csv('./raw/test.csv', index=False)
sample_train.to_csv('./raw/sample_train.csv', index=False)
sample_test.to_csv('./raw/sample_test.csv', index=False)
print("Splits saved successfully!")
