# Exploring and Evaluating Recommender Systems on MovieLens

This notebook explores multiple recommender system approaches with increasing modeling complexity. Classical methods, including Content-Based Filtering and User-Based Collaborative Filtering, are presented as baseline techniques. More advanced models—Item-Based Collaborative Filtering, Matrix Factorization via SVD, and a neural autoencoder—are evaluated more rigorously using standard Top-N recommendation metrics.

# ETL

In [None]:
import sys
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [None]:
# Make the project root importable so that `app.etl` can be resolved when
# the notebook runs from its own sub-directory.  The membership check keeps
# sys.path free of duplicate entries when this cell is executed repeatedly.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)
from app.etl import run_etl

# Paths to the dataset files
file_paths = {
    'ratings': '../data/ml_100k/u.data',
    'movies': '../data/ml_100k/u.item'
}

# Run the ETL pipeline
# NOTE(review): presumably the result is also persisted to `save_path` --
# confirm against app.etl.run_etl.
preprocessed_data = run_etl(file_paths, save_path='../app/preprocessed_movielens.csv')

# Preview the first rows of the ETL output
print(preprocessed_data.head())

In [None]:
# Genre indicator columns of u.item that serve as content features.
# The 'unknown' flag is read from the file but is not part of this list.
genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
              'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
              'Sci-Fi', 'Thriller', 'War', 'Western']

# Column layout of u.item: metadata fields, the 'unknown' flag, then the genres
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                 'unknown'] + genre_cols

# Load the ratings file (tab-separated; column names supplied here)
ratings = pd.read_csv(
    '../data/ml_100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Load the movie metadata file (pipe-separated, latin-1 encoded, no header row)
movies = pd.read_csv(
    '../data/ml_100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=movie_columns,
)

# One row per movie, one 0/1 column per genre
genres_encoded = movies[genre_cols].copy()

# Movie-to-movie cosine similarity of the genre vectors, labelled by movie_id
similarity_matrix = cosine_similarity(genres_encoded)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=movies['movie_id'],
    columns=movies['movie_id'],
)

#### Train / Test Split (Per User)

In [None]:
def train_test_split_by_user(ratings, test_ratio=0.2, min_ratings=5, random_state=42):
    """Split ratings into train and test sets separately for every user.

    Each user's ratings are shuffled and the last ``test_ratio`` share is
    held out, so every retained user appears in the training set.

    Args:
        ratings: DataFrame with at least a ``user_id`` column.
        test_ratio: Fraction of each user's ratings placed in the test set.
        min_ratings: Users with fewer ratings than this are dropped entirely.
        random_state: Seed passed to ``DataFrame.sample`` for every user;
            the default reproduces the previously hard-coded value.

    Returns:
        Tuple ``(train, test)`` of DataFrames with the columns of ``ratings``.
        Both are empty when no user has at least ``min_ratings`` ratings.
    """
    train_parts, test_parts = [], []

    for _, user_ratings in ratings.groupby("user_id"):
        if len(user_ratings) < min_ratings:
            continue

        shuffled = user_ratings.sample(frac=1, random_state=random_state)
        split_idx = int(len(shuffled) * (1 - test_ratio))

        train_parts.append(shuffled.iloc[:split_idx])
        test_parts.append(shuffled.iloc[split_idx:])

    if not train_parts:
        # pd.concat raises ValueError on an empty list; return empty frames
        # that still carry the input's columns instead.
        empty = ratings.iloc[0:0]
        return empty.copy(), empty.copy()

    return pd.concat(train_parts), pd.concat(test_parts)

# Split a copy so the `ratings` frame loaded above is left untouched
ratings_df = ratings.copy()
train_df, test_df = train_test_split_by_user(ratings_df)

# Build user-item matrix
# Only the training split is pivoted, so held-out test ratings never enter
# the matrices the models are fitted on.  Unrated cells become 0.

user_item_matrix = train_df.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

#### Ground Truth (What Each User Actually Liked)

In [None]:
def build_ground_truth(test_df, threshold=4.0):
    """Collect, per user, the held-out movies the user rated highly.

    Args:
        test_df: DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
        threshold: Minimum rating for a movie to count as relevant.

    Returns:
        ``defaultdict(set)`` mapping user_id to the set of relevant movie_ids.
        Users without any rating at or above the threshold get no entry.
    """
    liked = defaultdict(set)

    # Filter first, then walk the surviving (user, movie) pairs in row order
    is_relevant = test_df["rating"] >= threshold
    relevant_rows = test_df.loc[is_relevant, ["user_id", "movie_id"]]

    for uid, mid in zip(relevant_rows["user_id"], relevant_rows["movie_id"]):
        liked[uid].add(mid)

    return liked

# Relevant held-out movies per user, using the default threshold (rating >= 4.0)
ground_truth = build_ground_truth(test_df)

# 0. Content-Based Filtering

In [None]:
# Content-Based Filtering recommender (one user at a time)
def content_based_recommend(user_id, user_item_matrix, similarity_df, top_n=10):
    """
    Recommend movies whose content resembles what the user has already rated.

    Every movie the user rated (rating > 0) votes for all other movies with
    weight ``similarity * rating``; movies the user has not rated are ranked
    by their accumulated votes.

    Returns a list of up to top_n movie IDs, best first, or an empty list
    when user_id is not a row of user_item_matrix.
    """
    if user_id not in user_item_matrix.index:
        return []  # unknown user: nothing to base recommendations on

    profile = user_item_matrix.loc[user_id]
    known_ids = profile.index
    votes = defaultdict(float)

    for seen_id, seen_rating in profile.items():
        if not seen_rating > 0:
            continue
        for candidate_id, sim in similarity_df[seen_id].items():
            # Skip candidates the user has already rated
            if candidate_id in known_ids and profile[candidate_id] != 0:
                continue
            votes[candidate_id] += sim * seen_rating

    # Stable sort: equal scores keep their first-seen order
    ranked_ids = sorted(votes, key=votes.get, reverse=True)
    return ranked_ids[:top_n]

# Example usage: content-based recommendations for a single user
user_id = 1
top_n = 10
cbf_recommendations = content_based_recommend(user_id, user_item_matrix, similarity_df, top_n=top_n)
print(f"Top-{top_n} Content-Based Recommendations for User {user_id}: {cbf_recommendations}")

# 1. User-Based Collaborative Filtering

In [None]:
# Compute User Similarity
# Cosine similarity between the users' rating rows of the training matrix
user_similarity = cosine_similarity(user_item_matrix)

# DataFrame for easy lookup: rows and columns are both labelled by user_id
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Wrapper for User-Based Filtering (per user)
def user_cf_recommend_wrapper(user_id, user_item_matrix, user_similarity_df, top_n=10):
    """
    Returns top_n recommended movie IDs for a given user
    using User-Based Collaborative Filtering.

    Each movie is scored by the similarity-weighted average of all users'
    ratings; movies the target user has already rated are excluded.

    Returns an empty list when the user is unknown or has no similarity to
    anyone, matching content_based_recommend's handling of unknown users.
    """
    if user_id not in user_similarity_df.index or user_id not in user_item_matrix.index:
        return []  # user not in training set

    # Similarity scores for target user
    similar_users = user_similarity_df.loc[user_id]

    similarity_total = similar_users.sum()
    if similarity_total == 0:
        # No neighbour carries any weight; dividing would only yield NaN/inf
        return []

    # Compute weighted ratings
    weighted_ratings = np.dot(similar_users, user_item_matrix) / similarity_total

    # Build recommendations DataFrame
    recommendations = pd.DataFrame({
        'movie_id': user_item_matrix.columns,
        'score': weighted_ratings
    }).sort_values(by='score', ascending=False)

    # Exclude already rated movies
    own_ratings = user_item_matrix.loc[user_id]
    user_rated_movies = own_ratings[own_ratings > 0].index
    recommendations = recommendations[~recommendations['movie_id'].isin(user_rated_movies)]

    # Return top_n movie IDs (evaluation-ready)
    return recommendations['movie_id'].head(top_n).tolist()

# Show movie titles
def user_cf_recommend_demo(user_id, user_item_matrix, user_similarity_df, movies, top_n=10):
    """
    Returns top_n recommended movie titles for demonstration.

    The result is a DataFrame with 'movie_id' and 'title' columns whose rows
    follow the recommendation ranking (best first).  A plain ``isin`` filter
    would return them in the order of ``movies`` and lose the ranking.
    """
    movie_ids = user_cf_recommend_wrapper(user_id, user_item_matrix, user_similarity_df, top_n)

    # Position of every recommended movie in the ranked list
    rank_of = {movie_id: rank for rank, movie_id in enumerate(movie_ids)}

    titles = movies.loc[movies['movie_id'].isin(movie_ids), ['movie_id', 'title']]

    # Sort by rank through a temporary helper column, then drop it again;
    # mergesort is stable, so rows sharing a movie_id keep their order.
    return (
        titles.assign(_rank=titles['movie_id'].map(rank_of))
        .sort_values(by='_rank', kind='mergesort')
        .drop(columns='_rank')
    )

# Example Usage: user-based recommendations for a single user, shown with titles
user_id = 1
user_recommendations_demo = user_cf_recommend_demo(user_id, user_item_matrix, user_similarity_df, movies)
print("User-Based Recommendations (Titles):")
print(user_recommendations_demo)

# 2. Item-Based Collaborative Filtering

In [None]:
# Transpose user-item matrix to item-user matrix (one row per movie)
item_user_matrix = user_item_matrix.T

# Compute cosine similarity between items (movies compared by who rated them, and how)
item_similarity = cosine_similarity(item_user_matrix)

# Wrap it in a DataFrame for easy access: rows and columns are both labelled by movie_id
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index
)

# Debug aid (disabled): print(item_similarity_df.head())

# Wrapper for Item-Based Filtering (per user)
def item_cf_recommend(user_id, user_item_matrix, item_similarity_df, top_n=10):
    """
    Returns top_n recommended movie IDs for a user using Item-Based CF.
    Excludes movies already rated by the user.

    A movie's score is the similarity-weighted sum of the user's ratings,
    normalised by the total absolute similarity of that movie.

    Returns an empty list when the user is not a row of user_item_matrix,
    matching content_based_recommend's handling of unknown users.
    """
    if user_id not in user_item_matrix.index:
        return []  # user not in training set

    # User's ratings
    user_ratings = user_item_matrix.loc[user_id]

    # Compute item-based scores
    numerators = np.asarray(np.dot(user_ratings, item_similarity_df), dtype=float)
    denominators = np.array(np.abs(item_similarity_df).sum(axis=1), dtype=float)
    # A zero denominator means the item is similar to nothing; dividing by 1
    # instead avoids NaN scores and RuntimeWarnings for such items.
    denominators[denominators == 0] = 1.0
    scores = numerators / denominators

    # Create DataFrame with movie scores
    recommendations = pd.DataFrame({
        'movie_id': user_item_matrix.columns,
        'score': scores
    }).sort_values(by='score', ascending=False)

    # Exclude already rated movies
    user_rated_movies = user_ratings[user_ratings > 0].index
    recommendations = recommendations[~recommendations['movie_id'].isin(user_rated_movies)]

    # Return top-N movie IDs only (for evaluation)
    return recommendations['movie_id'].head(top_n).tolist()

# Example usage: item-based recommendations (movie IDs) for a single user
user_id = 1
top_n = 10
item_recommendations = item_cf_recommend(user_id, user_item_matrix, item_similarity_df, top_n=top_n)

print(f"Top-{top_n} Item-Based CF Recommendations for User {user_id}:")
print(item_recommendations)

# 3. Matrix Factorization (e.g., SVD) Collaborative Filtering

In [None]:
# Prepare User-Item Matrix
# Ensure all ratings are numeric and fill missing values with 0
user_item_matrix = user_item_matrix.apply(pd.to_numeric, errors='coerce').fillna(0)

# Apply Truncated SVD
n_components = 20  # Number of latent features
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Compute latent factors
user_features = svd.fit_transform(user_item_matrix)  # shape: (n_users, n_components)
item_features = svd.components_                       # shape: (n_components, n_items)

# Reconstruct the Approximate Ratings
svd_reconstructed_matrix = np.dot(user_features, item_features)

# Wrap as DataFrame for easy access
svd_predicted_ratings = pd.DataFrame(
    svd_reconstructed_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)

# The generic names are kept for the cells below, but the autoencoder
# section rebinds them; use the svd_* names when the SVD output is needed.
reconstructed_matrix = svd_reconstructed_matrix
predicted_ratings = svd_predicted_ratings

# SVD recommender (one user at a time)
def svd_recommend_wrapper(user_id, predicted_ratings, user_item_matrix, top_n=10):
    """
    Rank the movies a user has not rated yet by their predicted rating.

    Returns a list of up to top_n movie IDs, highest prediction first.
    """
    scores = predicted_ratings.loc[user_id]

    # Movies with a positive entry in the user-item matrix count as rated
    observed = user_item_matrix.loc[user_id]
    rated_ids = observed.loc[observed.gt(0)].index

    is_unrated = ~scores.index.isin(rated_ids)
    ranked = scores[is_unrated].sort_values(ascending=False)

    return ranked.iloc[:top_n].index.tolist()

# Example Usage: SVD recommendations (movie IDs) for a single user
user_id = 1
svd_recommendations = svd_recommend_wrapper(user_id, predicted_ratings, user_item_matrix, top_n=10)
print(f"SVD Recommendations for User {user_id}:")
print(svd_recommendations)

# 4. Deep Learning-Based Recommendation: Autoencoders for Collaborative Filtering

In [None]:
# TensorFlow/Keras is imported in this section rather than in the first cell.
# NOTE(review): `layers` and `models` are not referenced in the visible cells,
# which use the `tf.keras.*` paths -- confirm before removing.
import tensorflow as tf
from tensorflow.keras import layers, models
print("TensorFlow version:", tf.__version__)

In [None]:
# Standardise the train/test user-item matrices.  StandardScaler works per
# feature, i.e. per movie column (zero mean / unit variance across users);
# it is fitted on the training matrix only and re-used for the test matrix.
scaler = StandardScaler()

normalized_train = scaler.fit_transform(user_item_matrix)
normalized_train = pd.DataFrame(normalized_train, index=user_item_matrix.index, columns=user_item_matrix.columns)

# Build test user-item matrix with the same shape as train.  `pivot` leaves
# NaN for every (user, movie) pair without a test rating, and `reindex` only
# introduces further NaN for labels missing from the pivot, so an explicit
# fillna(0) is required to get a NaN-free matrix (otherwise val_loss is NaN).
user_item_matrix_test = (
    test_df.pivot(index='user_id', columns='movie_id', values='rating')
    .reindex(index=user_item_matrix.index, columns=user_item_matrix.columns)
    .fillna(0)
)

normalized_test = scaler.transform(user_item_matrix_test)
normalized_test = pd.DataFrame(normalized_test, index=user_item_matrix.index, columns=user_item_matrix.columns)


# Define Autoencoder Architecture
n_movies = user_item_matrix.shape[1]

autoencoder = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_movies,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    # Linear output: the targets are standardised values (negative and
    # unbounded), which a sigmoid restricted to (0, 1) cannot reproduce.
    tf.keras.layers.Dense(n_movies, activation='linear')
])

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

history = autoencoder.fit(
    normalized_train.values, normalized_train.values,
    epochs=20,
    batch_size=32,
    validation_data=(normalized_test.values, normalized_test.values),
    verbose=1
)

# Predict reconstructed user-item matrix (scores are on the standardised scale)
autoencoder_reconstructed_matrix = autoencoder.predict(normalized_train.values)
autoencoder_predicted_ratings = pd.DataFrame(
    autoencoder_reconstructed_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)

# Keep the generic names used by the cells below.  Note that this rebinds
# the variables of the same name created in the SVD section.
reconstructed_matrix = autoencoder_reconstructed_matrix
predicted_ratings = autoencoder_predicted_ratings

# Autoencoder recommender (one user at a time)
def autoencoder_recommend_wrapper(user_id, predicted_ratings, user_item_matrix, top_n=10):
    """
    Pick the top_n movies with the highest reconstructed score for a user.

    Movies with a positive entry in the user's row of user_item_matrix are
    treated as already rated and left out.  Returns a list of movie IDs,
    best first.
    """
    reconstructed_row = predicted_ratings.loc[user_id]

    history_row = user_item_matrix.loc[user_id]
    already_rated = history_row[history_row > 0].index

    keep = ~reconstructed_row.index.isin(already_rated)
    candidates = reconstructed_row[keep]

    best_first = candidates.sort_values(ascending=False)
    return best_first.head(top_n).index.tolist()

# Example usage: autoencoder recommendations for user 1
top_movies = autoencoder_recommend_wrapper(1, predicted_ratings, user_item_matrix, top_n=10)
print("Autoencoder Top-10 Recommendations (movie IDs):")
print(top_movies)

# Evaluation Results

#### Top-N Metrics

In [None]:
def precision_at_k(recommended, relevant, k):
    """Share of the k recommendation slots filled with a relevant item.

    Args:
        recommended: Ranked list of item IDs, best first.
        relevant: Set of item IDs considered relevant for the user.
        k: Cut-off rank; also the denominator, even if fewer items exist.

    Returns:
        Precision@k as a float, 0.0 when there is nothing in the top k.
    """
    shortlist = recommended[:k]
    if shortlist:
        matched = relevant.intersection(shortlist)
        return len(matched) / k
    return 0.0

def recall_at_k(recommended, relevant, k):
    """Share of the relevant items that appear in the top k recommendations.

    Args:
        recommended: Ranked list of item IDs, best first.
        relevant: Set of item IDs considered relevant for the user.
        k: Cut-off rank.

    Returns:
        Recall@k as a float, 0.0 when ``relevant`` is empty.
    """
    shortlist = recommended[:k]
    if relevant:
        found = relevant.intersection(shortlist)
        return len(found) / len(relevant)
    return 0.0

def average_precision_at_k(recommended, relevant, k):
    """Average precision of the top k recommendations.

    Precision is accumulated at every rank holding a relevant item and the
    sum is divided by ``min(len(relevant), k)``, the best achievable number
    of hits.

    Args:
        recommended: Ranked list of item IDs, best first.
        relevant: Set of item IDs considered relevant for the user.
        k: Cut-off rank.

    Returns:
        AP@k as a float in [0, 1]; 0.0 when ``relevant`` is empty or
        ``k`` is not positive.
    """
    if k <= 0 or not relevant:
        # Also avoids dividing by min(len(relevant), 0) == 0
        return 0.0

    score = 0.0
    hits = 0
    seen = set()

    for rank, item in enumerate(recommended[:k], start=1):
        # A repeated recommendation is credited once only, consistent with
        # the set-based precision/recall metrics; otherwise AP could exceed 1.
        if item in seen:
            continue
        seen.add(item)

        if item in relevant:
            hits += 1
            score += hits / rank

    return score / min(len(relevant), k)

def ndcg_at_k(recommended, relevant, k):
    """Normalised discounted cumulative gain of the top k recommendations.

    Uses binary relevance: a relevant item at zero-based position ``i``
    contributes ``1 / log2(i + 2)``.  The ideal DCG places
    ``min(len(relevant), k)`` relevant items at the top.

    Args:
        recommended: Ranked list of item IDs, best first.
        relevant: Set of item IDs considered relevant for the user.
        k: Cut-off rank.

    Returns:
        NDCG@k in [0, 1]; 0.0 when the ideal DCG is zero (empty
        ``relevant`` or non-positive ``k``).
    """
    if k <= 0:
        return 0.0

    dcg = 0.0
    seen = set()
    for position, item in enumerate(recommended[:k]):
        # A repeated recommendation is credited once only; otherwise the
        # DCG could exceed the ideal DCG and push the result above 1.
        if item in seen:
            continue
        seen.add(item)

        if item in relevant:
            dcg += 1 / np.log2(position + 2)

    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def hit_rate_at_k(recommended, relevant, k):
    """Return 1 if any of the top k recommendations is relevant, else 0."""
    shortlist = recommended[:k]
    overlap = relevant.intersection(shortlist)
    return 1 if overlap else 0

#### Evaluation Loop

In [None]:
def evaluate_model(recommender_fn, ground_truth, k=10):
    """Average the Top-K metrics of a recommender over all ground-truth users.

    Args:
        recommender_fn: Callable taking a user_id and returning a ranked
            list of recommended item IDs.
        ground_truth: Mapping of user_id to the set of relevant item IDs.
        k: Cut-off rank passed to every metric.

    Returns:
        Dict mapping the metric name ("Precision@K", "Recall@K", "MAP@K",
        "NDCG@K", "HitRate@K") to its mean over the users.
    """
    # Metric name -> per-user scoring function, in reporting order
    metric_fns = {
        "Precision@K": precision_at_k,
        "Recall@K": recall_at_k,
        "MAP@K": average_precision_at_k,
        "NDCG@K": ndcg_at_k,
        "HitRate@K": hit_rate_at_k,
    }
    per_user_scores = {name: [] for name in metric_fns}

    for user_id, relevant_items in ground_truth.items():
        # One recommender call per user, shared by all metrics
        recommended_items = recommender_fn(user_id)
        for name, metric_fn in metric_fns.items():
            per_user_scores[name].append(metric_fn(recommended_items, relevant_items, k))

    return {name: np.mean(values) for name, values in per_user_scores.items()}

#### Run Comparison Across Models

In [None]:
# evaluate_model calls the recommender with a user_id only, so each model is
# wrapped in a one-argument function that binds the data it needs.
TOP_K = 10
results = {}


# Item based CF
def item_cf_recommender(user_id):
    """Top-K item-based CF recommendations for one user."""
    return item_cf_recommend(user_id, user_item_matrix, item_similarity_df, top_n=TOP_K)


results["item_cf"] = evaluate_model(item_cf_recommender, ground_truth, k=TOP_K)


# SVD CF
# `predicted_ratings` has been rebound to the autoencoder output by the time
# this cell runs, so the SVD predictions are rebuilt from the latent factors.
svd_eval_predictions = pd.DataFrame(
    np.dot(user_features, item_features),
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)


def svd_recommender(user_id):
    """Top-K SVD recommendations for one user."""
    return svd_recommend_wrapper(user_id, svd_eval_predictions, user_item_matrix, top_n=TOP_K)


results["svd"] = evaluate_model(svd_recommender, ground_truth, k=TOP_K)


# Autoencoder
def autoencoder_recommender(user_id):
    """Top-K autoencoder recommendations for one user."""
    return autoencoder_recommend_wrapper(user_id, predicted_ratings, user_item_matrix, top_n=TOP_K)


results["autoencoder"] = evaluate_model(autoencoder_recommender, ground_truth, k=TOP_K)

results

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame(results).T
results_df