# Modular pipeline demo

In [1]:
import sys
from pathlib import Path

def add_project_root(start):
    """Put the directory that contains ``src`` at the front of ``sys.path``.

    ``start`` is checked first, then ``start.parent``; the first directory
    holding a ``src`` entry wins. Existing copies of that directory are
    removed from ``sys.path`` before it is re-inserted at position 0, so
    re-running the cell keeps the same import priority without piling up
    duplicate entries.

    Args:
        start: ``pathlib.Path`` of the directory to start the search from.

    Returns:
        The directory that was added, or ``None`` when neither location
        contains ``src`` (``sys.path`` is left untouched in that case).
    """
    for candidate in (start, start.parent):
        if (candidate / 'src').exists():
            entry = str(candidate)
            # Drop stale copies first, then re-insert at the front.
            sys.path[:] = [p for p in sys.path if p != entry]
            sys.path.insert(0, entry)
            return candidate
    return None


# Works when the kernel starts in the project root or one folder below it.
ROOT = Path.cwd().resolve()
# Assigned (rather than left as a bare call) so the cell prints no output.
PROJECT_ROOT = add_project_root(ROOT)

In [2]:
import pandas as pd

from src.data.loading import load_movielens_100k, build_ratings_matrix, build_user_folds
from src.eval.metrics import evaluate_all_metrics
from src.models.baselines import global_mean, user_mean, item_mean, random_baseline, popularity_baseline
from src.models.cf import evaluate_user_based_cf, evaluate_item_based_cf
from src.models.mf import evaluate_mf_with_metrics_on_folds
from src.models.svdpp import evaluate_svdpp_with_metrics_on_folds

Load the MovieLens 100K data, build the ratings matrix (users × movies), and create 5 folds, holding out 5 ratings per user for testing.

In [3]:
# The loader returns three frames: ratings, users and movies.
ratings_df, users_df, movies_df = load_movielens_100k()

# Turn the ratings frame into the matrix that the fold builder takes as input.
ratings_matrix = build_ratings_matrix(ratings_df)

# Fixed random_state so the same folds come back on every run.
folds = build_user_folds(
    ratings_matrix,
    n_splits=5,
    test_ratings_per_user=5,
    random_state=42,
)

# Each fold unpacks into a (train, test) pair; preview the first training frame.
train_df, test_df = folds[0]
train_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


Quick baseline comparison on the first fold.

In [4]:
# Score each baseline on the first fold and collect one row of metrics per
# model. `pd` is imported in the notebook's import cell.
R_train, R_test = train_df.values, test_df.values

# (label, predictions) pairs; every baseline is computed from train_df only.
baselines = [
    ("Global Mean", global_mean(train_df).values),
    ("User Mean", user_mean(train_df).values),
    ("Item Mean", item_mean(train_df).values),
    ("Random [min,max]", random_baseline(train_df).values),
    ("Popularity", popularity_baseline(train_df).values),
]

# "Model" is added last so it ends up as the right-most column of the table.
metrics = [
    {**evaluate_all_metrics(R_train, R_test, R_pred, train_df), "Model": name}
    for name, R_pred in baselines
]
pd.DataFrame(metrics)

Unnamed: 0,RMSE,MAE,Novelty,Relevance,Serendipity,Diversity,nDCG@20,Model
0,1.134653,0.957395,4.737617,3.857143,0.594246,0.856854,0.003504,Global Mean
1,1.048662,0.842154,4.737617,3.857143,0.594246,0.856854,0.003504,User Mean
2,1.027383,0.825723,6.962058,4.388889,0.557099,1.0,0.009034,Item Mean
3,1.794975,1.475437,5.439559,4.0,0.524678,0.917652,0.008416,"Random [min,max]"
4,1.907282,1.618166,1.384603,3.834758,0.195355,0.459004,0.130457,Popularity


For full experiments, call the helper evaluators (they mirror the original notebook logic). Only the user-based CF run is enabled below; uncomment the other calls to run them:

In [5]:
# Only the user-based run is enabled. Uncomment any of the other calls to run
# that evaluator on the same `folds`.
# NOTE(review): the other runs are presumably disabled to keep run time down — confirm.

# User-based CF (Pearson):
cf_user_metrics = evaluate_user_based_cf(folds)
# Item-based CF (cosine):
# cf_item_metrics = evaluate_item_based_cf(folds)
# Matrix factorization:
# mf_metrics_df, mf_avg_metrics = evaluate_mf_with_metrics_on_folds(folds)
# SVD++:
# svdpp_metrics_df, svdpp_avg_metrics = evaluate_svdpp_with_metrics_on_folds(folds, n_epochs=5)

In [7]:
# Show the user-based CF results: one row per fold and k_neighbors setting.
cf_user_metrics

Unnamed: 0,RMSE,MAE,Novelty,Relevance,Serendipity,Diversity,nDCG@20,Fold,k_neighbors,Model,Similarity
0,0.968994,0.762809,8.372241,4.0,0.998264,0.893338,0.000519,1,50,Memory-based CF,Pearson (user-based)
1,0.966548,0.761636,8.38409,4.0,0.998264,0.893305,0.000519,1,150,Memory-based CF,Pearson (user-based)
2,0.967531,0.763242,8.385472,4.0,0.998264,0.893213,0.000519,1,500,Memory-based CF,Pearson (user-based)
3,0.967545,0.763282,8.385472,4.0,0.998264,0.893213,0.000519,1,all,Memory-based CF,Pearson (user-based)
4,0.940384,0.738739,8.374753,5.0,0.989565,0.897868,0.000675,2,50,Memory-based CF,Pearson (user-based)
5,0.940738,0.738162,8.388832,5.0,0.989565,0.897868,0.000675,2,150,Memory-based CF,Pearson (user-based)
6,0.941996,0.739276,8.390341,5.0,0.989565,0.897868,0.000675,2,500,Memory-based CF,Pearson (user-based)
7,0.94201,0.739267,8.390341,5.0,0.989565,0.897868,0.000675,2,all,Memory-based CF,Pearson (user-based)
8,0.996752,0.770393,8.369841,4.5,0.998273,0.891329,0.001025,3,50,Memory-based CF,Pearson (user-based)
9,0.993976,0.769726,8.383982,4.5,0.998273,0.891329,0.001025,3,150,Memory-based CF,Pearson (user-based)
