# Collaborative Filtering on the MovieLens-100k Dataset

In [None]:
%pip install scikit-learn==1.0.2
%pip install scikit-surprise==1.1.1

## Loading the Dataset

In [35]:
import os
import io
from pathlib import Path
import zipfile
import requests

# Local cache for the MovieLens-100k files; the archive is expected to unpack
# into an "ml-100k" sub-directory, which is what the ds_dir check relies on.
cache_dir = Path(".cache")
ds_dir = cache_dir / "ml-100k"
ds_url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# exist_ok makes this a no-op when the cache directory is already present.
cache_dir.mkdir(parents=True, exist_ok=True)
if not ds_dir.exists():
  print("Downloading MovieLens-100k dataset...")
  # Bound the wait and fail on HTTP errors here, so a bad response is reported
  # as such instead of being handed to zipfile as if it were an archive.
  ds_res = requests.get(ds_url, timeout=60)
  ds_res.raise_for_status()
  ds_stream = io.BytesIO(ds_res.content)
  with zipfile.ZipFile(ds_stream) as z:
    z.extractall(cache_dir)
  print("Downloaded dataset.")
else:
  print("Dataset already downloaded.")

Dataset already downloaded.


In [36]:
from surprise import Dataset
from surprise import Reader
import surprise.model_selection.split as ms

# Build the MovieLens-100k dataset from its five predefined train/test folds.
# Each ratings file is tab-separated: user, item, rating, timestamp.
reader = Reader(
  line_format='user item rating timestamp',
  sep='\t',
  rating_scale=(1, 5),
)
# One (train, test) path pair per fold.
fold_files = [
  (ds_dir.joinpath(f"u{i}.base"), ds_dir.joinpath(f"u{i}.test"))
  for i in range(1, 6)
]
movielens = Dataset.load_from_folds(fold_files, reader=reader)

In [84]:
import numpy as np
from surprise.model_selection import cross_validate, GridSearchCV

def evaluate_model(model_ctr, name=None):
  """Cross-validate a model on the predefined MovieLens folds.

  Args:
    model_ctr: Zero-argument callable returning a fresh algorithm instance.
    name: Label used in the progress messages. When omitted, falls back to
      the callable's ``name`` attribute, then its ``__name__``, then its repr.

  Returns:
    Tuple ``(rmse, rmse_std, mae, mae_std, total_time)``: mean and standard
    deviation of the per-fold test RMSE and MAE, followed by the sum of all
    per-fold fit and test times reported by ``cross_validate``.
  """
  if name is None:
    # Resolve step by step: callables such as functools.partial have no
    # __name__, and passing it as an eager getattr default would raise.
    name = getattr(model_ctr, "name", None)
    if name is None:
      name = getattr(model_ctr, "__name__", repr(model_ctr))
  print(f"Evaluating {name}...")
  res = cross_validate(
    model_ctr(), movielens,
    ["rmse", "mae"],
    ms.PredefinedKFold(),
    n_jobs=-1)

  rmse = np.mean(res["test_rmse"])
  mae = np.mean(res["test_mae"])
  rmse_std = np.std(res["test_rmse"])
  mae_std = np.std(res["test_mae"])
  # Sum each timing sequence on its own so the total does not depend on what
  # `+` means for their type (concatenation for tuples, element-wise for arrays).
  total_time = np.sum(res["fit_time"]) + np.sum(res["test_time"])
  print(f"Completed evaluation of {name}.")
  return rmse, rmse_std, mae, mae_std, total_time


def tune_model(model_ctr, grid, name=None):
  """Grid-search a model's hyperparameters on the predefined MovieLens folds.

  Args:
    model_ctr: Callable that accepts the grid's keys as keyword arguments and
      returns an algorithm instance.
    grid: Mapping from parameter name to the list of values to try.
    name: Label used in the progress messages. When omitted, falls back to
      the callable's ``name`` attribute, then its ``__name__``, then its repr.

  Returns:
    Tuple ``(best_rmse, best_mae, best_rmse_params, best_mae_params)`` taken
    from the fitted search's ``best_score`` and ``best_params``.
  """
  if name is None:
    # Resolve step by step: callables such as functools.partial have no
    # __name__, and passing it as an eager getattr default would raise.
    name = getattr(model_ctr, "name", None)
    if name is None:
      name = getattr(model_ctr, "__name__", repr(model_ctr))

  print(f"Tuning hyperparameters for {name}...")
  gs = GridSearchCV(
    model_ctr, grid,
    ["rmse", "mae"],
    ms.PredefinedKFold(),
    n_jobs=-1)
  gs.fit(movielens)

  best_rmse_params = gs.best_params["rmse"]
  best_mae_params = gs.best_params["mae"]
  best_rmse = gs.best_score["rmse"]
  best_mae = gs.best_score["mae"]
  print(f"Completed tuning hyperparameters of {name}.")
  return best_rmse, best_mae, best_rmse_params, best_mae_params


# Model factories passed to evaluate_model / tune_model. They are plain
# functions rather than name-bound lambdas so each carries a meaningful
# __name__. KNNBasic, KNNWithMeans and SVD are looked up when a factory is
# called, so the `from surprise import ...` in the evaluation cell must have
# run before the first call.
def cos_model(**kwargs):
  """Return a KNNBasic model using cosine similarity; kwargs are forwarded."""
  return KNNBasic(sim_options=dict(name="cosine"), **kwargs)


def pearson_model(**kwargs):
  """Return a KNNWithMeans model using Pearson similarity; kwargs are forwarded."""
  return KNNWithMeans(sim_options=dict(name="pearson"), **kwargs)


def mf_model(**kwargs):
  """Return an SVD model with biased=False; kwargs are forwarded."""
  return SVD(biased=False, **kwargs)

## Evaluation with the Default Parameters

In [85]:
import pandas as pd
from surprise import KNNBasic, KNNWithMeans, SVD
# Cross-validate each model with its default hyperparameters, in a fixed order.
cos_res = evaluate_model(model_ctr=cos_model, name="Memory-based Cosine")
corr_res = evaluate_model(model_ctr=pearson_model, name="Memory-based Correlation")
mf_res = evaluate_model(model_ctr=mf_model, name="Regularised Matrix Factorisation")

# One row per model: its label followed by the five values from evaluate_model.
results = pd.DataFrame(
  [
    ("Memory-based Cosine",) + tuple(cos_res),
    ("Memory-based Correlation",) + tuple(corr_res),
    ("Regularised Matrix Factorisation",) + tuple(mf_res),
  ],
  columns=["name", "rmse", "rmseStd", "mae", "maeStd", "time"],
)
results

Evaluating Memory-based Cosine...
Completed evaluation of Memory-based Cosine.
Evaluating Memory-based Correlation...
Completed evaluation of Memory-based Correlation.
Evaluating Regularised Matrix Factorisation...
Completed evaluation of Regularised Matrix Factorisation.


Unnamed: 0,name,rmse,rmseStd,mae,maeStd,time
0,Memory-based Cosine,1.017409,0.004632,0.805276,0.004316,11.809762
1,Memory-based Correlation,0.952484,0.007001,0.745971,0.004676,14.8737
2,Regularised Matrix Factorisation,0.951939,0.007064,0.750433,0.00554,14.584713


## Hyperparameter Tuning

In [91]:
# Search neighbourhood size (k) and minimum neighbour count (min_k)
# for the cosine-similarity model.
cos_res_cv = tune_model(
  model_ctr=cos_model,
  grid=dict(k=[10, 20, 40, 80, 160], min_k=[1, 5, 10]),
  name="Memory-based Cosine",
)

Tuning hyperparameters for Memory-based Cosine...
Completed tuning hyperparameters of Memory-based Cosine.


In [90]:
# Search neighbourhood size (k) and minimum neighbour count (min_k)
# for the Pearson-similarity model.
corr_res_cv = tune_model(
  model_ctr=pearson_model,
  grid=dict(k=[10, 20, 40, 80, 160], min_k=[1, 5, 10]),
  name="Memory-based Correlation",
)

Tuning hyperparameters for Memory-based Correlation...
Completed tuning hyperparameters of Memory-based Correlation.


In [96]:
# Search factor count, epoch count, learning rate and regularisation
# strength for the matrix-factorisation model.
mf_res_cv = tune_model(
  model_ctr=mf_model,
  grid=dict(
    n_factors=[25, 50, 100],
    n_epochs=[10, 20, 30],
    lr_all=[0.01, 0.005, 0.001],
    reg_all=[0.02, 0.04, 0.1],
  ),
  name="Regularised Matrix Factorisation",
)

Tuning hyperparameters for Regularised Matrix Factorisation...
Completed tuning hyperparameters of Regularised Matrix Factorisation.


In [103]:
# Combine the default-parameter metrics with the best tuned RMSE/MAE
# (the first two values returned by tune_model) and export the table.
results_cv = pd.DataFrame(
  [
    ("Memory-based Cosine",) + tuple(cos_res) + (cos_res_cv[0], cos_res_cv[1]),
    ("Memory-based Correlation",) + tuple(corr_res) + (corr_res_cv[0], corr_res_cv[1]),
    ("Regularised Matrix Factorisation",) + tuple(mf_res) + (mf_res_cv[0], mf_res_cv[1]),
  ],
  columns=["name", "rmse", "rmseStd", "mae", "maeStd", "time", "tunedRmse", "tunedMae"],
)
results_cv.to_csv("exercise05.csv", index=False)
results_cv

Unnamed: 0,name,rmse,rmseStd,mae,maeStd,time,tunedRmse,tunedMae
0,Memory-based Cosine,1.017409,0.004632,0.805276,0.004316,11.809762,1.016982,0.805276
1,Memory-based Correlation,0.952484,0.007001,0.745971,0.004676,14.8737,0.950421,0.744583
2,Regularised Matrix Factorisation,0.951939,0.007064,0.750433,0.00554,14.584713,0.929612,0.731663


In [101]:
# tune_model returns (best_rmse, best_mae, best_rmse_params, best_mae_params),
# so each [2:] slice is (best params by RMSE, best params by MAE).
print(f"Memory-based Cosine Best Params (RMSE/MAE): {cos_res_cv[2:]}")
print(f"Memory-based Correlation Best Params (RMSE/MAE): {corr_res_cv[2:]}")
print(f"Regularised Matrix Factorisation Best Params (RMSE/MAE): {mf_res_cv[2:]}")

Memory-based Cosine Best Params (RMSE/MSE): ({'k': 80, 'min_k': 1}, {'k': 40, 'min_k': 1})
Memory-based Correlation Best Params (RMSE/MSE): ({'k': 80, 'min_k': 5}, {'k': 80, 'min_k': 1})
Regularised Matrix Factorisation Best Params (RMSE/MSE): ({'n_factors': 25, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.04}, {'n_factors': 25, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.04})
