Creates a GridSearchCV object:

SVD: the algorithm to tune.

param_grid: the hyperparameter combinations.

measures: evaluation metrics (Root Mean Squared Error and Mean Absolute Error).

cv=3: uses 3-fold cross-validation to ensure robustness.

movie 1 million: 18.01

In [5]:
from surprise import SVD
from surprise.model_selection import cross_validate

algo = SVD(n_factors=50, n_epochs=10, lr_all=0.005, reg_all=0.02)

cross_validate(algo, data, measures=["RMSE", "MAE"], cv=3, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9025  0.9052  0.9033  0.9037  0.0011  
MAE (testset)     0.7145  0.7165  0.7151  0.7153  0.0008  
Fit time          15.13   16.56   18.05   16.58   1.19    
Test time         5.68    6.46    4.03    5.39    1.01    


{'test_rmse': array([0.9025202 , 0.90516535, 0.90329275]),
 'test_mae': array([0.71445181, 0.71646528, 0.71508295]),
 'fit_time': (15.132631540298462, 16.556851625442505, 18.04719638824463),
 'test_time': (5.678717613220215, 6.459620475769043, 4.027946472167969)}

In [6]:
from pathlib import Path
import pandas as pd

project_root = Path.cwd().parents[1]  # notebook is in notebooks/learning
data_dir = project_root / "data" / "processed" / "movielens" / "ml-1m"

train_df = pd.read_parquet(data_dir / "train.parquet")
test_df = pd.read_parquet(data_dir / "test.parquet")

print("Train rows:", len(train_df), "Test rows:", len(test_df))


FileNotFoundError: [Errno 2] No such file or directory: '/home/helin/projects/BachelorThesis/code/srcCode/recsys-negative-feedback/data/processed/movielens/ml-1m/train.parquet'

In [None]:
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df[["user_idx", "item_idx", "rating"]], reader)


In [None]:
from surprise import SVD
from surprise.model_selection import cross_validate

algo = SVD(n_factors=50, n_epochs=10, lr_all=0.005, reg_all=0.02)
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=3, verbose=True)


In [None]:
from surprise.model_selection import GridSearchCV

param_grid = {
    "n_factors": [50, 100],
    "n_epochs": [10, 20],
    "lr_all": [0.005, 0.01],
    "reg_all": [0.02, 0.05],
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
gs.fit(data)

best_params = gs.best_params["rmse"]
best_rmse = gs.best_score["rmse"]

best_params, best_rmse


In [None]:
import json

out_dir = project_root / "outputs" / "movielens" / "ml-1m"
out_dir.mkdir(parents=True, exist_ok=True)

with open(out_dir / "best_svd_params.json", "w") as f:
    json.dump(
        {"best_params": best_params, "best_rmse": best_rmse},
        f,
        indent=2
    )

print("Saved:", out_dir / "best_svd_params.json")
