Reference: https://surpriselib.com/

In [1]:
import os

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split, KFold
from surprise import accuracy

import random
import optuna
import time

import pickle

# Lift every pandas display limit (rows, columns, total width, cell width) so
# frames shown in this notebook are never truncated.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [2]:
# Read the prepared ratings file and print a schema / memory summary.
ratings_csv_path = os.path.join('data', 'prepared_ratings.csv')
df_ratings = pd.read_csv(ratings_csv_path)
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062183 entries, 0 to 18062182
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   userId  int64  
 2   rating  float64
 3   date    object 
dtypes: float64(1), int64(2), object(1)
memory usage: 551.2+ MB


1. Select Specific Columns Only

In [3]:
# Keep only the item id, the user id and the rating; the `date` column is dropped.
RATING_COLUMNS = ['id', 'userId', 'rating']
df_ratings = df_ratings[RATING_COLUMNS]
df_ratings.head()

Unnamed: 0,id,userId,rating
0,862,8,4.0
1,862,9,4.5
2,862,12,4.0
3,862,20,4.0
4,862,24,4.0


2. Create Surprise Dataset

In [4]:
# Define the rating scale for Surprise from the ratings actually observed
rating_column = df_ratings['rating']
min_rating = rating_column.min()
max_rating = rating_column.max()
reader = Reader(rating_scale=(min_rating, max_rating))

# Load data from the filtered pandas DataFrame.
# Columns are passed in the order user, item, rating.
surprise_columns = ['userId', 'id', 'rating']
data = Dataset.load_from_df(df_ratings[surprise_columns], reader)

3. Split Dataset

In [5]:
# Seed reused by the shuffle, the cross-validation folds and every SVD model
RANDOM_STATE = 42
# Share of the shuffled ratings kept for tuning/training; the remainder is held out
TRAIN_SPLIT_SIZE = 0.80 # 80% for the training set

In [6]:
# Hold-out split: shuffle the raw ratings with a fixed seed, keep the first
# TRAIN_SPLIT_SIZE share as set A (used for tuning and training) and the
# remainder as set B (held out).
raw_ratings = data.raw_ratings

# Guard against re-running this cell on its own: after one run `data` only
# holds set A, so splitting again would silently shrink the training data
# (80% of 80%) instead of reproducing the original split.
if len(raw_ratings) != len(df_ratings):
    raise RuntimeError(
        "`data` no longer contains the full rating set; re-run the "
        "'Create Surprise Dataset' cell before splitting again."
    )

random.seed(RANDOM_STATE)  # Initialize the random number generator (reproducible shuffle)
random.shuffle(raw_ratings)  # Shuffle the raw_ratings list in place
threshold = int(TRAIN_SPLIT_SIZE * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = A_raw_ratings  # data is now the set A

4. Optuna

In [7]:
# Tuning budget: every trial cross-validates one SVD configuration on set A,
# so total cost grows with OPTUNA_TRIALS * CV_FOLDS_OPTUNA model fits.
OPTUNA_TRIALS = 20 # Number of Optuna trials (adjust as needed, more is better but slower)
CV_FOLDS_OPTUNA = 3 # Number of folds for cross-validation within Optuna (fewer for speed)

In [8]:
# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial by the mean cross-validated RMSE of an SVD model.

    Args:
        trial: Optuna trial used to draw `n_factors`, `n_epochs`, `lr_all`
            and `reg_all` from their search ranges.

    Returns:
        Mean test RMSE over the `CV_FOLDS_OPTUNA` folds of `data`
        (lower is better).
    """
    # Draw the hyperparameters in a fixed order (learning rate and
    # regularisation are sampled on a log scale).
    n_factors = trial.suggest_int('n_factors', 20, 150)
    n_epochs = trial.suggest_int('n_epochs', 5, 20)
    lr_all = trial.suggest_float('lr_all', 0.002, 0.01, log=True)
    reg_all = trial.suggest_float('reg_all', 0.02, 0.2, log=True)

    model = SVD(
        n_factors=n_factors,
        n_epochs=n_epochs,
        lr_all=lr_all,
        reg_all=reg_all,
        random_state=RANDOM_STATE,
    )

    # Same seeded, shuffled folds for every trial so scores are comparable
    folds = KFold(n_splits=CV_FOLDS_OPTUNA, random_state=RANDOM_STATE, shuffle=True)
    scores = cross_validate(model, data, measures=['rmse'], cv=folds, verbose=False)

    fold_rmse = scores['test_rmse']
    return fold_rmse.mean()

# Create study and optimize
# Seed the sampler so repeated runs propose the same sequence of
# hyperparameters; an unseeded study gives different "best" values every run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
start_time = time.time()
# Set show_progress_bar=False if it causes issues in your environment
try:
    study.optimize(objective, n_trials=OPTUNA_TRIALS, show_progress_bar=True)
except TypeError:
    # Only fall back when the TypeError was raised before any trial started
    # (i.e. `show_progress_bar` was rejected). A TypeError raised once trials
    # exist came from the objective itself and must not trigger a second full
    # round of trials on top of the ones already recorded.
    if study.trials:
        raise
    print("Progress bar not supported in this Optuna version/environment. Running without it.")
    study.optimize(objective, n_trials=OPTUNA_TRIALS)
tuning_time = time.time() - start_time

print(f"\nOptuna tuning finished in {tuning_time:.2f} seconds.")
print("Best hyperparameters found: ", study.best_params)
print("Best cross-validation RMSE during tuning (on train data): ", study.best_value)

[I 2025-10-31 14:16:56,269] A new study created in memory with name: no-name-decbcc4f-e33b-4d5a-98a5-6899bca3b0e5


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-10-31 14:31:20,816] Trial 0 finished with value: 0.8364562227156815 and parameters: {'n_factors': 73, 'n_epochs': 14, 'lr_all': 0.006378548706894127, 'reg_all': 0.02343407820816524}. Best is trial 0 with value: 0.8364562227156815.
[I 2025-10-31 14:50:45,352] Trial 1 finished with value: 0.884754473684264 and parameters: {'n_factors': 150, 'n_epochs': 17, 'lr_all': 0.003490532624319085, 'reg_all': 0.17618250337019012}. Best is trial 0 with value: 0.8364562227156815.
[I 2025-10-31 15:00:19,588] Trial 2 finished with value: 0.8869299051929974 and parameters: {'n_factors': 92, 'n_epochs': 6, 'lr_all': 0.004330019505106004, 'reg_all': 0.04282512195318332}. Best is trial 0 with value: 0.8364562227156815.
[I 2025-10-31 15:10:33,476] Trial 3 finished with value: 0.8823699216918568 and parameters: {'n_factors': 126, 'n_epochs': 8, 'lr_all': 0.006785371787055469, 'reg_all': 0.08457179441459471}. Best is trial 0 with value: 0.8364562227156815.
[I 2025-10-31 16:19:08,617] Trial 4 finished 

In [9]:
# Repeat the tuning summary in its own cell so it is easy to find below the trial log.
print(f"Best hyperparameters found:  {study.best_params}")
print(f"Best cross-validation RMSE during tuning (on train data):  {study.best_value}")

Best hyperparameters found:  {'n_factors': 64, 'n_epochs': 17, 'lr_all': 0.008003611447873683, 'reg_all': 0.029269316641158172}
Best cross-validation RMSE during tuning (on train data):  0.8287785228953602


5. Fitting with Best and Base Model Parameters

In [10]:
# Build one trainset from every rating currently held by `data`
# (set A after the split above; set B stays out of training).
trainset = data.build_full_trainset()

In [11]:
# Train the final model with the tuned parameters on the full training set.
# NOTE(review): these pinned values are not the ones in the stored tuning
# output of this notebook (which reports n_factors=64, n_epochs=17, ...).
# Presumably they come from an earlier tuning run; confirm which set is
# intended, or read them from `study.best_params` instead.
best_params = dict(
    n_factors=85,
    n_epochs=18,
    lr_all=0.0070743582784896395,
    reg_all=0.040882549609105694,
)
best_algo = SVD(random_state=RANDOM_STATE, **best_params)
best_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d59e3cd210>

In [12]:
# Reference model for comparison: SVD with library-default hyperparameters,
# same seed and same training set as the tuned model.
base_algo = SVD(random_state=RANDOM_STATE).fit(trainset)  # fit() returns the fitted algorithm
base_algo

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d5a968c050>

6. Save models to folder

In [13]:
# Persist both fitted models as pickle files under data/models.
# NOTE: only load these files back with pickle if they come from a trusted source.
MODELS_DIR = os.path.join('data', 'models')
os.makedirs(MODELS_DIR, exist_ok=True)  # a fresh checkout may not have the folder yet

models_to_save = (
    ('best_SVD_model.pkl', best_algo),
    ('base_SVD_model.pkl', base_algo),
)
for model_file_name, fitted_model in models_to_save:
    with open(os.path.join(MODELS_DIR, model_file_name), 'wb') as f:
        pickle.dump(fitted_model, f)