# Surprise SVD
To benchmark our manual SVD implementation, we'll train the SVD model from `surprise` on our data.

In [1]:
%%capture
import sys
import os

# Put the parent of the current working directory on sys.path (once) so the
# project-local `utils` package can be imported by the lines that follow.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
# import packages
from utils.imports import *
# import user-defined funcs and classes
from utils.helpers import plot_heatmap
from utils.helpers import get_top_n
from utils.helpers import top_n_coverage

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute code embedded in the file; only point this
    # at artifacts produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# The train / validation / baseline dataframes are stored together in one
# pickled mapping; pull each one out under its own name.
data = _load_pickle("../data/dataframes.pkl")
train, validation, baseline = (
    data[key] for key in ("train", "validation", "baseline")
)

# Sparse matrix saved in .npz form
ui_csr = load_npz("../data/ui_csr.npz")

# Encoders and id maps saved as pickles under ../artifacts
user_encoder = _load_pickle("../artifacts/user_encoder.pkl")
item_encoder = _load_pickle("../artifacts/item_encoder.pkl")
user_map = _load_pickle("../artifacts/user_map.pkl")
item_map = _load_pickle("../artifacts/item_map.pkl")

In [3]:
# import surprise tools
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.model_selection import KFold

In [4]:
# Tell Surprise the bounds of the rating scale (0 through 5)
reader = Reader(rating_scale=(0, 5))

# Wrap the training frame as a Surprise dataset; the columns are passed in
# user, item, rating order.
rating_columns = ["user_idx", "item_idx", "review_overall"]
train_ds = Dataset.load_from_df(train[rating_columns], reader)

# Turn the whole dataset into a single trainset (nothing held out)
trainset = train_ds.build_full_trainset()

In [5]:
# Fixed seed so that the fold assignment and SVD's random factor
# initialisation are repeatable across kernel restarts.
SEED = 42

# set parameter grid
param_grid = {
    "reg_all": [0.001, 0.02, 0.1],
    "n_factors": [10, 25, 50, 75],
    # Single-valued entry: it does not enlarge the grid (still 12 candidates),
    # it only pins the RNG each candidate SVD uses, including inside the
    # joblib worker processes.
    "random_state": [SEED],
}

# Cross-validated search scored on RMSE and MAE. An integer `cv=3` would build
# a shuffled 3-fold split from an unseeded RNG; the explicit KFold keeps the
# same 3 shuffled folds but makes the shuffle deterministic.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mae"],
    cv=KFold(n_splits=3, random_state=SEED, shuffle=True),
    n_jobs=-1,
    joblib_verbose=3,
)

In [6]:
# Run the cross-validated search: every combination in the parameter grid is
# fitted and scored on each fold of `train_ds`.
gs.fit(train_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  34 out of  36 | elapsed:   42.0s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   43.2s finished


In [7]:
# Winning parameter combination and its mean CV score, first for RMSE and
# then for MAE. Left as the last expression so the tuple is displayed.
(
    gs.best_params["rmse"],
    gs.best_score["rmse"],
    gs.best_params["mae"],
    gs.best_score["mae"],
)

({'reg_all': 0.02, 'n_factors': 10},
 0.5937020914525424,
 {'reg_all': 0.02, 'n_factors': 10},
 0.44486066939556607)

In [8]:
# Build the final model from the hyper-parameters the grid search ranked best
# on RMSE. They are read from `gs` instead of being retyped by hand, so the
# model cannot drift from the search result when the search is re-run.
# `random_state` gets a fixed default so the fit is repeatable; if the grid
# itself carries a `random_state`, that value takes precedence.
surprise_SVD = SVD(**{"random_state": 42, **gs.best_params["rmse"]})

In [9]:
# Train on the full training set built earlier. As the cell's last expression,
# the returned model object is what the cell displays.
surprise_SVD.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12bf086e0>

In [10]:
# Build the list of (user, item, true rating) triples that the model's
# `test` method is given, one triple per validation row.
valset = list(
    zip(*(validation[col] for col in ("user_idx", "item_idx", "review_overall")))
)

In [11]:
# Predict every validation triple with the tuned model, then compute both
# error metrics silently and report them on one line.
preds = surprise_SVD.test(valset)
rmse, mae = (
    metric(preds, verbose=False) for metric in (accuracy.rmse, accuracy.mae)
)
print(f"RMSE={rmse:.4f}, MAE={mae:.4f}")

RMSE=0.6919, MAE=0.5165


In [12]:
# Report top-10 catalog coverage for the tuned model. The k / reg values in
# the label are read back from the model object instead of being typed in, so
# the label cannot disagree with the model that was actually trained.
# NOTE(review): `n_factors` and `reg_pu` are attributes set by Surprise's SVD
# constructor (`reg_pu` falls back to `reg_all` when no per-parameter value is
# given) -- confirm against the installed Surprise version.
print(
    f"Top 10 item catalog coverage with k={surprise_SVD.n_factors}, "
    f"reg={surprise_SVD.reg_pu} is {top_n_coverage(surprise_SVD, trainset, N=10)}"
)

Top 10 item catalog coverage with k=10, reg=0.02 is 0.15042649658034274


In [13]:
# Comparison model with 50 latent factors. `reg_all` is spelled out (0.02 is
# also Surprise's default) so the comparison does not silently depend on a
# library default, and `random_state` is fixed so the fit -- and the metrics
# derived from it -- are repeatable.
test_model = SVD(n_factors=50, reg_all=0.02, random_state=42)
# Last expression of the cell: the fitted model object is displayed.
test_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12b4dd880>

In [14]:
# Same validation scoring as for the tuned model, now for the comparison
# model. Note that this rebinds `preds`, `rmse` and `mae`.
preds = test_model.test(valset)
rmse = accuracy.rmse(predictions=preds, verbose=False)
mae = accuracy.mae(predictions=preds, verbose=False)
print(f"RMSE={rmse:.4f}, MAE={mae:.4f}")

RMSE=0.6926, MAE=0.5165


In [16]:
# Report top-10 catalog coverage for the comparison model. The k / reg values
# in the label are read back from the model object instead of being typed in,
# so the label always describes the model that was actually trained.
# NOTE(review): `n_factors` and `reg_pu` are attributes set by Surprise's SVD
# constructor (`reg_pu` falls back to `reg_all` when no per-parameter value is
# given) -- confirm against the installed Surprise version.
print(
    f"Top 10 item catalog coverage with k={test_model.n_factors}, "
    f"reg={test_model.reg_pu} is {top_n_coverage(test_model, trainset, N=10)}"
)

Top 10 item catalog coverage with k=50, reg=0.02 is 0.6530008453085376
