In [2]:
import sys
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]
Surprise version: 1.1.1


In [3]:
AMAZON_DATA_SIZE = '100k'

In [14]:
df = pd.read_csv('ratings_Digital_Music_ordered.csv',
                header=0,
                names=["userID", "itemID", "rating"])

df.head()

Unnamed: 0,userID,itemID,rating
A1YS9MDZP93857,6428320,3.0,1394496000
A3TS466QBAWB9D,14072149,5.0,1370476800
A3BUDYITWUSIS7,41291905,5.0,1381708800
A19K10Z0D2NTZK,41913574,5.0,1285200000
A14X336IB4JD89,201891859,1.0,1350432000


In [15]:
train, test = python_random_split(df, 0.75)

In [16]:
# 'reader' is being used to get rating scale (for MovieLens, the scale is [1, 5]).
# 'rating_scale' parameter can be used instead for the later version of surprise lib:
# https://github.com/NicolasHug/Surprise/blob/master/surprise/dataset.py
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader('ml-100k')).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x2404e4d3548>

In [17]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 0.4263572000000124 seconds for training.


In [18]:
predictions = predict(svd, test, usercol='userID', itemcol='itemID')
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,767851013,5.0,5
1,1384719342,4.0,5
2,1384719342,3.0,5
3,767851013,5.0,5
4,767851013,5.0,5


In [19]:
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(svd, train, usercol='userID', itemcol='itemID', remove_seen=True)
    
print("Took {} seconds for prediction.".format(test_time.interval))

Took 0.037629900000013095 seconds for prediction.


In [20]:
all_predictions.head()

Unnamed: 0,userID,itemID,prediction
750,739040375,1.0,5
751,739040375,2.0,5
752,739079883,3.0,5
753,577088726,4.0,5
754,577088726,3.0,5


In [21]:
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

k = 10
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)


print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

print('----')

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

RMSE:		1243692520.661067
MAE:		1236010528.655006
rsquared:	-80.199344
exp var:	0.000000
----
MAP:	0.143340
NDCG:	0.200175
Precision@K:	0.050000
Recall@K:	0.219017
