In [2]:
%load_ext autoreload
%autoreload 2

import logging
import numpy as np
import pandas as pd
import scrapbook as sb
from sklearn.preprocessing import minmax_scale

from recommenders.utils.python_utils import binarize
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    logloss,
    rsquared,
    exp_var
)
from recommenders.models.sar import SAR
import sys

# Record the interpreter and pandas versions this notebook was run with.
print(
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
    sep="\n",
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
System version: 3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]
Pandas version: 1.3.4


In [3]:
# Number of top items to recommend; also passed as k to the *_at_k ranking metrics
TOP_K = 10

# Data size label.
# NOTE(review): the ratings used in this notebook are read from an Amazon Digital Music CSV,
# not MovieLens, and this constant is not referenced in the visible cells —
# confirm whether it is still needed.
AMAZON_DATA_SIZE = '100k'

In [4]:
# Read the ratings CSV and cast the rating column to 32-bit floats in one chained expression.
df = pd.read_csv('ratings_Digital_Music_ordered.csv').astype({'rating': np.float32})

# Display the first rows as the cell output.
df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,A1YS9MDZP93857,6428320,3.0,1394496000
1,A3TS466QBAWB9D,14072149,5.0,1370476800
2,A3BUDYITWUSIS7,41291905,5.0,1381708800
3,A19K10Z0D2NTZK,41913574,5.0,1285200000
4,A14X336IB4JD89,201891859,1.0,1350432000


In [9]:
# Stratified train/test split of the ratings; the seed is fixed so the split is repeatable.
# NOTE(review): the split summary printed in the next cell shows 988 train ratings against
# only 12 test ratings, so every evaluation metric rests on very few rows —
# confirm that ratio=0.51 is intended for this dataset.
train, test = python_stratified_split(
    df,
    ratio=0.51,
    col_user='userID',
    col_item='itemID',
    seed=42,
)

In [12]:
# Text layout of the split summary; placeholders are filled from the dict below.
split_report_template = """
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
"""

# Row count plus number of distinct users and items on each side of the split.
split_report_values = dict(
    train_total=len(train),
    train_users=len(train['userID'].unique()),
    train_items=len(train['itemID'].unique()),
    test_total=len(test),
    test_users=len(test['userID'].unique()),
    test_items=len(test['itemID'].unique()),
)

print(split_report_template.format(**split_report_values))


Train:
Total Ratings: 988
Unique Users: 988
Unique Items: 58

Test:
Total Ratings: 12
Unique Users: 12
Unique Items: 4



In [13]:
# Configure logging so timestamped records at DEBUG level and above are emitted.
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.DEBUG,
)

# SAR recommender over the ratings columns, with Jaccard item similarity,
# time-decayed affinities and score normalization switched on.
# NOTE(review): the recommendation scores previewed after prediction are on the order of
# 1e-56; time_decay_coefficient is presumably a half-life in days — confirm that 30
# suits the timestamp range of this dataset.
model = SAR(
    similarity_type="jaccard",
    time_decay_coefficient=30,
    timedecay_formula=True,
    normalize=True,
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
)

In [14]:
# Fit the SAR model on the training split inside a Timer context so the call is timed.
with Timer() as train_time:
    model.fit(train)

# Report the elapsed time exposed by the Timer's interval attribute.
print("Took {} seconds for training.".format(train_time.interval))

2021-11-28 00:04:48,601 INFO     Collecting user affinity matrix
2021-11-28 00:04:48,608 INFO     Calculating time-decayed affinities
2021-11-28 00:04:48,633 INFO     Creating index columns
2021-11-28 00:04:48,644 INFO     Calculating normalization factors
2021-11-28 00:04:48,670 INFO     Building user affinity sparse matrix
2021-11-28 00:04:48,674 INFO     Calculating item co-occurrence
2021-11-28 00:04:48,685 INFO     Calculating item similarity
2021-11-28 00:04:48,687 INFO     Using jaccard based similarity
2021-11-28 00:04:48,691 INFO     Done training


Took 0.09604339999998501 seconds for training.


In [15]:
# Produce recommendations for the users in the test split, excluding items each user
# has already seen, and time the call.
# top_k is passed explicitly so the length of each recommendation list always follows
# the TOP_K constant used as k by the ranking metrics, instead of silently relying on
# the method's default value.
with Timer() as test_time:
    top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

# Report the elapsed time exposed by the Timer's interval attribute.
print("Took {} seconds for prediction.".format(test_time.interval))

2021-11-28 00:04:55,800 INFO     Calculating recommendation scores
2021-11-28 00:04:55,806 INFO     Removing seen items


Took 0.026640900000018064 seconds for prediction.


In [16]:
# Preview the first recommendation rows (user, item and score) as the cell output.
top_k.head()

Unnamed: 0,userID,itemID,prediction
0,A15STV0U0MEAS0,739040375,1.605736e-56
1,A15STV0U0MEAS0,739045067,1.605736e-56
2,A15STV0U0MEAS0,739064525,1.605736e-56
3,A15STV0U0MEAS0,767851013,1.605736e-56
4,A15STV0U0MEAS0,634061801,1.605736e-56


In [17]:
# Mean average precision of the recommendations against the test split, cut off at TOP_K.
eval_map = map_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [18]:
# Normalized discounted cumulative gain of the recommendations against the test split, cut off at TOP_K.
eval_ndcg = ndcg_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [19]:
# Precision of the recommendations against the test split, cut off at TOP_K.
eval_precision = precision_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [20]:
# Recall of the recommendations against the test split, cut off at TOP_K.
eval_recall = recall_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [21]:
# Root mean squared error between the test ratings and the scores in top_k.
# NOTE(review): top_k holds the output of recommend_k_items (recommendation scores),
# not rating predictions for the test pairs — confirm this metric is meaningful here.
eval_rmse = rmse(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
)

In [22]:
# Mean absolute error between the test ratings and the scores in top_k.
# NOTE(review): top_k holds the output of recommend_k_items (recommendation scores),
# not rating predictions for the test pairs — confirm this metric is meaningful here.
eval_mae = mae(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
)

In [23]:
# R-squared of the scores in top_k with respect to the test ratings.
# NOTE(review): top_k holds the output of recommend_k_items (recommendation scores),
# not rating predictions for the test pairs — confirm this metric is meaningful here.
eval_rsquared = rsquared(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
)

In [24]:
# Explained variance of the scores in top_k with respect to the test ratings.
# NOTE(review): top_k holds the output of recommend_k_items (recommendation scores),
# not rating predictions for the test pairs — confirm this metric is meaningful here.
eval_exp_var = exp_var(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
)

In [28]:
# Build the report one entry per metric, then print the entries newline-separated.
# Log loss is not reported: no eval_logloss value is computed in the visible cells.
metric_lines = [
    "Model:\t",
    "Top K:\t%d" % TOP_K,
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    "RMSE:\t%f" % eval_rmse,
    "MAE:\t%f" % eval_mae,
    "R2:\t%f" % eval_rsquared,
    "Exp var:\t%f" % eval_exp_var,
]
print("\n".join(metric_lines))

Model:	
Top K:	10
MAP:	0.104167
NDCG:	0.119223
Precision@K:	0.016667
Recall@K:	0.166667
RMSE:	4.123106
MAE:	4.000000
R2:	-16.000000
Exp var:	0.000000
