# Running SAR on MovieLens (Single Node)

SAR is a fast, scalable, adaptive algorithm for personalized recommendations based on user transaction history and item descriptions. It produces recommendations that are easy to explain and interpret, and it handles "cold item" and "semi-cold user" scenarios.

In [39]:
# set the environment path to find Recommenders
import sys
sys.path.append("..")

from utilities.recommender.sar.sar_singlenode import SARSingleNodeReference
from utilities.dataset.url_utils import maybe_download
#TODO from utilities.dataset.splitters_python import
#TODO from utilities.evaluation.python_evaluation import PythonRankingEvaluation, PythonRatingEvaluation

import datetime
import itertools
import pandas as pd

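First, let's download the MovieLens 100K ratings file (`u.data`, a tab-separated file with one rating per row: user ID, movie ID, rating, timestamp):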

In [57]:
# Fetch the MovieLens 100K ratings file (maybe_download presumably skips the
# download when the file is already present -- confirm in url_utils).
filepath = maybe_download('http://files.grouplens.org/datasets/movielens/ml-100k/u.data', 'ml-100k.data')
# Read from the path returned by maybe_download instead of repeating the file
# name, so the load keeps working if the download location ever changes.
# u.data has no header row, so the column names are supplied here.
data = pd.read_csv(filepath, sep='\t', names=['UserId', 'MovieId', 'Rating', 'Timestamp'])

In [58]:
# Preview the first rows to confirm the four columns were parsed as expected.
display(data.head())

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


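Next, we split the data into training and test sets. The Random Splitter utility function is not wired in yet (see the TODO below), so for now the first 80% of the rows are used for training and the remaining 20% for testing: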

In [78]:
## TODO fix the splitter
# Positional hold-out split: the first TRAIN_RATIO of the rows train the model,
# the rest are kept for testing. The split point is derived from the data size
# rather than hard-coded, so it still works if a different dataset is loaded.
TRAIN_RATIO = 0.8
n_train = int(len(data) * TRAIN_RATIO)

# .copy() makes each split an independent frame. Bare slices of `data` are
# presumably what triggers the SettingWithCopyWarning when the model adds
# columns to its input -- confirm against sar_singlenode.
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [59]:
# Unix epoch (1970-01-01 00:00:00) as a naive datetime. Written as a literal
# because datetime.utcfromtimestamp() is deprecated since Python 3.12; the
# resulting value is identical.
EPOCH = datetime.datetime(1970, 1, 1)

In [77]:
# Tells SAR which DataFrame column plays which role.
header = dict(
    col_user="UserId",
    col_item="MovieId",
    col_rating="Rating",
    col_timestamp="Timestamp",
)

# Single-node reference SAR using Jaccard item similarity with time decay on.
# NOTE(review): the unit of time_decay_coefficient and the exact effect of
# remove_seen / time_now=None are defined in sar_singlenode -- confirm there.
model = SARSingleNodeReference(
    remove_seen=True,
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

In [61]:
# Distinct raw user and item IDs, taken from the full dataset (not just one
# split), in order of first appearance.
unique_users, unique_items = (
    data[header[role]].unique() for role in ("col_user", "col_item")
)

In [63]:
# Map every raw user / item ID to a dense, contiguous integer index (0..N-1).
# This is an ordered enumeration rather than a true hash; it keeps the
# matrices held in memory as small as possible.
#
# tee() yields two independent single-pass iterators over the same
# (index, id) pairs: the `_1` copies are consumed here, the `_2` copies are
# left untouched for later use.
enumerate_users_1, enumerate_users_2 = itertools.tee(enumerate(unique_users))
enumerate_items_1, enumerate_items_2 = itertools.tee(enumerate(unique_items))

# raw ID -> matrix index
user_map_dict = dict((raw_id, idx) for idx, raw_id in enumerate_users_1)
item_map_dict = dict((raw_id, idx) for idx, raw_id in enumerate_items_1)

In [64]:
# Reverse of the ID maps: matrix index -> original ID.
# Built directly from the ID arrays instead of from one-shot iterators, so
# re-running this cell gives the same dictionaries every time (an exhausted
# iterator would silently produce empty ones).
index2user = dict(enumerate(unique_users))
index2item = dict(enumerate(unique_items))

In [65]:
# Hand the ID universe and both lookup directions (ID -> index, index -> ID)
# to the model; SAR's matrix operations work on these dense indices.
model.set_index(
    unique_users,
    unique_items,
    user_map_dict,
    item_map_dict,
    index2user,
    index2item,
)

In [75]:
# Train SAR on the training split.
model.fit(train)
# Produce recommendations from the test split; the result is a DataFrame with
# UserId, MovieId and prediction columns.
# NOTE(review): presumably scores the users found in `test` -- confirm in
# sar_singlenode.
top_k = model.recommend_k_items(test)

INFO:utilities.recommender.sar.sar_singlenode:Collecting user affinity matrix...
INFO:utilities.recommender.sar.sar_singlenode:Deduplicating the user-item counts
INFO:utilities.recommender.sar.sar_singlenode:Creating index columns...
INFO:utilities.recommender.sar.sar_singlenode:Building user affinity sparse matrix...
INFO:utilities.recommender.sar.sar_singlenode:Calculating item cooccurrence...
INFO:utilities.recommender.sar.sar_singlenode:Calculating item similarity...
INFO:utilities.recommender.sar.sar_singlenode:Calculating jaccard...
  return np.true_divide(self.todense(), other)
  return np.true_divide(self.todense(), other)
INFO:utilities.recommender.sar.sar_singlenode:Calculating recommendation scores...
INFO:utilities.recommender.sar.sar_singlenode:done training
INFO:utilities.recommender.sar.sar_singlenode:Converting to dense matrix...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

In [76]:
# Show the recommendations with their predicted scores.
display(top_k)

Unnamed: 0,UserId,MovieId,prediction
2358,450,423,327.310061
2359,450,186,315.724112
2357,450,69,315.578446
2356,450,234,308.525880
2353,450,22,300.044321
3637,276,161,299.768093
2354,450,228,298.917211
2355,450,732,290.929073
3639,276,216,289.116409
633,416,4,287.602372
