In [69]:
import boto3
import botocore
import json
import s3fs
import ast
import pandas as pd
from pandas.io.json import json_normalize
from sagemaker import get_execution_role
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import Reader,Dataset,SVD,BaselineOnly,CoClustering
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from collections import defaultdict

In [2]:
with open('users_items.json') as f:
    lines = f.read().splitlines()

In [3]:
lst = [ast.literal_eval(line) for line in lines]

In [6]:
item_ratings = pd.json_normalize(data= lst, record_path = 'items', meta = ['user_id'])

In [7]:
item_ratings.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id
0,10,Counter-Strike,6,0,76561197970982479
1,20,Team Fortress Classic,0,0,76561197970982479
2,30,Day of Defeat,7,0,76561197970982479
3,40,Deathmatch Classic,0,0,76561197970982479
4,50,Half-Life: Opposing Force,0,0,76561197970982479


In [14]:
reader = Reader()

In [15]:
data = Dataset.load_from_df(item_ratings[['user_id','item_id','playtime_forever']],reader)

In [20]:
trainset, testset = train_test_split(data, test_size = 0.20)

Using SVD algorithm 

In [42]:
svd = SVD()

In [43]:
svd_pred =svd.fit(trainset).test(testset)


In [44]:
# code borrowed from surprise
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [47]:
get_top_n(svd_pred)

defaultdict(list,
            {'76561198048692161': [('224260', 5),
              ('9900', 5),
              ('222880', 5),
              ('200510', 5),
              ('304930', 5),
              ('56400', 5),
              ('4560', 5),
              ('8190', 5),
              ('12710', 5),
              ('4920', 5)],
             '76561198068755389': [('94400', 5), ('80', 5), ('215470', 5)],
             '76561198073259913': [('449540', 5),
              ('291550', 5),
              ('304930', 5),
              ('201790', 5),
              ('219740', 5),
              ('360870', 5),
              ('34270', 5),
              ('346900', 5)],
             '76561198083313051': [('6060', 5),
              ('32470', 5),
              ('244810', 5),
              ('240', 5),
              ('208090', 5),
              ('2400', 5),
              ('42160', 5),
              ('303550', 5),
              ('8980', 5),
              ('327890', 5)],
             'mendicant_bias': [('63710', 5),
    

In [48]:
# code borrowed from surprise
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


In [51]:
precisions, recalls = precision_recall_at_k(svd_pred)

In [54]:
precisions

{'76561198048692161': 1.0,
 '76561198068755389': 1.0,
 '76561198073259913': 0.625,
 '76561198083313051': 0.5,
 'mendicant_bias': 0.5,
 '76561198045020988': 0.5714285714285714,
 '76561198075333896': 0.8,
 '13lazer': 0.6,
 '76561197982807200': 0.5,
 '76561198060259445': 0.4,
 'nanakao': 0.5,
 '76561198034506282': 0.4,
 '76561198037077494': 0.7,
 '2571': 1.0,
 '76561198078377030': 0.375,
 '76561198090260579': 0.6,
 'thebluescarecrow1': 1.0,
 '76561198006530854': 0.2,
 'simmer4449': 0.8,
 '76561198077669692': 0.8,
 '76561198027292865': 0.8,
 'mazokn': 0.9,
 'MATTBOT12': 0.7,
 '76561198066450058': 0.7777777777777778,
 'Pika_Chan_Oat_11002': 0.6,
 '76561198059685385': 0.375,
 'Osiris5622': 0.6,
 '76561198034392042': 0.7,
 '76561198073975526': 0.7,
 'twunka': 0.4,
 '76561198035546541': 0.5,
 '76561198020280754': 0.4,
 '76561198038293072': 0.5,
 'gubigubbins': 0.5,
 '76561198071316761': 0.6,
 'dekiller112': 0.5,
 'Vaalguard': 0.8,
 'chocklanisi': 1.0,
 '76561198032576956': 1.0,
 '7656119806230

Average Precision for SVD

In [61]:
sum([v for v in precisions.values()])/len(precisions)

0.6542267118806763

Using CoClustering algorithm

In [70]:
cc = CoClustering()
cc_pred =cc.fit(trainset).test(testset)

In [71]:
get_top_n(cc_pred)

defaultdict(list,
            {'76561198048692161': [('224260', 5),
              ('9900', 5),
              ('222880', 5),
              ('200510', 5),
              ('304930', 5),
              ('56400', 5),
              ('4560', 5),
              ('8190', 5),
              ('12710', 5),
              ('4920', 5)],
             '76561198068755389': [('94400', 5), ('80', 5), ('215470', 5)],
             '76561198073259913': [('449540', 5),
              ('291550', 5),
              ('304930', 5),
              ('201790', 5),
              ('219740', 5),
              ('360870', 5),
              ('34270', 5),
              ('346900', 5)],
             '76561198083313051': [('240', 5),
              ('8980', 5),
              ('49520', 5),
              ('105600', 5),
              ('437220', 5),
              ('345610', 5),
              ('334230', 5),
              ('6060', 1),
              ('32470', 1),
              ('244810', 1)],
             'mendicant_bias': [('63710', 5),
  

In [72]:
precisions1, recalls1 = precision_recall_at_k(cc_pred)

Average Precision for CoClustering

In [73]:
sum([v for v in precisions1.values()])/len(precisions1)

0.7100886561230149