# Surprise homework

In [8]:
import io 
import pandas as pd
from collections import defaultdict
from surprise import Dataset
from surprise import NormalPredictor
from surprise import SVD
from surprise import KNNBasic
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


In [2]:
data = Dataset.load_builtin('ml-100k')
K = 30
trainset, testset = train_test_split(data, test_size=.25)

In [3]:
algorithms = {
    "NP": NormalPredictor(),
    "KNN_cos": KNNBasic(k = K, sim_options = { 'name': 'cosine' }),
    "KNN_MSD": KNNBasic(k = K),
    "KNN_Pearson": KNNBasic(k = K, sim_options = { 'name': 'pearson' }),
    "SVD" : SVD()
}
RSMA = {}

In [4]:
for [name, algo] in algorithms.items():
    crv = cross_validate(algo, data, measures=['RMSE'], verbose=True)
    RSMA[name] = round(crv['test_rmse'].mean(), 3)


Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5114  1.5182  1.5106  1.5218  1.5249  1.5174  0.0056  
Fit time          0.11    0.14    0.15    0.18    0.12    0.14    0.02    
Test time         0.17    0.12    0.15    0.15    0.10    0.14    0.02    
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0243  1.0123  1.0135  1.0193  1.0274  1.0194  0.0059  
Fit time          1.51    1.54    1.61    1.69    1.64 

In [5]:
# найдем лучший алгоритм
print(RSMA)
bestAlgoName = min(RSMA.items(), key=lambda x: x[1])[0]
print(bestAlgoName)
bestAlgo = algorithms[bestAlgoName]
# и его предсказание
bestAlgo.fit(trainset)
predictions = bestAlgo.test(testset)

{'NP': 1.517, 'KNN_cos': 1.019, 'KNN_MSD': 0.977, 'KNN_Pearson': 1.014, 'SVD': 0.937}
SVD


In [9]:
# посчитаем метрики precision@k and recall@k для k=5 и порога отсечения 3.52
# как хорошо, что умные люди уже все сделали за меня: https://github.com/NicolasHug/Surprise/blob/master/examples/precision_recall_at_k.py
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=3.52)

# усредняем по всем пользователям
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
print(precision_at_k)
print(recall_at_k)

0.7389360197949812
0.36699134575915177


In [10]:
USER_INDEX = '23'
N = 5
userPredictions = list(filter(lambda x: x.uid == USER_INDEX, predictions))
top = sorted(userPredictions, key=lambda x: x.est, reverse=True)[0:N]
top

[Prediction(uid='23', iid='512', r_ui=5.0, est=4.512625178373093, details={'was_impossible': False}),
 Prediction(uid='23', iid='98', r_ui=5.0, est=4.455047919503794, details={'was_impossible': False}),
 Prediction(uid='23', iid='357', r_ui=3.0, est=4.4204486183683676, details={'was_impossible': False}),
 Prediction(uid='23', iid='427', r_ui=5.0, est=4.148467921061505, details={'was_impossible': False}),
 Prediction(uid='23', iid='124', r_ui=5.0, est=4.112627740945786, details={'was_impossible': False})]

In [11]:
# достаем инфу о фильмах из
def getInfoForList(l):
    print(l)
    data_path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    info = {}
    data = pd.read_csv(data_path, sep='|',encoding='ISO-8859-1', header = None) 
    for id in l:
        row = data.iloc[int(id)]
        info[id] = (row[1], row[2])

    return info
info = getInfoForList(list(map(lambda x: x.iid, top)))
print(info)

['512', '98', '357', '427', '124']
{'512': ('Third Man, The (1949)', '01-Jan-1949'), '98': ('Snow White and the Seven Dwarfs (1937)', '01-Jan-1937'), '357': ('Spawn (1997)', '01-Aug-1997'), '427': ('Harold and Maude (1971)', '01-Jan-1971'), '124': ('Phenomenon (1996)', '29-Jun-1996')}


In [12]:
# вывод результата
print('User {}'.format(USER_INDEX))
for pred in top:
    print('{}, {}, {}'.format(pred.iid, info[pred.iid], round(pred.est, 3)))

User 23
512, ('Third Man, The (1949)', '01-Jan-1949'), 4.513
98, ('Snow White and the Seven Dwarfs (1937)', '01-Jan-1937'), 4.455
357, ('Spawn (1997)', '01-Aug-1997'), 4.42
427, ('Harold and Maude (1971)', '01-Jan-1971'), 4.148
124, ('Phenomenon (1996)', '29-Jun-1996'), 4.113
