# Surprise homework

In [1]:
import io
from collections import defaultdict
from pathlib import Path

import pandas as pd

from surprise import Dataset
from surprise import SVD
from surprise import NormalPredictor
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


### Functions

In [2]:
# Use the helper functions from the FAQ
def precision_recall_at_k(predictions, k=5, threshold=3.52):
    """Return the mean precision@k and mean recall@k over all users.

    An item is *relevant* when its true rating is at least ``threshold`` and
    *recommended* when its estimated rating is at least ``threshold``.

    Args:
        predictions: Iterable of 5-element records
            ``(uid, iid, true_r, est, details)``. Only ``uid``, ``true_r``
            and ``est`` are used.
        k(int): Number of highest-estimated items inspected per user.
        threshold(float): Cut-off applied to both true and estimated ratings.

    Returns:
        A tuple ``(mean_precision, mean_recall)``, each obtained by averaging
        the per-user values with ``pandas.Series.mean``. A user for whom a
        metric is undefined (zero denominator) contributes 0 to that metric.
    """
    # Group (estimated, true) rating pairs by user.
    ratings_by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        ratings_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Highest estimates first; the first k entries form the top-k list.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among everything the user rated in `predictions`.
        n_rel = sum(1 for _, true_r in ranked if true_r >= threshold)

        # Items in the top k that would actually be recommended.
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)

        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(
            1 for est, true_r in top_k
            if true_r >= threshold and est >= threshold
        )

        # Precision@k is undefined when nothing is recommended; use 0.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@k is undefined when nothing is relevant; use 0.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return pd.Series(precisions).mean(), pd.Series(recalls).mean()

def get_top_n(predictions, n=5):
    """Collect the n highest-estimated items for every user.

    Args:
        predictions(list of Prediction objects): 5-element records
            ``(uid, iid, true_r, est, details)``, as returned by the test
            method of an algorithm.
        n(int): Number of recommendations to keep per user. Default is 5.

    Returns:
        A ``defaultdict(list)`` mapping each user (raw) id to a list of
        ``(raw item id, rating estimation)`` tuples, sorted by estimation in
        descending order and holding at most n entries. Because the result is
        a defaultdict, looking up an unknown user id yields an empty list.
    """
    # Group (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for uid, iid, _, est, _ in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's items by estimate and keep only the first n.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user


### Load data

In [3]:
# Load the built-in 'ml-100k' dataset through Surprise's Dataset loader.
data = Dataset.load_builtin('ml-100k')

### Define algorithms

In [4]:
# Study the documentation on metrics and algorithms carefully.

# NormalPredictor with default settings.
algoNormal = NormalPredictor()

# User-based KNNWithMeans, cosine similarity.
sim_options = dict(name='cosine', user_based=True)
algoKnnCos = KNNWithMeans(sim_options=sim_options)

# User-based KNNWithMeans, MSD similarity.
sim_options = dict(name='msd', user_based=True)
algoKnnMSD = KNNWithMeans(sim_options=sim_options)

# User-based KNNWithMeans, Pearson similarity.
sim_options = dict(name='pearson', user_based=True)
algoKnnPearson = KNNWithMeans(sim_options=sim_options)

# SVD with default hyper-parameters.
algoSVD = SVD()

### Select algorithm

In [5]:
# 5-fold cross-validation of algoNormal (NormalPredictor) on `data`, reporting RMSE per fold.
cross_validate(algoNormal, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5399  1.5324  1.5178  1.5340  1.5175  1.5283  0.0090  
Fit time          0.14    0.17    0.20    0.19    0.17    0.17    0.02    
Test time         0.18    0.17    0.19    0.18    0.17    0.18    0.01    


{'test_rmse': array([1.53985301, 1.53235122, 1.51776678, 1.5340468 , 1.51754645]),
 'fit_time': (0.13844847679138184,
  0.16936731338500977,
  0.1979990005493164,
  0.1867048740386963,
  0.17349696159362793),
 'test_time': (0.17641520500183105,
  0.1653914451599121,
  0.19125938415527344,
  0.17508459091186523,
  0.1706838607788086)}

In [6]:
# 5-fold cross-validation of algoKnnCos (KNNWithMeans, cosine similarity), reporting RMSE per fold.
cross_validate(algoKnnCos, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9556  0.9575  0.9589  0.9531  0.9536  0.9557  0.0022  
Fit time          2.23    3.63    3.17    3.20    3.33    3.11    0.47    
Test time         8.22    7.71    7.88    7.97    8.20    7.99    0.19    


{'test_rmse': array([0.95562738, 0.95750342, 0.95889793, 0.95308436, 0.9536051 ]),
 'fit_time': (2.22798752784729,
  3.6258299350738525,
  3.171563148498535,
  3.20062255859375,
  3.3251795768737793),
 'test_time': (8.218400001525879,
  7.708191633224487,
  7.8779847621917725,
  7.968299865722656,
  8.201122045516968)}

In [7]:
# 5-fold cross-validation of algoKnnMSD (KNNWithMeans, MSD similarity), reporting RMSE per fold.
cross_validate(algoKnnMSD, data, measures=['RMSE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9494  0.9494  0.9614  0.9454  0.9511  0.9513  0.0053  
Fit time          1.15    1.23    1.09    1.08    1.19    1.15    0.06    
Test time         8.60    8.43    8.25    7.87    8.20    8.27    0.25    


{'test_rmse': array([0.94943043, 0.94939631, 0.96135364, 0.94543624, 0.95107834]),
 'fit_time': (1.1484074592590332,
  1.2319283485412598,
  1.0875415802001953,
  1.0835304260253906,
  1.18617582321167),
 'test_time': (8.602067708969116,
  8.427074909210205,
  8.248584985733032,
  7.867818832397461,
  8.2032790184021)}

In [8]:
# 5-fold cross-validation of algoKnnPearson (KNNWithMeans, Pearson similarity), reporting RMSE per fold.
cross_validate(algoKnnPearson, data, measures=['RMSE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9553  0.9507  0.9558  0.9521  0.9374  0.9503  0.0067  
Fit time          3.85    4.05    4.41    4.03    4.07    4.08    0.18    
Test time         8.04    7.76    8.06    7.69    7.64    7.84    0.18    


{'test_rmse': array([0.95526425, 0.95069381, 0.95582841, 0.95207374, 0.93743979]),
 'fit_time': (3.8511600494384766,
  4.046712398529053,
  4.405938386917114,
  4.030872821807861,
  4.071309566497803),
 'test_time': (8.044228315353394,
  7.755129814147949,
  8.062565088272095,
  7.6933817863464355,
  7.636591196060181)}

In [9]:
# 5-fold cross-validation of algoSVD (SVD), reporting RMSE per fold.
cross_validate(algoSVD, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9369  0.9346  0.9382  0.9342  0.9357  0.9359  0.0015  
Fit time          9.33    9.61    9.43    9.32    9.42    9.42    0.10    
Test time         0.36    0.28    0.30    0.39    0.29    0.33    0.04    


{'test_rmse': array([0.93689259, 0.9345964 , 0.93821188, 0.93423274, 0.93573686]),
 'fit_time': (9.327729225158691,
  9.60753083229065,
  9.427449941635132,
  9.322168827056885,
  9.416839361190796),
 'test_time': (0.3629882335662842,
  0.2845344543457031,
  0.30234718322753906,
  0.39380502700805664,
  0.291412353515625)}

### Calculate precision@k and recall@k

In [20]:
# Hold out 25% of the ratings as a test set. A fixed random_state makes the
# split identical on every run, so the printed metrics are comparable between
# executions.
# NOTE(review): algoSVD was created without a random_state, so its fit may
# still vary between runs — seed it as well if exact reproducibility is needed.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

# Retrain SVD on the training part only and predict the held-out ratings.
algoSVD.fit(trainset)
predictions = algoSVD.test(testset)

# Prints the tuple (mean precision@k, mean recall@k) with the default k and threshold.
print(precision_recall_at_k(predictions))

(0.7485138004246286, 0.36809636393635464)


### Predict

In [21]:
# Raw user id (a string) whose recommendations are shown below.
variant = '12'
# Top-N (item id, estimated rating) pairs for that user. get_top_n returns a
# defaultdict, so an id absent from `predictions` yields an empty list.
user_prediction = get_top_n(predictions)[variant]


In [12]:
# Locate u.item (pipe-separated movie metadata) under the current user's home
# directory instead of hard-coding one machine's absolute path.
# NOTE(review): assumes Surprise's default data folder (~/.surprise_data) —
# adjust if the SURPRISE_DATA_FOLDER environment variable is set.
item_path = Path.home() / '.surprise_data' / 'ml-100k' / 'ml-100k' / 'u.item'

# The file has no header row; the first column (item id) becomes the index,
# so the remaining columns are labelled by their integer position.
item_df = pd.read_csv(item_path, sep='|', encoding='ISO-8859-1', header=None, index_col=0)
item_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,21,22,23
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Print the selected user's recommendations, one per line, as
# "<item id> ('<title>', '<release date>') <estimated rating>".
print("User " + variant)
for raw_iid, est in user_prediction:
    # Single row lookup per item; the index holds integer item ids, while the
    # ids in the predictions are strings. Column 1 is the title and column 2
    # the release date.
    film = item_df.loc[int(raw_iid)]
    # f-string formatting instead of "+" concatenation: the line is identical
    # for normal rows but no longer raises TypeError when a field is not a
    # str (e.g. a missing release date read as NaN).
    print(f"{raw_iid} ('{film[1]}', '{film[2]}') {round(est, 3)}")
    

User 12
318 ('Schindler's List (1993)', '01-Jan-1993') 5
480 ('North by Northwest (1959)', '01-Jan-1959') 4.879
15 ('Mr. Holland's Opus (1995)', '29-Jan-1996') 4.542
204 ('Back to the Future (1985)', '01-Jan-1985') 4.281
300 ('Air Force One (1997)', '01-Jan-1997') 4.263
