# Model based collaborative system

[Top N-primeiros](https://surprise.readthedocs.io/en/stable/FAQ.html#how-to-get-the-top-n-recommendations-for-each-user)

In [21]:
# import core modules
import pandas as pd 
import numpy as np
from surprise import Reader, Dataset
from surprise import SVD
from surprise.accuracy import rmse, mae
from collections import defaultdict

from IPython.display import Image
# import warnings
# warnings.filterwarnings('ignore')

In [3]:
# rating dataset
ratings = pd.read_csv ('data_movie/ratings.csv')

# movies dataset
movies = pd.read_csv('data_movie/movies.csv')

In [4]:
dict_movies = {}
for index, row in movies.iterrows():
    dict_movies[row['movieId']] = (row['title'], row['genres'])

In [5]:
# adding corresponded data columns
new_cols = np.zeros((ratings.shape[0] , 2)).astype('object')

for index, row in ratings.iterrows():
    new_cols[index][0] = dict_movies[int(row['movieId'])][0]
    new_cols[index][1] = dict_movies[int(row['movieId'])][1]

In [6]:
# final dataframe
aux = np.append(ratings.to_numpy(),new_cols, axis=1)
movielens_dataframe = pd.DataFrame(aux, columns = ['userId', 'movieId', 'rating', 'timestamp','title', 'genres'])
movielens_dataframe.drop(['timestamp'], axis=1, inplace=True)

In [7]:
movielens_dataframe['userId'] = movielens_dataframe['userId'].astype('int32')
movielens_dataframe['movieId'] = movielens_dataframe['movieId'].astype('int64')
movielens_dataframe['rating'] = movielens_dataframe['rating'].astype('int16')

In [8]:
# matriz de ratings-users
movie_matrix = movielens_dataframe.pivot_table(index='userId', columns='title', values='rating')

In [9]:
# surprise dataset
reader = Reader()
dataset = Dataset.load_from_df(movielens_dataframe[['userId', 'movieId', 'rating']], reader)

In [10]:
# train set
train_set = dataset.build_full_trainset()

In [11]:
# svd model
svd = SVD()
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1754d229630>

In [12]:
# test set
test_set = train_set.build_anti_testset()

In [13]:
def get_top_n(pred, n=10):
    #Return the top-N recommendation for each user from a set of predictions.
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in pred:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [14]:
# predictions
pred = svd.test(test_set)

In [15]:
top_n = get_top_n(pred, n=5)

In [26]:
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

1 [318, 58559, 112552, 898, 899]
2 [50, 1204, 858, 541, 1217]
3 [119145, 2138, 80906, 4027, 1235]
4 [913, 1148, 1270, 2924, 3328]
5 [4973, 7361, 898, 1201, 922]
6 [1704, 1035, 81834, 898, 1272]
7 [1197, 318, 1221, 1223, 527]
8 [720, 2791, 912, 1228, 2160]
9 [750, 318, 527, 2959, 1089]
10 [720, 5060, 68358, 106642, 246]
11 [2160, 6016, 1235, 899, 168252]
12 [110, 356, 527, 1197, 1208]
13 [898, 1704, 4973, 318, 356]
14 [858, 1204, 1276, 81834, 260]
15 [1394, 1272, 1197, 50, 5060]
16 [1258, 2160, 1237, 246, 475]
17 [4973, 720, 1208, 1148, 1250]
18 [922, 720, 106642, 1242, 3000]
19 [4973, 7153, 858, 318, 1104]
20 [2858, 318, 527, 109374, 2396]
21 [1204, 922, 720, 45722, 319]
22 [2571, 4973, 88163, 2028, 296]
23 [318, 1272, 914, 1267, 1235]
24 [2329, 1204, 750, 2858, 914]
25 [2502, 318, 48516, 750, 858]
26 [260, 858, 318, 898, 750]
27 [899, 318, 2019, 110, 1204]
28 [1204, 930, 2788, 750, 1225]
29 [4973, 1394, 1148, 1193, 1258]
30 [50, 296, 593, 1208, 720]
31 [318, 922, 44195, 898, 1225]
32 

In [17]:
# metrics
results = (rmse(pred),mae(pred))

RMSE: 0.5126
MAE:  0.3972
