# Model based collaborative system

In [12]:
# import core modules
import pandas as pd 
import numpy as np
from surprise import Reader, Dataset
from surprise import SVD
from surprise.accuracy import rmse, mae
from collections import defaultdict
import matplotlib.pyplot as plt

from IPython.display import Image
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
# rating dataset
ratings = pd.read_csv ('data_movie/ratings.csv')

# movies dataset
movies = pd.read_csv('data_movie/movies.csv')

## Pre-processamento

In [3]:
dict_movies = {}
for index, row in movies.iterrows():
    dict_movies[row['movieId']] = (row['title'], row['genres'])

In [4]:
# adding corresponded data columns
new_cols = np.zeros((ratings.shape[0] , 2)).astype('object')

for index, row in ratings.iterrows():
    new_cols[index][0] = dict_movies[int(row['movieId'])][0]
    new_cols[index][1] = dict_movies[int(row['movieId'])][1]

In [5]:
# final dataframe
aux = np.append(ratings.to_numpy(),new_cols, axis=1)
movielens_dataframe = pd.DataFrame(aux, columns = ['userId', 'movieId', 'rating', 'timestamp','title', 'genres'])
movielens_dataframe.drop(['timestamp'], axis=1, inplace=True)

In [6]:
movielens_dataframe['userId'] = movielens_dataframe['userId'].astype('int32')
movielens_dataframe['movieId'] = movielens_dataframe['movieId'].astype('int64')
movielens_dataframe['rating'] = movielens_dataframe['rating'].astype('int16')

In [54]:
# number of ratings per user
counts = movielens_dataframe['userId'].value_counts()
counts_median = counts.median()
counts.describe()

count     610.000000
mean      165.304918
std       269.480584
min        20.000000
25%        35.000000
50%        70.500000
75%       168.000000
max      2698.000000
Name: userId, dtype: float64

In [55]:
print(counts_median)

70.5


In [7]:
# matriz de ratings-users
movie_matrix = movielens_dataframe.pivot_table(index='userId', columns='title', values='rating')

## Treinamento e Teste

In [9]:
# surprise dataset
reader = Reader()
dataset = Dataset.load_from_df(movielens_dataframe[['userId', 'movieId', 'rating']], reader)

In [10]:
# train set
train_set = dataset.build_full_trainset()

In [11]:
# svd model
svd = SVD()
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1754d229630>

In [12]:
# test set
test_set = train_set.build_anti_testset()

In [13]:
def get_top_n(pred, n=10):
    #Return the top-N recommendation for each user from a set of predictions.
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in pred:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [14]:
# predictions
pred = svd.test(test_set)

In [15]:
top_n = get_top_n(pred, n=5)

In [None]:
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

In [17]:
# metrics
results = (rmse(pred),mae(pred))

RMSE: 0.5126
MAE:  0.3972


In [None]:
# ir na tabela de filmes e retornar nome do filme e gênero


In [None]:
# mostrar que a base está balanceada


In [None]:
# talvez mexer nos parâmetros do SVD()
