# Model based collaborative system

In [19]:
# import core modules
import pandas as pd 
import numpy as np
from surprise import Reader, Dataset
from surprise import SVD
from surprise.accuracy import rmse, mae
from collections import defaultdict
import matplotlib.pyplot as plt

import matplotlib.image as mpimg
from IPython.display import display, Image

# import warnings
# warnings.filterwarnings('ignore')

In [20]:
# rating dataset
ratings = pd.read_csv ('data_movie/ratings.csv')

# movies dataset
movies = pd.read_csv('data_movie/movies.csv')

## Pre-processamento

In [21]:
dict_movies = {}
for index, row in movies.iterrows():
    dict_movies[row['movieId']] = (row['title'], row['genres'])

In [22]:
# adding corresponded data columns
new_cols = np.zeros((ratings.shape[0] , 2)).astype('object')

for index, row in ratings.iterrows():
    new_cols[index][0] = dict_movies[int(row['movieId'])][0]
    new_cols[index][1] = dict_movies[int(row['movieId'])][1]

In [23]:
# final dataframe
aux = np.append(ratings.to_numpy(),new_cols, axis=1)
movielens_dataframe = pd.DataFrame(aux, columns = ['userId', 'movieId', 'rating', 'timestamp','title', 'genres'])
movielens_dataframe.drop(['timestamp'], axis=1, inplace=True)

In [24]:
movielens_dataframe['userId'] = movielens_dataframe['userId'].astype('int32')
movielens_dataframe['movieId'] = movielens_dataframe['movieId'].astype('int64')
movielens_dataframe['rating'] = movielens_dataframe['rating'].astype('float32')

In [None]:
movielens_dataframe.head(15)

In [None]:
# matriz de ratings-users
movie_matrix = movielens_dataframe.pivot_table(index='userId', columns='title', values='rating')
movie_matrix.sample(20)

### Distribuição dos Dados

In [None]:
# number of ratings per user
counts = movielens_dataframe['userId'].value_counts()
counts_median = counts.median()
counts.sample(20)

In [44]:
print("Mediana: "+ str(counts_median))

Mediana: 70.5


## Treinamento e Teste

In [71]:
# surprise dataset
reader = Reader()
dataset = Dataset.load_from_df(movielens_dataframe[['userId', 'movieId', 'rating']], reader)

In [72]:
# train set
train_set = dataset.build_full_trainset()

In [73]:
# test set
test_set = train_set.build_anti_testset()

In [68]:
results_dict = {}
epochs = [5,10,20]
factors = [10,50,100]
reg = [0.2,0.4]

for ep in epochs:
    for fac in factors:
        for r in reg:
            svd = SVD(n_epochs=ep,reg_all=r,n_factors=fac, random_state=0).fit(train_set)
            pred = svd.test(test_set)
            results = (rmse(pred),mae(pred))
            results_dict[f'{ep}, {fac}, {r}'] = results
            
                

RMSE: 0.3270
MAE:  0.2524
RMSE: 0.2928
MAE:  0.2279
RMSE: 0.3287
MAE:  0.2539
RMSE: 0.2940
MAE:  0.2288
RMSE: 0.3308
MAE:  0.2559
RMSE: 0.2954
MAE:  0.2302
RMSE: 0.3784
MAE:  0.2947
RMSE: 0.3364
MAE:  0.2630
RMSE: 0.3783
MAE:  0.2946
RMSE: 0.3362
MAE:  0.2629
RMSE: 0.3784
MAE:  0.2949
RMSE: 0.3362
MAE:  0.2630
RMSE: 0.4264
MAE:  0.3313
RMSE: 0.3765
MAE:  0.2938
RMSE: 0.4252
MAE:  0.3304
RMSE: 0.3760
MAE:  0.2934
RMSE: 0.4241
MAE:  0.3297
RMSE: 0.3755
MAE:  0.2931


In [69]:
results_dict

{'5, 10, 0.2': (0.3269578190331457, 0.25241615261094297),
 '5, 10, 0.4': (0.2928430029616927, 0.22786435957133147),
 '5, 50, 0.2': (0.3286726205563529, 0.2538675540411563),
 '5, 50, 0.4': (0.29395538565663326, 0.22878829147545565),
 '5, 100, 0.2': (0.3307846381929948, 0.255931454529045),
 '5, 100, 0.4': (0.29535708306023684, 0.2301632858806637),
 '10, 10, 0.2': (0.378381735886297, 0.2947035711746248),
 '10, 10, 0.4': (0.33635323524183813, 0.2630333575756857),
 '10, 50, 0.2': (0.37834115451091915, 0.294623275173001),
 '10, 50, 0.4': (0.3362270474109714, 0.26291478907199156),
 '10, 100, 0.2': (0.378389481867313, 0.29485182700236395),
 '10, 100, 0.4': (0.33622484600763614, 0.2630178779833007),
 '20, 10, 0.2': (0.4263643701061662, 0.3312594925439114),
 '20, 10, 0.4': (0.3765214636767856, 0.2937587898857526),
 '20, 50, 0.2': (0.42522469243267247, 0.3304230574783003),
 '20, 50, 0.4': (0.3759873267808544, 0.2933753165251539),
 '20, 100, 0.2': (0.4240777477426761, 0.32972643203871005),
 '20, 1

In [76]:
# svd model
svd = SVD(n_epochs=5,reg_all=0.4,n_factors=10, random_state=0)
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9b8f2af2b0>

In [79]:
def get_top_n(pred, n=10):
    #Return the top-N recommendation for each user from a set of predictions.
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in pred:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [80]:
# predictions
pred = svd.test(test_set)

In [81]:
top_n = get_top_n(pred, n=5)

In [82]:
# metrics
results = (rmse(pred),mae(pred))

RMSE: 0.2928
MAE:  0.2279


In [None]:
image=[0,0,0,0,0]
img_source=[0,0,0,0,0]
#make new figure with 2 subfigures
#each subfigure can have an image in it
fig = plt.figure(figsize=(30,65))

user = 5;

for i in range(5):
    
    k = movies[movies['movieId']==top_n[user][i][0]].index
    image[i] = plt.subplot(1,5,i+1, title = movies['title'][k].tolist()[0])
    img_source[i] = mpimg.imread('image/' + str(top_n[user][i][0])+'.jpg')
    _ = image[i].imshow(img_source[i])
    
    image[i].axis("off")

plt.show()

In [100]:
for i in range(5):
    
    k = movies[movies['movieId']==top_n[user][i][0]].index
    print(movies['genres'][k].tolist()[0])
    

Drama|Western
Crime|Drama
Action|Adventure|Drama|War
Crime|Horror|Thriller
Drama


In [53]:
# mostrar que a base está balanceada

# em relação ao número de avaliações por filme
mv = movielens_dataframe.drop(['title','genres','userId'], axis=1)
# "id do filme" vs número de avaliações
mv = mv.groupby(['movieId']).count()
mv.tail()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
193581,1
193583,1
193585,1
193587,1
193609,1


In [59]:
mv_median = mv.median().tolist()[0]
print(f"Mediana: {mv_median}")

Mediana: 3.0


In [61]:
# em relação ao número de filmes avaliados para cada nota 
rt = movielens_dataframe.drop(['title','genres','userId'], axis=1)
# notas vs número de vezes que aparece
rt = rt.groupby(['rating']).count()
rt

Unnamed: 0_level_0,movieId
rating,Unnamed: 1_level_1
0.5,1370
1.0,2811
1.5,1791
2.0,7551
2.5,5550
3.0,20047
3.5,13136
4.0,26818
4.5,8551
5.0,13211


In [99]:
movielens_dataframe[(movielens_dataframe['userId']==5) & (movielens_dataframe['rating']>3.5)]

Unnamed: 0,userId,movieId,rating,title,genres
516,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
517,5,21,4.0,Get Shorty (1995),Comedy|Crime|Thriller
518,5,34,4.0,Babe (1995),Children|Drama
519,5,36,4.0,Dead Man Walking (1995),Crime|Drama
521,5,50,4.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
522,5,58,5.0,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance
523,5,110,4.0,Braveheart (1995),Action|Drama|War
526,5,232,4.0,Eat Drink Man Woman (Yin shi nan nu) (1994),Comedy|Drama|Romance
527,5,247,5.0,Heavenly Creatures (1994),Crime|Drama
529,5,261,4.0,Little Women (1994),Drama


Usuario 5:

generos dos filmes recomendados:

Drama|Western
Crime|Drama
Action|Adventure|Drama|War
Crime|Horror|Thriller
Drama


filmes com notas >= 4:

14/20 filmes de drama
7/20 Crime
5/20 thriller
3/20 crime/thriller