# Model based collaborative system

In [2]:
# import core modules
import pandas as pd 
import numpy as np
from surprise import Reader, Dataset
from surprise import SVD
from surprise.accuracy import rmse, mae
from collections import defaultdict
import matplotlib.pyplot as plt

import matplotlib.image as mpimg
from IPython.display import display, Image

# import warnings
# warnings.filterwarnings('ignore')

In [3]:
# rating dataset
ratings = pd.read_csv ('data_movie/ratings.csv')

# movies dataset
movies = pd.read_csv('data_movie/movies.csv')

## Pre-processamento

In [4]:
dict_movies = {}
for index, row in movies.iterrows():
    dict_movies[row['movieId']] = (row['title'], row['genres'])

In [5]:
# adding corresponded data columns
new_cols = np.zeros((ratings.shape[0] , 2)).astype('object')

for index, row in ratings.iterrows():
    new_cols[index][0] = dict_movies[int(row['movieId'])][0]
    new_cols[index][1] = dict_movies[int(row['movieId'])][1]

In [6]:
# final dataframe
aux = np.append(ratings.to_numpy(),new_cols, axis=1)
movielens_dataframe = pd.DataFrame(aux, columns = ['userId', 'movieId', 'rating', 'timestamp','title', 'genres'])
movielens_dataframe.drop(['timestamp'], axis=1, inplace=True)

In [7]:
movielens_dataframe['userId'] = movielens_dataframe['userId'].astype('int32')
movielens_dataframe['movieId'] = movielens_dataframe['movieId'].astype('int64')
movielens_dataframe['rating'] = movielens_dataframe['rating'].astype('float32')

In [8]:
movielens_dataframe.head(15)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
5,1,70,3.0,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
6,1,101,5.0,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
7,1,110,4.0,Braveheart (1995),Action|Drama|War
8,1,151,5.0,Rob Roy (1995),Action|Drama|Romance|War
9,1,157,5.0,Canadian Bacon (1995),Comedy|War


In [9]:
# matriz de ratings-users
movie_matrix = movielens_dataframe.pivot_table(index='userId', columns='title', values='rating')
movie_matrix.sample(20)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
290,,,,,,,,,,,...,,,,,,,,,,
567,,,,,,,,,,,...,,,,,,,,,,
75,,,,,,,,,,,...,,,,,,,,,,
610,4.0,,,,,,,,3.5,,...,,4.0,3.5,3.0,,,2.0,1.5,,
288,,,,,,,3.0,,,,...,,,,,,,,,,
111,,,,,,,,,,,...,,,,,,,,,,
546,,,,,,,,,,,...,,,,,,,,,,
402,,,,,,,,,,,...,,,,,,,,,,
215,,,,,,,,,,,...,,,,,,,,,2.5,
142,,,,,,,,,,,...,,,,,,,,,,


### Distribuição dos Dados

In [10]:
# number of ratings per user
counts = movielens_dataframe['userId'].value_counts()
counts_median = counts.median()
counts.sample(20)

375     33
220    207
338     39
410    167
484    275
299     23
367    185
359     74
422     93
269     29
390     81
186    226
420    141
212    248
47     140
553     83
445     77
554     89
43     114
58     112
Name: userId, dtype: int64

In [11]:
print("Mediana: "+ str(counts_median))

Mediana: 70.5


## Treinamento e Teste

**SVD** é uma técnica de fatoração de matriz que geralmente é usada para reduzir o número de recursos de um conjunto de dados, reduzindo as dimensões do espaço de **N** a **K** onde **K < N**. No entanto, para a finalidade dos sistemas de recomendação, estamos interessados apenas na fatoração de matriz parte mantendo a mesma dimensionalidade. A fatoração da matriz é feita na matriz de classificações de itens do usuário. De um alto nível, a fatoração da matriz pode ser considerada como encontrar duas matrizes cujo produto é a matriz original.

In [12]:
# surprise dataset
reader = Reader()
dataset = Dataset.load_from_df(movielens_dataframe[['userId', 'movieId', 'rating']], reader)

In [13]:
# train set
train_set = dataset.build_full_trainset()

In [14]:
# test set
test_set = train_set.build_anti_testset()

In [15]:
'''results_dict = {}
epochs = [5,10,20]
factors = [10,50,100]
reg = [0.2,0.4]

for ep in epochs:
    for fac in factors:
        for r in reg:
            svd = SVD(n_epochs=ep,reg_all=r,n_factors=fac, random_state=0).fit(train_set)
            pred = svd.test(test_set)
            results = (rmse(pred),mae(pred))
            results_dict[f'{ep}, {fac}, {r}'] = results
            
'''                

"results_dict = {}\nepochs = [5,10,20]\nfactors = [10,50,100]\nreg = [0.2,0.4]\n\nfor ep in epochs:\n    for fac in factors:\n        for r in reg:\n            svd = SVD(n_epochs=ep,reg_all=r,n_factors=fac, random_state=0).fit(train_set)\n            pred = svd.test(test_set)\n            results = (rmse(pred),mae(pred))\n            results_dict[f'{ep}, {fac}, {r}'] = results\n            \n"

In [16]:
results_dict

NameError: name 'results_dict' is not defined

In [None]:
# svd model
svd = SVD(n_epochs=5,reg_all=0.4,n_factors=10, random_state=0)
svd.fit(train_set)

In [None]:
def get_top_n(pred, n=10):
    #Return the top-N recommendation for each user from a set of predictions.
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in pred:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
# predictions
pred = svd.test(test_set)

In [None]:
top_n = get_top_n(pred, n=5)

In [None]:
# metrics
results = (rmse(pred),mae(pred))

In [None]:
image=[0,0,0,0,0]
img_source=[0,0,0,0,0]
#make new figure with 2 subfigures
#each subfigure can have an image in it
fig = plt.figure(figsize=(30,65))

user = 8;

for i in range(5):
    
    k = movies[movies['movieId']==top_n[user][i][0]].index
    image[i] = plt.subplot(1,5,i+1, title = movies['title'][k].tolist()[0])
    img_source[i] = mpimg.imread('image/' + str(top_n[user][i][0])+'.jpg')
    _ = image[i].imshow(img_source[i])
    
    image[i].axis("off")

plt.show()

In [None]:
for i in range(5):
    
    k = movies[movies['movieId']==top_n[user][i][0]].index
    print(movies['genres'][k].tolist()[0])
    

In [None]:
movielens_dataframe[(movielens_dataframe['userId']==5) & (movielens_dataframe['rating']>3.5)]

In [None]:
# mostrar que a base está balanceada

# em relação ao número de avaliações por filme
mv = movielens_dataframe.drop(['title','genres','userId'], axis=1)
# "id do filme" vs número de avaliações
mv = mv.groupby(['movieId']).count()
mv.tail()

In [None]:
mv_median = mv.median().tolist()[0]
print(f"Mediana: {mv_median}")

In [None]:
# em relação ao número de filmes avaliados para cada nota 
rt = movielens_dataframe.drop(['title','genres','userId'], axis=1)
# notas vs número de vezes que aparece
rt = rt.groupby(['rating']).count()
rt

Usuario 5:

generos dos filmes recomendados:

Drama|Western
Crime|Drama
Action|Adventure|Drama|War
Crime|Horror|Thriller
Drama


filmes com notas >= 4:

14/20 filmes de drama
7/20 Crime
5/20 thriller
3/20 crime/thriller

In [None]:
Comedy|War
Action|Crime|Drama|Thriller
Crime|Drama
Mystery|Thriller
Crime|Drama

13/20