# Sistema de Recomendação

In [21]:
# importação das bibliotecas necessárias:

import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
# Load the ratings and movies tables from the local data directory.

ratings = pd.read_csv("data/ratings.csv")
movies = pd.read_csv("data/movies.csv")

# Join each rating with its movie metadata on the shared movieId key (stated
# explicitly rather than inferred from the overlapping column names), then
# drop the genres and timestamp columns, which the recommender does not use.

new_bd = pd.merge(ratings, movies, on="movieId").drop(columns=["genres", "timestamp"])
new_bd.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [15]:
# Pivot the long ratings table into a wide matrix: one row per userId, one
# column per movie title, each cell holding the rating. Cells stay NaN where
# a user has no rating for that title.
user_movie = new_bd.pivot_table(
    values='rating',
    index=['userId'],
    columns=['title'],
)
user_movie.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Movies with very few ratings add noise to the similarity estimates, so only
# the titles (columns) with at least MIN_RATINGS_PER_MOVIE ratings are kept.
# Note that this filters movies, not users: every user row is preserved.
# The remaining missing ratings (NaN) are then filled with zero.

MIN_RATINGS_PER_MOVIE = 20

user_movie = user_movie.dropna(thresh=MIN_RATINGS_PER_MOVIE, axis=1).fillna(0)
user_movie.head()



title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Ratings are mean-normalized before computing similarities: each value is
# centred on the mean and divided by the value range, so results lie in [-1, 1].

def standardize(avaliacao):
    """Mean-normalize ratings: ``(x - mean) / (max - min)``.

    Parameters
    ----------
    avaliacao : pandas.Series or pandas.DataFrame
        Ratings to normalize. A DataFrame is normalized column by column.

    Returns
    -------
    Same type as the input
        Centred values scaled by the range. When the range is zero (all
        values identical) the result is 0.0 instead of the NaN that a 0/0
        division would produce.
    """
    centrada = avaliacao - avaliacao.mean()
    amplitude = avaliacao.max() - avaliacao.min()

    # These comparisons are scalars for a Series input and one flag per
    # column for a DataFrame input, so the same arithmetic serves both.
    sem_variacao = amplitude == 0
    tem_variacao = amplitude != 0

    # Adding the boolean turns a zero range into 1, which avoids 0/0; the
    # final multiplication then forces exact zeros for those constant inputs
    # and leaves every other value untouched (x * True == x).
    divisor = amplitude + sem_variacao
    return (centrada / divisor) * tem_variacao

# Normalize every column (one column per movie) with standardize();
# head() displays only the first 5 rows.
user_movie_std = user_movie.apply(standardize, axis=0)
user_movie_std.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.050492,-0.062459,-0.047377,-0.049508,-0.077541,-0.021967,0.861384,-0.025902,-0.13918,-0.022541,...,0.909672,-0.02541,-0.024262,-0.04623,-0.067377,-0.062131,-0.04082,-0.027869,-0.027254,0.773279
2,-0.050492,-0.062459,-0.047377,-0.049508,-0.077541,-0.021967,-0.027505,-0.025902,-0.13918,-0.022541,...,-0.090328,-0.02541,-0.024262,-0.04623,0.532623,-0.062131,-0.04082,-0.027869,-0.027254,-0.026721
3,-0.050492,-0.062459,-0.047377,-0.049508,-0.077541,-0.021967,-0.027505,-0.025902,-0.13918,0.102459,...,-0.090328,-0.02541,-0.024262,-0.04623,-0.067377,-0.062131,-0.04082,-0.027869,-0.027254,-0.026721
4,-0.050492,-0.062459,-0.047377,-0.049508,0.922459,-0.021967,-0.027505,-0.025902,-0.13918,-0.022541,...,-0.090328,-0.02541,-0.024262,-0.04623,-0.067377,-0.062131,-0.04082,-0.027869,-0.027254,-0.026721
5,-0.050492,-0.062459,-0.047377,-0.049508,-0.077541,-0.021967,-0.027505,-0.025902,-0.13918,-0.022541,...,-0.090328,-0.02541,-0.024262,-0.04623,-0.067377,-0.062131,-0.04082,-0.027869,-0.027254,-0.026721


### Utilizando a similaridade do Cosseno como solicitado:

In [23]:
# cosine_similarity compares the rows of its input. The normalized matrix has
# users as rows and movies as columns, so it is transposed first: the result
# is a movie-by-movie (item-item) similarity matrix, which is what item-based
# collaborative filtering needs.

matriz_simi_cos = cosine_similarity(user_movie_std.transpose())
print(matriz_simi_cos)

[[1.         0.27398929 0.14890261 ... 0.0536135  0.24109183 0.12590528]
 [0.27398929 1.         0.22348095 ... 0.12102886 0.13081258 0.11061178]
 [0.14890261 0.22348095 1.         ... 0.04780377 0.15693172 0.07873442]
 ...
 [0.0536135  0.12102886 0.04780377 ... 1.         0.16302184 0.13861129]
 [0.24109183 0.13081258 0.15693172 ... 0.16302184 1.         0.06567341]
 [0.12590528 0.11061178 0.07873442 ... 0.13861129 0.06567341 1.        ]]


In [24]:
# Wrap the raw similarity array in a DataFrame labelled by movie title on both
# axes, so that similarities can be looked up by name.

matriz_simi_cos = pd.DataFrame(
    data=matriz_simi_cos,
    index=user_movie_std.columns,
    columns=user_movie_std.columns,
)
matriz_simi_cos.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),1.0,0.273989,0.148903,0.142141,0.159756,0.297152,0.072835,0.226574,0.113616,0.274272,...,0.066077,0.073476,0.374515,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Things I Hate About You (1999),0.273989,1.0,0.223481,0.211473,0.011784,0.321071,0.215828,0.06947,0.085974,0.064187,...,0.144038,0.152333,0.243118,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
101 Dalmatians (1996),0.148903,0.223481,1.0,0.285112,0.119843,0.188467,0.004213,0.159777,0.110844,0.090231,...,0.177214,0.033582,0.114968,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.142141,0.211473,0.285112,1.0,0.134037,0.218406,0.135894,0.227193,0.10223,0.112334,...,0.180318,0.143006,0.120302,0.08365,0.171654,0.27426,0.077594,0.085606,0.24882,0.171118
12 Angry Men (1957),0.159756,0.011784,0.119843,0.134037,1.0,-0.027672,0.08476,0.189497,0.195909,0.236037,...,0.135876,0.139655,0.104518,0.241435,0.144652,0.122107,0.056742,-0.001708,0.074306,0.102744


In [26]:
# Scoring helper for the cosine-based recommender: given a movie title and the
# rating a user gave it, rank every movie so recommendations can be read off
# the top of the result.

def get_simi_movies_by_cos(movie_name, user_rating, similarity=None):
    """Rank movies by cosine similarity to ``movie_name``, scaled by a rating.

    Parameters
    ----------
    movie_name : label
        Column of the similarity matrix to read (a movie title).
    user_rating : number
        Rating given to ``movie_name``; every similarity is multiplied by it.
    similarity : pandas.DataFrame, optional
        Title-by-title similarity matrix. When omitted, the notebook-level
        ``matriz_simi_cos`` is used (looked up at call time), which keeps
        existing two-argument calls working.

    Returns
    -------
    pandas.Series
        Scores indexed by title, sorted from highest to lowest. The queried
        movie itself is not removed from the result.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity is None:
        similarity = matriz_simi_cos

    try:
        coluna = similarity[movie_name]
    except KeyError:
        # Re-raise with a message that names the offending title.
        raise KeyError(
            f"movie {movie_name!r} is not present in the similarity matrix"
        ) from None

    similar_score = coluna * user_rating
    return similar_score.sort_values(ascending=False)

# Example query: rank movies for a user who rated this title 5.
movie_name, user_rating = '12 Angry Men (1957)', 5
print(get_simi_movies_by_cos(movie_name, user_rating))

title
12 Angry Men (1957)                              5.000000
It's a Wonderful Life (1946)                     1.608173
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)    1.598578
North by Northwest (1959)                        1.595207
One Flew Over the Cuckoo's Nest (1975)           1.579666
                                                   ...   
River Wild, The (1994)                          -0.335837
Cliffhanger (1993)                              -0.365311
Specialist, The (1994)                          -0.381320
Disclosure (1994)                               -0.385016
Outbreak (1995)                                 -0.425251
Name: 12 Angry Men (1957), Length: 1297, dtype: float64


### Utilizando a correlação de Pearson como solicitado:

In [27]:
# Pairwise Pearson correlation between movie columns of the zero-filled rating
# matrix. Pearson correlation is the cosine similarity of mean-centred
# columns, so this table is expected to match the cosine matrix computed from
# the mean-normalized ratings.
tabela_simi_pear = user_movie.corr(method='pearson')
tabela_simi_pear.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),1.0,0.273989,0.148903,0.142141,0.159756,0.297152,0.072835,0.226574,0.113616,0.274272,...,0.066077,0.073476,0.374515,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Things I Hate About You (1999),0.273989,1.0,0.223481,0.211473,0.011784,0.321071,0.215828,0.06947,0.085974,0.064187,...,0.144038,0.152333,0.243118,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
101 Dalmatians (1996),0.148903,0.223481,1.0,0.285112,0.119843,0.188467,0.004213,0.159777,0.110844,0.090231,...,0.177214,0.033582,0.114968,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.142141,0.211473,0.285112,1.0,0.134037,0.218406,0.135894,0.227193,0.10223,0.112334,...,0.180318,0.143006,0.120302,0.08365,0.171654,0.27426,0.077594,0.085606,0.24882,0.171118
12 Angry Men (1957),0.159756,0.011784,0.119843,0.134037,1.0,-0.027672,0.08476,0.189497,0.195909,0.236037,...,0.135876,0.139655,0.104518,0.241435,0.144652,0.122107,0.056742,-0.001708,0.074306,0.102744


In [32]:
# Scoring helper for the Pearson-based recommender, analogous to the cosine
# one: given a movie title and the rating a user gave it, rank every movie so
# recommendations can be read off the top of the result.

def get_simi_movies_by_pearson(movie_name, user_rating, similarity=None):
    """Rank movies by Pearson correlation with ``movie_name``, scaled by a rating.

    Parameters
    ----------
    movie_name : label
        Column of the correlation table to read (a movie title).
    user_rating : number
        Rating given to ``movie_name``; every correlation is multiplied by it.
    similarity : pandas.DataFrame, optional
        Title-by-title correlation table. When omitted, the notebook-level
        ``tabela_simi_pear`` is used (looked up at call time), which keeps
        existing two-argument calls working.

    Returns
    -------
    pandas.Series
        Scores indexed by title, sorted from highest to lowest. The queried
        movie itself is not removed from the result.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the correlation table.
    """
    if similarity is None:
        similarity = tabela_simi_pear

    try:
        coluna = similarity[movie_name]
    except KeyError:
        # Re-raise with a message that names the offending title.
        raise KeyError(
            f"movie {movie_name!r} is not present in the similarity matrix"
        ) from None

    similar_score = coluna * user_rating
    return similar_score.sort_values(ascending=False)

# Example query: rank movies for a user who rated this title 5.
movie_name, user_rating = 'xXx (2002)', 5
print(get_simi_movies_by_pearson(movie_name, user_rating))

title
xXx (2002)                                                    5.000000
Die Another Day (2002)                                        3.315050
Charlie's Angels: Full Throttle (2003)                        2.784715
Evolution (2001)                                              2.768341
Resident Evil (2002)                                          2.659780
                                                                ...   
Arsenic and Old Lace (1944)                                  -0.207612
Philadelphia Story, The (1940)                               -0.209988
Madness of King George, The (1994)                           -0.211767
Like Water for Chocolate (Como agua para chocolate) (1992)   -0.220390
Postman, The (Postino, Il) (1994)                            -0.236503
Name: xXx (2002), Length: 1297, dtype: float64
