# Criando um sistema de recomendação para filmes

In [1]:
#Bibliotecas
import pandas as pd
import numpy as np

In [15]:
#Utilizaremos 2 datasets neste projeto!
filmes = pd.read_csv('movies_metadata.csv', low_memory= False)#'LOW_MEMORY= FALSE' Não exige que classifique TODAS as variaveis, optimizando o tempo
filmes.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [16]:
avaliacoes = pd.read_csv('ratings.csv')
avaliacoes.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523


# Com os datasets já importados, iniciaremos o processamento dos dados

In [18]:
#Vamos filtrar deixando apenas as colunas que serão utilizadas 
filmes = filmes[['id','original_title','original_language','vote_count']]

#Renomear as colunas
filmes.rename(columns={'id':'ID_FILME','original_title':'TITULO','original_language':'IDIOMA','vote_count':'QTD_AVALIACOES'},inplace=True)

filmes.head()

Unnamed: 0,ID_FILME,TITULO,IDIOMA,QTD_AVALIACOES
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [21]:
#Também filtraremos no dataframe de avaliações
avaliacoes = avaliacoes[['userId','movieId','rating']]

#Renomear as colunas
avaliacoes.rename(columns={'userId':'ID_USUARIO','movieId':'ID_FILME','rating':'AVALIACAO'},inplace=True)

avaliacoes.head()

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [22]:
#Verificar se há valores nulos
filmes.isna().sum()

ID_FILME           0
TITULO             0
IDIOMA            11
QTD_AVALIACOES     6
dtype: int64

In [24]:
#Excluindo valores nulos
filmes.dropna(inplace=True)

In [25]:
#verificando se há valores nulos no dataframe de avaliações
avaliacoes.isna().sum()
#como não há valores nulos, não é necessário excluir

ID_USUARIO    0
ID_FILME      0
AVALIACAO     0
dtype: int64

In [26]:
#Verificando a quantidade de avaliações por usuários
avaliacoes['ID_USUARIO'].value_counts()

45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
30155         1
9641          1
164717        1
243426        1
234625        1
Name: ID_USUARIO, Length: 270896, dtype: int64

In [29]:
#Para este estudo, utilizaremos apenas os usuários que fizeram pelo menos 1000 avaliações para melhor precisão
qt_avaliacoes = avaliacoes['ID_USUARIO'].value_counts() > 999
y = qt_avaliacoes[qt_avaliacoes].index
y.shape

(2509,)

In [30]:
#Criada a variavel de apenas usuários com pelo menos 1000 avaliações, vamos implementá-la ao dataframe
avaliacoes = avaliacoes[avaliacoes['ID_USUARIO'].isin(y)]

In [32]:
#RESTARAM 3.844.582 USUÁRIOS
avaliacoes.shape

(3844582, 3)

In [33]:
#DA MESMA FORMAS PODEMOS FAZER PARA OS FILMES
filmes = filmes[filmes['QTD_AVALIACOES'] > 999]

In [34]:
#Nos restaram 1121 filmes
filmes.shape

(1121, 4)

In [37]:
#Vamos tratar a variavel de IDIOMA agora.
filmes_linguagem = filmes['IDIOMA'].value_counts()

In [42]:
filmes_linguagem

en    1100
fr       5
ja       5
it       3
ko       2
pt       1
de       1
es       1
cn       1
sv       1
id       1
Name: IDIOMA, dtype: int64

In [43]:
#Como há praticamente só filmes em inglês, vamos eliminar os outros idiomas
filmes = filmes[filmes['IDIOMA']=='en']

In [44]:
#VISUALIZANDO OS TIPOS DE DADOS DOS DATAFRAMES
filmes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID_FILME        1100 non-null   object 
 1   TITULO          1100 non-null   object 
 2   IDIOMA          1100 non-null   object 
 3   QTD_AVALIACOES  1100 non-null   float64
dtypes: float64(1), object(3)
memory usage: 43.0+ KB


In [46]:
avaliacoes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3844582 entries, 17291 to 26023521
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   ID_USUARIO  int64  
 1   ID_FILME    int64  
 2   AVALIACAO   float64
dtypes: float64(1), int64(2)
memory usage: 117.3 MB


In [47]:
#Podemos observar que há uma divergência entre os tipos de dados da variável 'ID_FILME' nos dataframes, vamos corrigir.
filmes['ID_FILME'] = filmes['ID_FILME'].astype(int)


In [48]:
#Podemos ver que a quantidade não foi alterada 
filmes.shape

(1100, 4)

In [51]:
#Podemos confirmar a alteração com sucesso.
filmes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID_FILME        1100 non-null   int32  
 1   TITULO          1100 non-null   object 
 2   IDIOMA          1100 non-null   object 
 3   QTD_AVALIACOES  1100 non-null   float64
dtypes: float64(1), int32(1), object(2)
memory usage: 38.7+ KB


In [52]:
#Agora vamos concatenar os dataframes
avaliacoes_filmes = avaliacoes.merge(filmes, on='ID_FILME')
avaliacoes_filmes

Unnamed: 0,ID_USUARIO,ID_FILME,AVALIACAO,TITULO,IDIOMA,QTD_AVALIACOES
0,229,12,1.0,Finding Nemo,en,6292.0
1,741,12,3.0,Finding Nemo,en,6292.0
2,1932,12,0.5,Finding Nemo,en,6292.0
3,3437,12,2.0,Finding Nemo,en,6292.0
4,3694,12,1.5,Finding Nemo,en,6292.0
...,...,...,...,...,...,...
189877,243443,8909,1.0,Wanted,en,2583.0
189878,218368,101299,4.0,The Hunger Games: Catching Fire,en,6656.0
189879,219677,146216,1.0,RED 2,en,1548.0
189880,221327,118340,4.0,Guardians of the Galaxy,en,10014.0


In [53]:
avaliacoes_filmes.shape

(189882, 6)

In [54]:
avaliacoes_filmes.isna().sum()

ID_USUARIO        0
ID_FILME          0
AVALIACAO         0
TITULO            0
IDIOMA            0
QTD_AVALIACOES    0
dtype: int64

In [55]:
#Vamos eliminar possiveis valores duplicados, para caso ocorra de um único usuário tenha avaliado o mesmo filme várias vezes
avaliacoes_filmes.drop_duplicates(['ID_USUARIO','ID_FILME'],inplace=True)

In [56]:
#Não havia nenhuma
avaliacoes_filmes.shape

(189882, 6)

In [57]:
#Vamos remover a coluna de ID FILMES pois não será utilizadaa
del avaliacoes_filmes['ID_FILME']

In [58]:
#Coluna removida com sucesso!
avaliacoes_filmes.head()

Unnamed: 0,ID_USUARIO,AVALIACAO,TITULO,IDIOMA,QTD_AVALIACOES
0,229,1.0,Finding Nemo,en,6292.0
1,741,3.0,Finding Nemo,en,6292.0
2,1932,0.5,Finding Nemo,en,6292.0
3,3437,2.0,Finding Nemo,en,6292.0
4,3694,1.5,Finding Nemo,en,6292.0


In [60]:
#Agora precisamos transformar as linhas de usuários em colunas para prosseguir com o sistema
#para isso, utilizaremos o 'pivot'
filmes_pivot = avaliacoes_filmes.pivot_table(columns='ID_USUARIO',index='TITULO',values='AVALIACAO')

filmes_pivot.head()

ID_USUARIO,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,,,,,,,,,,,...,,2.5,,3.0,3.0,,,,,
12 Angry Men,,,,,,,,,,,...,,,,,,,,,3.5,
127 Hours,,,,,,,,,,,...,,,,,,,,,,
1408,,,,,,,,,,,...,,,,,2.5,2.0,,,,
2 Fast 2 Furious,,,,,,,,,,,...,,,,,,,,,,


In [63]:
#Os valores que são nulos, serão preenchidos com 0
filmes_pivot.fillna(0,inplace=True)
filmes_pivot.head()

ID_USUARIO,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0
127 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.0,0.0,0.0,0.0,0.0
2 Fast 2 Furious,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# CRIAÇÃO DE MATIZ

In [64]:
#Compactação da matriz para optimização de tempo, excelente método quando se tem uma matriz com muitos zeros!!
from scipy.sparse import csr_matrix

In [65]:
#Transformando em uma matriz sparse
filmes_sparse = csr_matrix(filmes_pivot)

# UTILIZAÇÃO DO KNN

In [67]:
#IMPORTANDO O ALGORITMO KNN
from sklearn.neighbors import NearestNeighbors

In [69]:
#Criando e treinando o modelo preditivo
modelo = NearestNeighbors(algorithm = 'brute')
modelo.fit(filmes_sparse)

# PREVISÕES

In [71]:
#distaces = armazena distancia entre os objetos vizinhos
#sugestions = Esta variável provavelmente contém os índices ou identificadores dos filmes que são os k vizinhos mais próximos do filme
#modelo.kneighbors = os vizinhos mais próximos, ou seja, as recomendações 
#item = o filme
#values = o valor retornado
#reshape(1,-1) = o formato da matriz retornada em uma unica linha

#FILME 127 Hours
distances,sugestions = modelo.kneighbors(filmes_pivot.filter(items=['127 Hours'],axis=0).values.reshape(1,-1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]])

Index(['127 Hours', 'American Hustle', 'The Expendables 2', 'Lord of War',
       'RED 2'],
      dtype='object', name='TITULO')


In [72]:
#Toy Story
distances,sugestions = modelo.kneighbors(filmes_pivot.filter(items=['Toy Story'],axis=0).values.reshape(1,-1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]])

Index(['Toy Story', 'Meet the Fockers', 'Top Gun',
       'Harry Potter and the Chamber of Secrets',
       'Austin Powers: International Man of Mystery'],
      dtype='object', name='TITULO')


In [73]:
#2 Fast 2 Furious
distances,sugestions = modelo.kneighbors(filmes_pivot.filter(items=['2 Fast 2 Furious'],axis=0).values.reshape(1,-1))

for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]])

Index(['2 Fast 2 Furious', 'Bambi', 'The Matrix Reloaded',
       'Brokeback Mountain', 'Lord of War'],
      dtype='object', name='TITULO')
