In [1]:
#Importando as bibliotecas

In [2]:
import numpy as np
import pandas as pd

In [3]:
#importando o dataset de filmes

In [4]:
filmes = pd.read_csv('movies_metadata.csv', low_memory=False)
print(filmes.head(3))

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497                en   
2                                   NaN  15602  tt0113228                en   

     original_title                                           overview  ...  \
0         Toy Story  Led by Woody, Andy's toys live happily in his ...  ...   
1      

In [5]:
#Separando e renomeando as colunas

In [6]:
filmes = filmes[['id','original_title','original_language','vote_count']]
print(filmes.head(3))

      id    original_title original_language  vote_count
0    862         Toy Story                en      5415.0
1   8844           Jumanji                en      2413.0
2  15602  Grumpier Old Men                en        92.0


In [7]:
#Verificando valores nulos

In [8]:
print(filmes.isnull().sum())

id                    0
original_title        0
original_language    11
vote_count            6
dtype: int64


In [9]:
#Como possuem poucos valores nulos, optei por dropar as linhas correspondentes

In [10]:
filmes.dropna(inplace=True)
print(filmes.isnull().sum())

id                   0
original_title       0
original_language    0
vote_count           0
dtype: int64


In [11]:
#Importando o dataset de avaliações

In [12]:
avaliacao = pd.read_csv('ratings.csv')
print(avaliacao.head(3))
print(avaliacao.columns)

   userId  movieId  rating   timestamp
0       1      110     1.0  1425941529
1       1      147     4.5  1425942435
2       1      858     5.0  1425941523
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [13]:
#Separando as colunas que serão utilizadas

In [14]:
avaliacao = avaliacao[['userId','movieId','rating']]
print(avaliacao.head(3))

   userId  movieId  rating
0       1      110     1.0
1       1      147     4.5
2       1      858     5.0


In [15]:
#Verificando valores nulos

In [16]:
print(avaliacao.isnull().sum())

userId     0
movieId    0
rating     0
dtype: int64


In [17]:
#Verificando a média de avaliações por usuário

In [18]:
print(avaliacao['userId'].value_counts().mean())

96.06745393065974


In [19]:
#Separando Usuários com pelo menos 100 avaliações

In [20]:
qt_avaliacoes = avaliacao['userId'].value_counts() > 99
y = qt_avaliacoes[qt_avaliacoes].index

In [21]:
#Visualizando a quantidade de indices

In [22]:
print(avaliacao.shape)
print(y.shape)

(26024289, 3)
(64433,)


In [23]:
#Seperando os usuários que pertencem a y

In [24]:
avaliacao = avaliacao[avaliacao['userId'].isin(y)]

In [25]:
#Verificando o tamanho do dataset

In [26]:
print(avaliacao.shape)

(20112251, 3)


In [27]:
#Separando filmes com pelo menos 1000 avaliações

In [28]:
filmes = filmes[filmes['vote_count'] > 999]
print(filmes.head(3))
print(filmes.shape)

     id original_title original_language  vote_count
0   862      Toy Story                en      5415.0
1  8844        Jumanji                en      2413.0
5   949           Heat                en      1886.0
(1121, 4)


In [29]:
#Verificando a quantidade de filmes por linguagem

In [30]:
filmes_linguagem = filmes['original_language'].value_counts()
print(filmes_linguagem.head(20))

original_language
en    1100
fr       5
ja       5
it       3
ko       2
pt       1
de       1
es       1
cn       1
sv       1
id       1
Name: count, dtype: int64


In [31]:
#Selecionando apenas filmes em inglês

In [32]:
filmes = filmes[filmes['original_language'] == 'en']
print(filmes.head(20))

        id                  original_title original_language  vote_count
0      862                       Toy Story                en      5415.0
1     8844                         Jumanji                en      2413.0
5      949                            Heat                en      1886.0
9      710                       GoldenEye                en      1194.0
15     524                          Casino                en      1343.0
18    9273  Ace Ventura: When Nature Calls                en      1128.0
31      63                  Twelve Monkeys                en      2470.0
46     807                           Se7en                en      5915.0
47   10530                      Pocahontas                en      1509.0
49     629              The Usual Suspects                en      3334.0
69     755             From Dusk Till Dawn                en      1644.0
108    197                      Braveheart                en      3404.0
109    103                     Taxi Driver         

In [33]:
#Subistituindo o nome da coluna id para possuir o mesmo nome que em avaliações

In [34]:
filmes = filmes.rename(columns={'id':'movieId'})

In [35]:
#Verificando informações acerca dos datasets

In [36]:
print(filmes.info())
print(avaliacao.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movieId            1100 non-null   object 
 1   original_title     1100 non-null   object 
 2   original_language  1100 non-null   object 
 3   vote_count         1100 non-null   float64
dtypes: float64(1), object(3)
memory usage: 43.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 20112251 entries, 204 to 26024288
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 613.8 MB
None


In [37]:
#Modificando o tipo do id para int

In [38]:
filmes['movieId'] = filmes['movieId'].astype(int)

In [39]:
#Fundindo os datasets

In [40]:
avaliacoes_e_filmes = avaliacao.merge(filmes, on='movieId')

In [41]:
#Verificando informações do novo dataframe

In [42]:
print(avaliacoes_e_filmes.head(20))
print(avaliacoes_e_filmes.shape)
print(avaliacoes_e_filmes.isnull().sum())
print(avaliacoes_e_filmes.columns)

    userId  movieId  rating                      original_title  \
0        8      170     3.0                       28 Days Later   
1        8      585     2.0                      Monsters, Inc.   
2        8     1265     3.0                Bridge to Terabithia   
3        8     2300     2.0                           Space Jam   
4        8     4638     3.0                            Hot Fuzz   
5       11      165     3.5          Back to the Future Part II   
6       11      296     4.0  Terminator 3: Rise of the Machines   
7       11      364     3.0                      Batman Returns   
8       11      500     4.0                      Reservoir Dogs   
9       11     1265     2.5                Bridge to Terabithia   
10      11     1573     3.5                          Die Hard 2   
11      11    49530     3.5                             In Time   
12      11    51540     4.0                     Horrible Bosses   
13      12       73     2.0                  American History 

In [43]:
#Retirando campos com duplicidade

In [44]:
avaliacoes_e_filmes.drop_duplicates(['userId', 'movieId'],inplace=True)
avaliacoes_e_filmes.drop_duplicates(['userId', 'original_title'],inplace=True)
print(avaliacoes_e_filmes.shape)

(1418991, 6)


In [45]:
#Excluindo a coluna movieId

In [46]:
del avaliacoes_e_filmes['movieId']

In [47]:
#Criando um PIVOT para transformar usuarios em coluna

In [48]:
filmes_pivot = avaliacoes_e_filmes.pivot(index='original_title', columns='userId', values='rating')
print(filmes_pivot.head(20))

userId                        8       11      12      15      16      20      \
original_title                                                                 
10 Things I Hate About You       NaN     NaN     NaN     NaN     NaN     NaN   
12 Angry Men                     NaN     NaN     NaN     NaN     NaN     NaN   
127 Hours                        NaN     NaN     NaN     NaN     NaN     NaN   
1408                             NaN     NaN     NaN     NaN     NaN     NaN   
2 Fast 2 Furious                 NaN     NaN     NaN     NaN     NaN     NaN   
2001: A Space Odyssey            NaN     NaN     NaN     NaN     NaN     NaN   
27 Dresses                       NaN     NaN     NaN     NaN     NaN     NaN   
28 Days Later                    3.0     NaN     NaN     NaN     NaN     NaN   
28 Weeks Later                   NaN     NaN     NaN     NaN     NaN     NaN   
300                              NaN     NaN     NaN     NaN     NaN     NaN   
3:10 to Yuma                     NaN    

In [49]:
#Preenchendo os valores nulos com zero

In [50]:
filmes_pivot.fillna(0, inplace=True)
print(filmes_pivot.head(20))

userId                        8       11      12      15      16      20      \
original_title                                                                 
10 Things I Hate About You       0.0     0.0     0.0     0.0     0.0     0.0   
12 Angry Men                     0.0     0.0     0.0     0.0     0.0     0.0   
127 Hours                        0.0     0.0     0.0     0.0     0.0     0.0   
1408                             0.0     0.0     0.0     0.0     0.0     0.0   
2 Fast 2 Furious                 0.0     0.0     0.0     0.0     0.0     0.0   
2001: A Space Odyssey            0.0     0.0     0.0     0.0     0.0     0.0   
27 Dresses                       0.0     0.0     0.0     0.0     0.0     0.0   
28 Days Later                    3.0     0.0     0.0     0.0     0.0     0.0   
28 Weeks Later                   0.0     0.0     0.0     0.0     0.0     0.0   
300                              0.0     0.0     0.0     0.0     0.0     0.0   
3:10 to Yuma                     0.0    

In [51]:
#Importando csr_matrix do SciPy

In [52]:
from scipy.sparse import csr_matrix

In [53]:
#Transformando o dataset em uma matriz sparsa

In [54]:
filmes_sparse = csr_matrix(filmes_pivot)

In [55]:
#Importando KNN do SciKit

In [56]:
from sklearn.neighbors import NearestNeighbors

In [57]:
#Treinando o Modelo

In [58]:
model = NearestNeighbors(algorithm = 'brute')
model.fit(filmes_sparse)

In [59]:
#Testando o modelo utilizando filmes

In [60]:
distances, sugestions = model.kneighbors(filmes_pivot.filter(items = ['300'], axis = 0).values.reshape(1, -1))
for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]],'\n')


Index(['300', 'Liar Liar', 'Ice Age', 'Snow White and the Seven Dwarfs',
       'Ratatouille'],
      dtype='object', name='original_title') 



In [61]:
#Antz
distances, sugestions = model.kneighbors(filmes_pivot.filter(items = ['Antz'], axis = 0).values.reshape(1, -1))
for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]],'\n')

Index(['Antz', 'Dumb and Dumber To', 'Inception', 'Bambi',
       'The Angry Birds Movie'],
      dtype='object', name='original_title') 



In [62]:
#Alien
distances, sugestions = model.kneighbors(filmes_pivot.filter(items = ['Alien'], axis = 0).values.reshape(1, -1))
for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]],'\n')

Index(['Alien', 'Being John Malkovich', 'The Shawshank Redemption',
       'Jurassic Park III', 'Pirates of the Caribbean: On Stranger Tides'],
      dtype='object', name='original_title') 



In [63]:
#Casper
distances, sugestions = model.kneighbors(filmes_pivot.filter(items = ['Casper'], axis = 0).values.reshape(1, -1))
for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]],'\n')

Index(['Casper', 'Snitch', 'RED 2', 'Teenage Mutant Ninja Turtles',
       'The Expendables 2'],
      dtype='object', name='original_title') 



In [64]:
#Toy Story
distances, sugestions = model.kneighbors(filmes_pivot.filter(items = ['Toy Story'], axis = 0).values.reshape(1, -1))
for i in range(len(sugestions)):
    print(filmes_pivot.index[sugestions[i]],'\n')

Index(['Toy Story', 'Bambi', 'RED 2', 'Teenage Mutant Ninja Turtles',
       'The Hunger Games: Mockingjay - Part 2'],
      dtype='object', name='original_title') 

