## Sistema de Recomendação

### Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np

## Importando os dados

In [2]:
movies = pd.read_csv('data/movies_metadata.csv', low_memory=False)
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
ratings = pd.read_csv('data/ratings.csv', low_memory=False)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


## Pré processamento dos dados

#### Filtrando somente colunas necessárias

In [4]:
movies = movies[['id', 'original_title', 'original_language', 'vote_count']]
movies.head()

Unnamed: 0,id,original_title,original_language,vote_count
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [5]:
ratings = ratings[['userId', 'movieId', 'rating']]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


#### Verificando valores nulos

In [6]:
movies.isna().sum()

id                    0
original_title        0
original_language    11
vote_count            6
dtype: int64

In [7]:
movies.dropna(inplace = True)

In [8]:
ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

#### Verificando as avaliações por usuários

In [9]:
ratings['rating'].value_counts()

rating
4.0    6998802
3.0    5256722
5.0    3812499
3.5    3116213
4.5    2170441
2.0    1762440
2.5    1255358
1.0     843310
0.5     404897
1.5     403607
Name: count, dtype: int64

In [10]:
# Considerando apenas usuários que fazem avaliações frequentemente, deixando de fora usuários que fazem pouca avaliações
qtd_ratings = ratings['userId'].value_counts() > 999
y = qtd_ratings[qtd_ratings].index
y.shape

(2509,)

In [11]:
# Vendo os usuários selecionados
y

Index([ 45811,   8659, 270123, 179792, 228291, 243443,  98415, 229879,  98787,
       172224,
       ...
       269212, 257117,  76945,  30733, 196384,  53075, 220764, 214328,  14354,
       182812],
      dtype='int64', name='userId', length=2509)

In [12]:
# Vendo o tamanho do dataset 'ratings'
ratings.shape

(26024289, 3)

In [13]:
ratings = ratings[ratings['userId'].isin(y)]
ratings.shape

(3844582, 3)

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating
17291,229,1,3.0
17292,229,2,3.0
17293,229,4,2.0
17294,229,5,1.0
17295,229,7,2.0


In [15]:
movies.head()

Unnamed: 0,id,original_title,original_language,vote_count
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [16]:
movies = movies[movies['vote_count'] > 999]

In [17]:
movies_language = movies['original_language'].value_counts()
movies_language.head()

original_language
en    1100
fr       5
ja       5
it       3
ko       2
Name: count, dtype: int64

In [18]:
# Selecionando filmes somente com a linguagem 'en'
movies = movies[movies['original_language'] == 'en']

In [19]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1100 non-null   object 
 1   original_title     1100 non-null   object 
 2   original_language  1100 non-null   object 
 3   vote_count         1100 non-null   float64
dtypes: float64(1), object(3)
memory usage: 43.0+ KB


In [20]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3844582 entries, 17291 to 26023521
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 117.3 MB


#### Juntando as tabelas

In [21]:
movies['id'] = movies['id'].astype(int)

In [22]:
movies.shape

(1100, 4)

In [23]:
movies_ratings = ratings.merge(movies, left_on='movieId', right_on='id')
movies_ratings.head()

Unnamed: 0,userId,movieId,rating,id,original_title,original_language,vote_count
0,229,12,1.0,12,Finding Nemo,en,6292.0
1,229,70,3.0,70,Million Dollar Baby,en,2519.0
2,229,77,3.0,77,Memento,en,4168.0
3,229,85,3.0,85,Raiders of the Lost Ark,en,3949.0
4,229,106,4.0,106,Predator,en,2129.0


In [24]:
movies_ratings.shape

(189882, 7)

In [25]:
movies_ratings.isna().sum()

userId               0
movieId              0
rating               0
id                   0
original_title       0
original_language    0
vote_count           0
dtype: int64

#### Tirando valores duplicados

In [26]:
# Removendo valores duplicados para que não tenha o problema de um mesmo user avaliar o mesmo filme mais de uma vez
movies_ratings.drop_duplicates(['userId', 'movieId'], inplace = True)

In [27]:
movies_ratings.shape

(189882, 7)

#### Melhorando a tabela 'movies_ratings'

In [28]:
del movies_ratings['movieId']

In [29]:
# Queremos que cada UserId seja uma variável com o respectivo valor de nota para cada filme avaliado
movies_pivot = movies_ratings.pivot_table(columns='userId', index='original_title', values='vote_count')
movies_pivot.head()

userId,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,,,,,,,,,,,...,,1768.0,,1768.0,1768.0,,,,,
12 Angry Men,,,,,,,,,,,...,,,,,,,,,2130.0,
127 Hours,,,,,,,,,,,...,,,,,,,,,,
1408,,,,,,,,,,,...,,,,,1372.0,1372.0,,,,
2 Fast 2 Furious,,,,,,,,,,,...,,,,,,,,,,


In [30]:
movies_pivot.fillna(0, inplace=True)
movies_pivot.head()

userId,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1768.0,0.0,1768.0,1768.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2130.0,0.0
127 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1372.0,1372.0,0.0,0.0,0.0,0.0
2 Fast 2 Furious,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Começando o modelo preditivo

In [31]:
# criar uma mateiz esparsa
from scipy.sparse import csr_matrix

In [32]:
movies_sparse = csr_matrix(movies_pivot)

In [33]:
from sklearn.neighbors import NearestNeighbors

In [34]:
model = NearestNeighbors(algorithm = 'brute')
model.fit(movies_sparse)

#### Previsões de sugestões de filmes

##### Toy Story

In [44]:
distances, sugestions = model.kneighbors(movies_pivot.filter(items = ['Toy Story'], axis=0).values.reshape(1,-1))

for i in range(len(sugestions)):
    print(movies_pivot.index[sugestions[i]])

Index(['Toy Story', 'Dances with Wolves', 'Pretty Woman',
       'Live Free or Die Hard', 'Braveheart'],
      dtype='object', name='original_title')


##### 1408

In [45]:
distances, sugestions = model.kneighbors(movies_pivot.filter(items = ['1408'], axis=0).values.reshape(1,-1))

for i in range(len(sugestions)):
    print(movies_pivot.index[sugestions[i]])

Index(['1408', 'Casper', 'Once Upon a Time in America', 'Lord of War',
       'Snitch'],
      dtype='object', name='original_title')
