In [1]:
# Import pandas and load the movie catalogue.
import pandas as pd

# Read the raw file, relabel its three columns (by position) with Portuguese
# names, then index the frame by movie id so rows can be looked up with .loc.
filmes_brutos = pd.read_csv("movies.csv")
filmes_brutos.columns = ["filmeId", "titulos", "generos"]
filmes = filmes_brutos.set_index("filmeId")
filmes.head()

Unnamed: 0_level_0,titulos,generos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [2]:
# Load the movie ratings data (one row per rating).
notas = pd.read_csv("ratings.csv")

notas.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Rename the columns to Portuguese. The assignment is positional, so it relies on
# the file's column order: userId, movieId, rating, timestamp.
notas.columns = ["usuarioId", "filmeId", "nota", "momento"]
notas.head()

Unnamed: 0,usuarioId,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Summary statistics for every column of the ratings frame.
notas.describe()

Unnamed: 0,usuarioId,filmeId,nota,momento
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


## Criando a Recomendação

In [5]:
# Count how many times each movie was rated.
# Result index: filmeId; values: number of ratings, most-rated first.
notas["filmeId"].value_counts()

356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: filmeId, Length: 9724, dtype: int64

In [6]:
# Look up the title and genres of the movie whose id is 318.
filmes.loc[318]

titulos    Shawshank Redemption, The (1994)
generos                         Crime|Drama
Name: 318, dtype: object

In [7]:
# Keep the per-movie rating counts (indexed by filmeId) for later use.
total_de_votos = notas["filmeId"].value_counts()
total_de_votos.head()

356     329
318     317
296     307
593     279
2571    278
Name: filmeId, dtype: int64

In [8]:
# Attach the vote counts to the movies frame; pandas aligns the Series with
# `filmes` on the index (filmeId).
# NOTE(review): the new column displays as float (e.g. 215.0); presumably movies
# with no ratings end up as NaN after alignment. Confirm.
filmes["total_de_votos"] = total_de_votos
filmes.head()

Unnamed: 0_level_0,titulos,generos,total_de_votos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0


In [9]:
# List the 10 most-rated movies, ordered by total number of votes (descending).
filmes.sort_values("total_de_votos", ascending=False).head(10)

Unnamed: 0_level_0,titulos,generos,total_de_votos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0
110,Braveheart (1995),Action|Drama|War,237.0
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0
527,Schindler's List (1993),Drama|War,220.0


In [10]:
# Take another look at the individual movie ratings.
notas.head()

Unnamed: 0,usuarioId,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
# Group the ratings frame by movie. On its own this only builds a lazy GroupBy
# object (see the repr below); nothing is aggregated yet.
notas.groupby("filmeId")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6eff49d310>

In [12]:
# Mean rating per movie.
# Select the "nota" column before aggregating so that only ratings are averaged;
# averaging every column and then picking one also computes meaningless means of
# user ids and timestamps.
notas.groupby("filmeId")["nota"].mean()

filmeId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: nota, Length: 9724, dtype: float64

In [13]:
# Store the mean rating of each movie (Series indexed by filmeId, named "nota").
# The column is selected before .mean() so only the ratings are aggregated,
# rather than averaging user ids and timestamps and discarding them afterwards.
notas_medias = notas.groupby("filmeId")["nota"].mean()
notas_medias.head()

filmeId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: nota, dtype: float64

In [14]:
# Add the mean ratings to the movies frame (aligned on the filmeId index).
# Hypothesis being explored: sorting by popularity (total_de_votos) should also
# surface the movies people liked the most (nota_media).
filmes["nota_media"] = notas_medias
filmes.sort_values("total_de_votos", ascending=False).head(10)

Unnamed: 0_level_0,titulos,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0,3.75
110,Braveheart (1995),Action|Drama|War,237.0,4.031646
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
527,Schindler's List (1993),Drama|War,220.0,4.225


In [15]:
# Sort by mean rating instead: the higher the mean, the more likely people are to
# enjoy the movie.
# Caveat: the mean depends on the number of votes. A movie with a single 5.0
# rating reaches the top of this list.
filmes.sort_values("nota_media", ascending=False).head(10)

Unnamed: 0_level_0,titulos,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
88448,Paper Birds (Pájaros de papel) (2010),Comedy|Drama,1.0,5.0
100556,"Act of Killing, The (2012)",Documentary,1.0,5.0
143031,Jump In! (2007),Comedy|Drama|Romance,1.0,5.0
143511,Human (2015),Documentary,1.0,5.0
143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,1.0,5.0
6201,Lady Jane (1986),Drama|Romance,1.0,5.0
102217,Bill Hicks: Revelations (1993),Comedy,1.0,5.0
102084,Justice League: Doom (2012),Action|Animation|Fantasy,1.0,5.0
6192,Open Hearts (Elsker dig for evigt) (2002),Romance,1.0,5.0
145994,Formula of Love (1984),Comedy,1.0,5.0


In [16]:
# Keep only the movies that received at least 10 votes (boolean-mask selection).
filmes.loc[filmes["total_de_votos"] >= 10]

Unnamed: 0_level_0,titulos,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.920930
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429
6,Heat (1995),Action|Crime|Thriller,102.0,3.946078
...,...,...,...,...
174055,Dunkirk (2017),Action|Drama|Thriller|War,13.0,3.423077
176371,Blade Runner 2049 (2017),Sci-Fi,18.0,3.805556
177765,Coco (2017),Adventure|Animation|Children,13.0,3.538462
179819,Star Wars: The Last Jedi (2017),Action|Adventure|Fantasy|Sci-Fi,12.0,3.125000


In [18]:
# Keep only movies with at least 100 votes (the comparison is >=, so a movie with
# exactly 100 votes is included), then show the 10 with the highest mean rating.
filmes.query("total_de_votos >= 100").sort_values("nota_media", ascending=False).head(10)

Unnamed: 0_level_0,titulos,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062
2959,Fight Club (1999),Action|Crime|Drama|Thriller,218.0,4.272936
1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969
48516,"Departed, The (2006)",Crime|Drama|Thriller,107.0,4.252336
1213,Goodfellas (1990),Crime|Drama,126.0,4.25
912,Casablanca (1942),Drama|Romance,100.0,4.24
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,149.0,4.238255
50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204.0,4.237745
1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,142.0,4.232394
