<a href="https://colab.research.google.com/github/Guilherm0/Recommendation-System/blob/main/RecomendacaoFilmes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dinâmica: Sistemas de Recomendação

A partir da base de dados de filmes (movies) e de avaliações (ratings), faça:

*   Uma lista dos dez filmes mais bem avaliados
*   Uma lista com os dez filmes que obtiveram o maior número de avaliações
*   Uma lista que retorne os dez filmes que possuem a maior similaridade com um selecionado



# Importando as bibliotecas e base de dados 

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from google.colab import files

## Upload do arquivo `movies.parquet`

In [None]:
%%time
_ = files.upload()

Saving movies.parquet to movies.parquet
CPU times: user 350 ms, sys: 34.8 ms, total: 384 ms
Wall time: 27.3 s


In [None]:
# Load the movie catalogue from the parquet file uploaded in the previous cell.
df_movies = pd.read_parquet('./movies.parquet')
# Show the last rows as a quick check of the loaded columns.
df_movies.tail()

Unnamed: 0,item_id,title,genres
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


## Upload do arquivo `ratings.parquet`

In [None]:
%%time
_ = files.upload()

Saving ratings.parquet to ratings.parquet
CPU times: user 856 ms, sys: 114 ms, total: 969 ms
Wall time: 53.1 s


In [None]:
# Load the ratings and discard the `timestamp` column, which the visible cells
# never use. `drop` returns a new frame, so a single chained assignment is
# enough: no `inplace=True`, and `columns=` already implies the axis.
df_ratings = pd.read_parquet('ratings.parquet').drop(columns='timestamp')
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4


# Lista dos 10 filmes mais bem avaliados

In [None]:
def recomenda_top_n_melhores_avaliados(df_ratings:pd.DataFrame, n:int, aggregation_fn='mean', min_ratings:int=0) -> pd.DataFrame:
  """Recommend the N items with the highest aggregated rating.

  Args:
    df_ratings: frame with at least the columns `item_id` and `rating`.
    n: number of items to return.
    aggregation_fn: aggregation applied to each item's ratings
      (anything accepted by `GroupBy.agg`, e.g. 'mean' or 'sum').
    min_ratings: minimum number of non-null ratings an item needs in order
      to be ranked. The default (0) keeps every item, matching the previous
      behaviour; a positive value prevents items with a handful of ratings
      from dominating a 'mean' ranking.

  Returns:
    A frame with the columns `item_id` and `score`, sorted by `score` in
    descending order and truncated to `n` rows.
  """
  ratings_by_item = df_ratings.groupby('item_id')['rating']
  scores = ratings_by_item.agg(aggregation_fn)

  if min_ratings > 0:
    # Both series are indexed by item_id, so the boolean mask aligns row by row.
    scores = scores[ratings_by_item.count() >= min_ratings]

  df_top_n = (
    scores
    .reset_index()
    .rename({'rating': 'score'}, axis=1)
    .sort_values(by='score', ascending=False)
    .head(n)
  )

  return df_top_n

# Rank by mean rating, which is what "best rated" asks for; summing the ratings
# mostly rewards titles that simply have many ratings. Sparsely rated titles
# are filtered out first so that a single 5-star vote cannot top the list.
MIN_RATINGS = 100  # minimum number of ratings for a title to be ranked; tune as needed
ratings_per_item = df_ratings.groupby('item_id')['rating'].transform('count')
recommendations = recomenda_top_n_melhores_avaliados(
    df_ratings[ratings_per_item >= MIN_RATINGS], n=10, aggregation_fn='mean'
)
recommendations

Unnamed: 0,item_id,score
2651,2858,14800
253,260,13321
1106,1196,12836
1120,1210,11598
1848,2028,11507
1108,1198,11257
579,593,11219
2374,2571,11178
2557,2762,10835
575,589,10751


In [None]:
# Attach title and genres to the ranked items by joining on `item_id`.
recommendations.merge(df_movies, on='item_id')

Unnamed: 0,item_id,score,title,genres
0,2858,14800,American Beauty (1999),Comedy|Drama
1,260,13321,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,12836,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1210,11598,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
4,2028,11507,Saving Private Ryan (1998),Action|Drama|War
5,1198,11257,Raiders of the Lost Ark (1981),Action|Adventure
6,593,11219,"Silence of the Lambs, The (1991)",Drama|Thriller
7,2571,11178,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8,2762,10835,"Sixth Sense, The (1999)",Thriller
9,589,10751,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller


# Lista dos 10 filmes com maior número de avaliações

In [None]:
def recomenda_top_n_mais_avaliados(df_ratings:pd.DataFrame, n:int, aggregation_fn='count') -> pd.DataFrame:
  """Recommend the N items with the most ratings.

  The `user_id` entries of each `item_id` are aggregated with
  `aggregation_fn` (a count by default); the result is returned as a frame
  with the columns `item_id` and `score`, highest score first, limited to
  `n` rows.
  """
  # One aggregated value per item, indexed by item_id.
  per_item = df_ratings.groupby('item_id')['user_id'].agg(aggregation_fn)

  # Back to a flat frame, with the aggregated column exposed as `score`.
  scored = per_item.reset_index().rename(columns={'user_id': 'score'})

  ranked = scored.sort_values(by='score', ascending=False)
  return ranked.head(n)

# Top 10 items by number of ratings (count of `user_id` entries per item).
recommendations = recomenda_top_n_mais_avaliados(df_ratings, n=10, aggregation_fn='count')
recommendations

Unnamed: 0,item_id,score
2651,2858,3428
253,260,2991
1106,1196,2990
1120,1210,2883
466,480,2672
1848,2028,2653
575,589,2649
2374,2571,2590
1178,1270,2583
579,593,2578


In [None]:
# Add title and genres so the most-rated items are readable.
recommendations.merge(df_movies, on='item_id')

Unnamed: 0,item_id,score,title,genres
0,2858,3428,American Beauty (1999),Comedy|Drama
1,260,2991,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2990,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1210,2883,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
4,480,2672,Jurassic Park (1993),Action|Adventure|Sci-Fi
5,2028,2653,Saving Private Ryan (1998),Action|Drama|War
6,589,2649,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
7,2571,2590,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8,1270,2583,Back to the Future (1985),Comedy|Sci-Fi
9,593,2578,"Silence of the Lambs, The (1991)",Drama|Thriller


# Lista dos 10 filmes mais similares a um selecionado


## Obtendo matriz de preferências

In [None]:
# Number of users kept when selecting the most active raters.
n_users = 100

# Count the ratings given by each user, order from most to least active and
# keep the ids of the first `n_users`.
ratings_per_user = df_ratings.groupby('user_id')['item_id'].count()
most_active_users = ratings_per_user.sort_values(ascending=False).head(n_users).index

In [None]:
# Wide user x item matrix of ratings; user/item pairs without a rating become 0.
df_preference_matrix = pd.pivot(
  df_ratings, index='user_id', columns='item_id', values='rating'
).fillna(0)

df_preference_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Obtendo matriz de similaridades

A matriz de similaridade pode ser definida de diversas formas:

- Podemos utilizar diferentes funções de similaridade
- Podemos considerar somente determinados usuários da matriz de preferências

In [None]:
def get_similarity_matrix(df_preference_matrix:pd.DataFrame, users:np.array=None) -> pd.DataFrame:
  """Build the item-item cosine similarity matrix from the preference matrix.

  Args:
    df_preference_matrix: frame with one row per user and one column per item.
    users: row labels to restrict the computation to; when None, every row
      of the preference matrix is used.

  Returns:
    A square frame whose index and columns are the item labels of
    `df_preference_matrix`, holding the cosine similarity of each item pair.
  """
  selected_users = df_preference_matrix.index if users is None else users

  # Transpose so that every row handed to cosine_similarity is one item vector.
  item_vectors = df_preference_matrix.loc[selected_users].T

  labels = df_preference_matrix.columns
  return pd.DataFrame(cosine_similarity(item_vectors), index=labels, columns=labels)

# Item-item similarities computed only from the rows of the most active users.
similarity_matrix = get_similarity_matrix(df_preference_matrix, users=most_active_users)
similarity_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.816004,0.655785,0.486212,0.577512,0.740496,0.674944,0.266086,0.416212,0.779954,...,0.226201,0.096449,0.152672,0.274859,0.253344,0.626519,0.500270,0.295220,0.094576,0.524326
2,0.816004,1.000000,0.645950,0.491628,0.604820,0.649039,0.613204,0.236849,0.457723,0.745428,...,0.132054,0.000000,0.160605,0.331589,0.216851,0.587297,0.375442,0.281139,0.055714,0.502790
3,0.655785,0.645950,1.000000,0.314722,0.521498,0.550389,0.571353,0.245097,0.503411,0.638633,...,0.133682,0.131812,0.158972,0.239769,0.138100,0.553326,0.378960,0.153701,0.086168,0.383724
4,0.486212,0.491628,0.314722,1.000000,0.490173,0.445300,0.470149,0.109492,0.258621,0.414018,...,0.050872,0.000000,0.018149,0.065695,0.108110,0.399568,0.306234,0.116365,0.141658,0.337185
5,0.577512,0.604820,0.521498,0.490173,1.000000,0.373896,0.590999,0.187179,0.369100,0.552587,...,0.067475,0.106449,0.112335,0.212996,0.043017,0.492618,0.169239,0.127393,0.093944,0.401446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.626519,0.587297,0.553326,0.399568,0.492618,0.563593,0.562125,0.358796,0.328813,0.605495,...,0.301350,0.151729,0.194429,0.418595,0.344047,1.000000,0.636236,0.380621,0.200856,0.694343
3949,0.500270,0.375442,0.378960,0.306234,0.169239,0.507773,0.380928,0.284378,0.179760,0.430708,...,0.302327,0.198732,0.143808,0.187979,0.349795,0.636236,1.000000,0.482982,0.194873,0.513670
3950,0.295220,0.281139,0.153701,0.116365,0.127393,0.280424,0.202107,0.312581,0.027756,0.298674,...,0.124484,0.368230,0.000000,0.167454,0.396816,0.380621,0.482982,1.000000,0.288863,0.397927
3951,0.094576,0.055714,0.086168,0.141658,0.093944,0.121919,0.115920,0.000000,0.000000,0.152708,...,0.232048,0.000000,0.000000,0.023783,0.052835,0.200856,0.194873,0.288863,1.000000,0.121104


## Gerando recomendações

In [None]:
def recomenda_top_n_similares(item_id:int, similarity_matrix:pd.DataFrame, n:int, include_self:bool=False) -> pd.DataFrame:
  """Recommend the N items most similar to `item_id`.

  Args:
    item_id: label of the reference item; must be in `similarity_matrix.index`.
    similarity_matrix: square item-item similarity frame.
    n: number of similar items to return.
    include_self: when True, the reference item is kept in the ranking
      (previous behaviour). By default it is removed, so that the result holds
      `n` items other than the reference item.

  Returns:
    A frame with one row per recommended item, the similarity exposed in the
    `score` column, sorted by `score` in descending order.

  Raises:
    AssertionError: if `item_id` is not in the similarity matrix index.
  """
  assert item_id in similarity_matrix.index, f'item_id {item_id} not found in similarity matrix'
  item_similarities = similarity_matrix.loc[item_id]

  if not include_self:
    # An item should not be recommended as similar to itself.
    item_similarities = item_similarities.drop(labels=item_id, errors='ignore')

  top_n_similars = (
      item_similarities
      .sort_values(ascending=False)
      .head(n)
      .reset_index()
      # The series is named after the row label, so that column holds the scores.
      .rename({item_id: 'score'}, axis=1)
  )
  return top_n_similars

# Example query: the 10 highest-scoring rows of the similarity matrix for item 3951.
recommendations = recomenda_top_n_similares(item_id=3951, similarity_matrix=similarity_matrix, n=10)
recommendations

Unnamed: 0,item_id,score
0,3951,1.0
1,3119,0.784465
2,3652,0.784465
3,1773,0.761042
4,3855,0.719023
5,3569,0.655521
6,1502,0.653846
7,2481,0.638153
8,623,0.588348
9,807,0.588348


In [None]:
# Join with the catalogue to show the title and genres of each similar item.
recommendations.merge(df_movies, on='item_id')

Unnamed: 0,item_id,score,title,genres
0,3951,1.0,Two Family House (2000),Drama
1,3119,0.784465,Bay of Blood (Reazione a catena) (1971),Horror
2,3652,0.784465,City of the Living Dead (Paura nella città dei...,Horror
3,1773,0.761042,Tokyo Fist (1995),Action|Drama
4,3855,0.719023,"Affair of Love, An (Une Liaison Pornographique...",Drama|Romance
5,3569,0.655521,"Idiots, The (Idioterne) (1998)",Comedy|Drama
6,1502,0.653846,Kissed (1996),Romance
7,2481,0.638153,My Name Is Joe (1998),Drama|Romance
8,623,0.588348,"Modern Affair, A (1995)",Romance
9,807,0.588348,"Rendezvous in Paris (Rendez-vous de Paris, Les...",Comedy|Romance


## Playground

In [None]:
# Browse the first catalogue rows to pick an `item_id` for the next cell.
df_movies.head(20)

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Reference movie for the similarity query; swap in one of the commented
# alternatives below to explore other titles.
item_id = 1 # Toy Story
# item_id = 2 # Jumanji
# item_id = 17 # Sense and Sensibility

# Rank the most similar items, then join with the catalogue for titles and genres.
recommendations = recomenda_top_n_similares(item_id=item_id, similarity_matrix=similarity_matrix, n=10)
recommendations.merge(df_movies, on='item_id')


Unnamed: 0,item_id,score,title,genres
0,1,1.0,Toy Story (1995),Animation|Children's|Comedy
1,1210,0.921122,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
2,260,0.92111,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
3,1270,0.917306,Back to the Future (1985),Comedy|Sci-Fi
4,1200,0.910321,Aliens (1986),Action|Sci-Fi|Thriller|War
5,1196,0.910242,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
6,2716,0.908923,Ghostbusters (1984),Comedy|Horror
7,480,0.904979,Jurassic Park (1993),Action|Adventure|Sci-Fi
8,1580,0.904129,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
9,1073,0.90271,Willy Wonka and the Chocolate Factory (1971),Adventure|Children's|Comedy|Fantasy
