<a href="https://colab.research.google.com/github/LucasMeirellesS/ItemKnnRecommendation/blob/main/ItemKnnRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [166]:
!pip install scikit-surprise



In [167]:
import os
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNWithMeans

In [168]:
class KnnRecommendation:
  def __init__(self):
    self._dataframe = None
    self.train_set = None
    self.valid_set = None
    self.valid_set_surprise = None
    self.train_set_surprise =None
    self.model = None

  def set_parquet(self, path:str):
    self._dataframe = pd.read_parquet(path)


  @property
  def dataframe(self):
    return self._dataframe



  def show(self, head = False, v = 5):
    if head:
      display(self.dataframe.head(v))

    else:
      display(self._dataframe)


  def change_index(self, column):
    self.dataframe.set_index(column, inplace=True)


  def time_split_ratings(self, timestamp_column:str = 'timestamp', train_size:float = 0.8):
    df = self.dataframe.sort_values(by=timestamp_column, ascending=True)
    self.train_set, self.valid_set = np.split(df, [int(train_size * df.shape[0])])
    self.train_set = self.train_set.rename({'user_id':'userID', 'item_id':'itemID'}, axis=1)
    self.valid_set = self.valid_set.rename({'user_id':'userID', 'item_id':'itemID'}, axis=1)


  def convert_sets(self, rating_scale:tuple = (1, 5)):
    reader = Reader(rating_scale = rating_scale)

    self.train_set_surprise = (
        Dataset
        .load_from_df(self.train_set[['userID', 'itemID', 'rating']], reader)
        .build_full_trainset()
    )


    self.valid_set_surprise = (
        Dataset
        .load_from_df(self.valid_set[['userID', 'itemID', 'rating']], reader)
        .build_full_trainset()
        .build_testset()
    )


  def knn_model(self):

    sim_options = {
      "name":"pearson_baseline",
      "user_based": False
    }

    model = KNNWithMeans(k=40, sim_options = sim_options, verbose=True)
    self.model = model.fit(self.train_set_surprise)



  def valid_set_prediction(self):

    user_id = self.valid_set['userID'].sample().tolist()[0]
    item_id = self.valid_set['itemID'].sample().tolist()[0]

    self.valid_set['prediction'] = self.valid_set.apply(
        lambda x: self.model.predict(uid=x['userID'], iid=x['itemID']).est,
        axis=1
    )


  def user_predictions(self, user_id, item_ids:np.array, n=20):
    df_predictions = pd.DataFrame(columns=['item_id', 'score'])
    for item_id in item_ids:
      prediction = self.model.predict(uid=user_id, iid=item_id).est
      df_predictions.loc[df_predictions.shape[0]] = [item_id, prediction]

    user_predictions = (
        df_predictions
        .sort_values(by='score', ascending=False)
        .head(n)
        .set_index('item_id')
    )

    return user_predictions


  def get_kn(self, item_id, item_df, k=10):
    iid = self.model.trainset.to_inner_iid(item_id)
    neighbor_iids = self.model.get_neighbors(iid, k)
    item_ids = [self.model.trainset.to_raw_iid(iid) for iid in neighbor_iids]
    title = item_df.loc[item_id]['title']
    print (f'{k} vizinhos mais próximos de "{title}" (ID = {item_id})')
    display(item_df[item_df.index.isin(item_ids)])

# Testing

In [169]:
# Instanciando a classe
ratings = KnnRecommendation()
# Carregando dataset
ratings.set_parquet('/content/ratings.parquet')
ratings.show(True)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [170]:
# Separando datasets de treino e validação
ratings.time_split_ratings()

In [171]:
# Observando dataset de treino
ratings.train_set

Unnamed: 0,userID,itemID,rating,timestamp
1000138,6040,858,4,956703932
1000153,6040,2384,4,956703954
999873,6040,593,5,956703954
1000007,6040,1961,4,956703977
1000192,6040,2019,5,956703977
...,...,...,...,...
314102,1875,802,4,975768718
314151,1875,892,4,975768719
314073,1875,440,4,975768738
314225,1875,509,4,975768738


In [172]:
# Observando dataset de validação
ratings.valid_set

Unnamed: 0,userID,itemID,rating,timestamp
314160,1875,1721,4,975768738
314032,1875,2621,4,975768748
94555,635,3095,5,975768756
94640,635,36,5,975768756
94578,635,608,5,975768756
...,...,...,...,...
825793,4958,2399,1,1046454338
825438,4958,1407,5,1046454443
825724,4958,3264,4,1046454548
825731,4958,2634,3,1046454548


In [173]:
# Convertendo os datasets em datasets para ser utilizado na biblioteca surprise para a criação do modelo
ratings.convert_sets()

In [174]:
# Criando modelo
ratings.knn_model()

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [175]:
# Criando coluna de predição para os dados de validação
ratings.valid_set_prediction()

In [176]:
# Observando set de validação
ratings.valid_set

Unnamed: 0,userID,itemID,rating,timestamp,prediction
314160,1875,1721,4,975768738,3.624856
314032,1875,2621,4,975768748,3.322002
94555,635,3095,5,975768756,4.688462
94640,635,36,5,975768756,4.192956
94578,635,608,5,975768756,4.303227
...,...,...,...,...,...
825793,4958,2399,1,1046454338,2.561691
825438,4958,1407,5,1046454443,3.552168
825724,4958,3264,4,1046454548,3.270295
825731,4958,2634,3,1046454548,2.754026


In [177]:
user_id = 1875
movies = KnnRecommendation()
movies.set_parquet('/content/movies.parquet')
movies.change_index('item_id')
recommendable_items = movies.dataframe.index
# Observando quais ids são mais parecidos com o id selecionado
ratings.user_predictions(user_id, recommendable_items, n =11)

Unnamed: 0_level_0,score
item_id,Unnamed: 1_level_1
1830.0,5.0
989.0,5.0
3522.0,5.0
3881.0,5.0
1471.0,5.0
3607.0,5.0
3382.0,5.0
3172.0,5.0
3656.0,5.0
557.0,5.0


In [178]:
#item_id = 1     # Toy Story
#item_id = 1356  # Star Trek: First Contact
item_id = 260   # Star Wars: Episode IV - A New Hope
# item_id = 3578  # Gladiator
ratings.get_kn(item_id, movies.dataframe)

10 vizinhos mais próximos de "Star Wars: Episode IV - A New Hope (1977)" (ID = 260)


Unnamed: 0_level_0,title,genres
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1198,Raiders of the Lost Ark (1981),Action|Adventure
1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi
2640,Superman (1978),Action|Adventure|Sci-Fi
2716,Ghostbusters (1984),Comedy|Horror
3507,"Odd Couple, The (1968)",Comedy
3508,"Outlaw Josey Wales, The (1976)",Western
