# Modelagem
Neste notebook iremos desenvolver, testar e validas os modelos de recomendação musical, nesta MVP nossa maior preocupação não é a acurácia e sim o bom entendimento dos modelos e como melhorar-los.
Para tanto testaremos três técnicas:
* Popularidade
* Similaridade
* Filtragem colaborativa com SVD

Utilizaremos como direcional NSM (North Star Metric) uma relação de "peso" para a música entre a quantidade de vezes que aquele usuario ouviu e as vezes ouvidas no geral.



In [3]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
df = pd.read_parquet(r'data/processed_recommendation_data.parquet')

 ## Criando nossa north metric star

Nort star metric: definição de uma métrica de sucesso ou não para se guiar

vezes que o usuario ouviu / quantidade de vezes ouvidas

In [5]:
df['count_plays'] = df.groupby('id_tracks')['plays'].sum().reset_index()['plays']

In [6]:
df['count_plays'] = df['count_plays'].fillna(1)

In [7]:
C = df['plays'].mean()
m = df['count_plays'].quantile(0.95)

In [8]:
def weighted_rating(x):
    v = x['plays']
    R = x['count_plays']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
df['wr'] = df.apply(weighted_rating, axis=1)

In [10]:
df = df.sort_values('wr', ascending=False)
df

Unnamed: 0,id_date,user_id,id_tracks,plays,holiday,id_artist,id_genre,Feature1,Feature2,Feature3,...,Feature7,Feature8,Feature9,Feature10,acima_media_track,abaixo_media_track,acima_media_artist,abaixo_media_artist,count_plays,wr
3422,2022-02-08,7,1577,14,1,267.0,2.0,34.0,768.0,1091.0,...,230.0,1209.0,946.0,330.0,1,0,1,0,136.0,36.974972
1522,2021-10-24,38,3258,19,1,145.0,1.0,121.0,499.0,666.0,...,222.0,1884.0,686.0,388.0,1,0,1,0,95.0,32.934049
1808,2021-11-10,43,2417,18,1,392.0,4.0,210.0,870.0,696.0,...,237.0,711.0,198.0,585.0,1,0,1,0,97.0,32.557645
379,2021-08-20,36,1843,19,1,272.0,2.0,138.0,701.0,649.0,...,250.0,2291.0,1876.0,155.0,1,0,1,0,91.0,31.852198
2113,2021-11-27,11,2293,18,1,153.0,3.0,112.0,784.0,642.0,...,239.0,1625.0,1159.0,323.0,1,0,1,0,92.0,31.258006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,2022-03-30,15,1758,19,1,444.0,3.0,86.0,672.0,718.0,...,230.0,1244.0,440.0,1487.0,1,0,1,0,1.0,7.510561
11914,2023-06-12,3,958,19,1,304.0,2.0,127.0,227.0,715.0,...,238.0,1779.0,4141.0,1029.0,1,0,1,0,1.0,7.510561
9560,2023-01-28,36,2676,19,1,135.0,3.0,158.0,833.0,748.0,...,226.0,1989.0,3126.0,1877.0,1,0,1,0,1.0,7.510561
4015,2022-03-16,2,2705,19,1,82.0,3.0,149.0,697.0,1205.0,...,228.0,1454.0,1015.0,611.0,1,0,1,0,1.0,7.510561


## Content Based Recommender




In [11]:
interactions_train_df, interactions_test_df = train_test_split(df,
                                   stratify=df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 10220
# interactions on Test set: 2556


## Modelo de Popularidade

In [12]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = df.set_index('user_id')
interactions_train_indexed_df = interactions_train_df.set_index('user_id')
interactions_test_indexed_df = interactions_test_df.set_index('user_id')

In [13]:
#Computes the most popular items
item_popularity_df = df.groupby('id_tracks')['wr'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,id_tracks,wr
0,3564,120.492326
1,367,109.493559
2,447,105.42775
3,494,104.261963
4,48,103.369389
5,3857,102.706866
6,2665,101.480361
7,950,99.96618
8,2471,99.909894
9,379,99.822929


In [14]:

class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df.sort_values('wr', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'id_tracks', 
                                                          right_on = 'id_tracks')[['wr', 'id_tracks']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, df)

In [39]:
user_ids = df['user_id']

In [16]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['id_tracks']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [19]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(df['id_tracks'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['id_tracks']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['id_tracks'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['id_tracks'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['id_tracks'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['id_tracks'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 20)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 30)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
    
model_evaluator = ModelEvaluator()    

In [20]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...


since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)


48 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.3363564668769716, 'recall@10': 0.47436908517350157}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
24,25,32,66,0.378788,0.484848,19
16,14,26,65,0.215385,0.4,31
15,26,32,62,0.419355,0.516129,10
10,26,33,62,0.419355,0.532258,13
44,25,28,61,0.409836,0.459016,4
2,17,26,58,0.293103,0.448276,49
7,25,33,57,0.438596,0.578947,37
33,21,29,56,0.375,0.517857,23
27,19,28,56,0.339286,0.5,34
23,21,31,56,0.375,0.553571,36


In [21]:
predict = df.groupby(['user_id']).apply(lambda grp: popularity_model.recommend_items(grp).head(2)).reset_index()
predict.columns = ['user_id', 'top1_2', 'id_tracks', 'wr']

In [22]:
predict.to_parquet(r'data/predict_recommendation_data.parquet')

Temos em média para recall hit 20: 33%% e recall 30: 47%

## Similarity method Cosine

In [24]:
df_student = df.drop('id_date', axis = 1).set_index(['user_id', 'id_tracks'])


In [25]:
sim = pd.DataFrame(cosine_similarity(df_student), 
                  index=df_student.index, columns=df_student.index).reset_index()



  index=df_student.index, columns=df_student.index).reset_index()
  index=df_student.index, columns=df_student.index).reset_index()


In [26]:
sim = sim.median().reset_index()

In [27]:
sim[sim.id_tracks != ''].sort_values(by = 'user_id')

Unnamed: 0,user_id,id_tracks,0
12114,1,672,0.775708
1034,1,2973,0.886527
8683,1,3581,0.847653
6086,1,2037,0.793047
6090,1,3413,0.863258
...,...,...,...
7156,49,968,0.862175
531,49,3153,0.830751
3197,49,2463,0.776300
4158,49,1604,0.855327


In [28]:
df_final = df[['id_tracks', 'user_id', 'wr']].merge(sim)

In [29]:
df_final.sort_values(by = ['id_tracks', 0], ascending=False)#.groupby(['user_id','id_tracks'])[['wr', 0]].head(3)

Unnamed: 0,id_tracks,user_id,wr,0
2398,3999,9,12.064301,0.727841
2725,3999,10,11.456394,0.727817
13222,3999,36,7.510561,0.727690
12364,3999,46,7.701347,0.727687
8003,3999,28,8.719273,0.727666
...,...,...,...,...
9250,1,26,8.467215,0.691689
6603,1,41,9.130968,0.691687
12808,1,46,7.604577,0.691687
350,1,43,19.595662,0.691621


O modelo de similaridade não foi testado em predição por não usar aprendizado de máquina, porem entendemos como uma estratégia eficiente de recomendação baseada nos gostos do usuário

## Collaborative Filtering mode

In [30]:
summarize_data_train = interactions_train_df.drop('id_date', axis = 1).groupby(['user_id', 'id_tracks'])['plays'].mean().reset_index()

In [31]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = summarize_data_train.pivot(index='user_id', 
                                                          columns='id_tracks', 
                                                          values='plays').fillna(0)

users_items_pivot_matrix_df.head(10)

id_tracks,1,4,5,6,7,8,9,10,11,12,...,3988,3989,3990,3991,3992,3994,3996,3997,3998,3999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,18.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.5,5.0
10,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0


In [32]:
minimum_rating = min(df['wr'].values)
maximum_rating = max(df['wr'].values)
print(minimum_rating,maximum_rating)

7.510561395142583 36.97497223001941


In [33]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix_df)
users_items_pivot_sparse_matrix

<49x3672 sparse matrix of type '<class 'numpy.float64'>'
	with 9978 stored elements in Compressed Sparse Row format>

In [34]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [35]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [36]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 0.13807785,  1.23054746,  1.00909608, ..., -0.79154213,
        -0.93292332, -1.92852889],
       [ 3.69382057,  0.25396154,  0.47581564, ..., -0.61846761,
         0.38011216,  1.0596339 ],
       [ 1.43286586, -0.38451267, -0.13269151, ..., -3.04267287,
        -1.36919556,  2.00231154],
       ...,
       [ 1.58430296,  0.31926148,  0.33320907, ...,  0.53264677,
         1.20428311,  1.40522256],
       [ 0.85673527,  0.24110749,  0.35729737, ...,  0.49811056,
        -0.1670869 , -0.1524198 ],
       [ 0.01765075,  2.05460789, -0.06331902, ..., -0.62505709,
         1.33467525,  0.73457151]])

In [37]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [46]:
users_ids = df['user_id'].unique()

In [47]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,7,38,43,36,11,10,46,33,49,14,...,12,18,24,3,22,47,35,32,44,23
id_tracks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.259844,0.386162,0.305841,0.343489,0.310708,0.287487,0.224125,0.268664,0.266874,0.225381,...,0.303563,0.351988,0.242435,0.524772,0.277708,0.068746,0.397989,0.311221,0.285374,0.255565
4,0.298654,0.26396,0.241279,0.260605,0.252837,0.266487,0.272213,0.267718,0.281592,0.287922,...,0.265945,0.24985,0.277605,0.232706,0.251468,0.24388,0.2659,0.26628,0.263504,0.327929
5,0.290787,0.271842,0.250225,0.262488,0.260548,0.262354,0.252176,0.260759,0.255232,0.248835,...,0.269608,0.262045,0.262096,0.265489,0.260925,0.264152,0.261571,0.266776,0.267631,0.252689
6,0.224355,0.202747,0.284499,0.200914,0.234792,0.288593,0.197913,0.278001,0.288998,0.409522,...,0.22951,0.226183,0.302255,0.275537,0.282842,0.24854,0.273163,0.245988,0.257516,0.25885
7,0.259161,0.262408,0.254884,0.252836,0.257761,0.257742,0.256899,0.25742,0.258931,0.251958,...,0.25658,0.257597,0.254852,0.265803,0.254009,0.259661,0.248972,0.255479,0.259232,0.257351
8,0.339535,0.319037,0.230114,0.269632,0.284993,0.256635,0.281311,0.278596,0.288859,0.250273,...,0.321721,0.266436,0.265718,0.338236,0.272997,0.238109,0.242178,0.271989,0.246865,0.219234
9,0.241477,0.323022,0.25528,0.272501,0.276045,0.253059,0.309819,0.263839,0.285814,0.230558,...,0.28501,0.2712,0.238375,0.311663,0.257913,0.278908,0.221503,0.247662,0.256733,0.264543
10,0.241354,0.213738,0.230154,0.25016,0.246874,0.280841,0.253166,0.28646,0.239822,0.269311,...,0.271245,0.226087,0.274935,0.262474,0.306871,0.273271,0.257136,0.258827,0.24359,0.271528
11,0.17439,0.239448,0.283938,0.252358,0.249071,0.295833,0.231168,0.28753,0.27969,0.552125,...,0.254527,0.281503,0.278059,0.228197,0.266242,0.26262,0.27457,0.264663,0.262717,0.253762
12,0.277972,0.25035,0.256217,0.253083,0.257392,0.254372,0.254649,0.257893,0.261147,0.26333,...,0.258917,0.252673,0.261173,0.261574,0.256878,0.246762,0.257482,0.256856,0.249435,0.246424


In [48]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'wr'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions.sort_values('wr', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'id_tracks', 
                                                          right_on = 'id_tracks')[['plays', 'id_tracks']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, df)

In [49]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...


since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)

48 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.1833596214511041, 'recall@10': 0.27917981072555204}


since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
24,15,22,66,0.227273,0.333333,19
16,9,15,65,0.138462,0.230769,31
15,12,20,62,0.193548,0.322581,10
10,15,21,62,0.241935,0.33871,13
44,15,22,61,0.245902,0.360656,4
2,10,15,58,0.172414,0.258621,49
7,8,16,57,0.140351,0.280702,37
33,3,11,56,0.053571,0.196429,23
27,5,5,56,0.089286,0.089286,34
23,5,10,56,0.089286,0.178571,36


O modelo colaborativo com SVD apresentou recall 20 de 18% e recall 30 de 27%

## Conclusão

Neste notebook testamos três diferentes formas de recomendação, das quais duas foram baseadas no conteudo e uma baseada em filtragem colaborativa, o modelo que melhor desempenhou foi popularidade com 47% de recall, este resultado indica uma oportunidade de melhoria e re direcionamento do modelo, inclusive mesclar o calculo de similaridade com os proprios modelos executados.