# Projet 9 - My Content - systèmes de recommandation
<br>nb : les classes développées dans les scripts fournis sont ré-écrites pour information dans certaines cellules de ce document - il est naturellement prévu de les appeler depuis leur fichier .py et d'en utiliser les fonctions et attributs, leur code apparaît ici uniquement pour des raisons didactiques. <br>
### [I : Import et prétraitements](#partie_1)
### [II - Classe d'évaluation des algorithmes de recommandation par le recall](#partie_2)
### [III - Evaluation du recommender content-based](#partie_3)
### [IV - Evaluation du collaborative filtering](#partie_4)
### [V - Popularity rating](#partie_5)

<br><a id='partie_1'></a><br>
### I - Imports et prétraitements : <br>

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
# from importlib import reload
# reload(Content_based_recommender.CBF_Recommender())

In [2]:
%run loader.py

In [2]:
initial_clicks_db.user_id.unique()

array([0, 1, 2, ..., 322894, 322895, 322896], dtype=object)

Filtrage des utilisateurs sur la base du nombre d'interactions, puis échantillonnage de la moitié des id : 

In [3]:
# Minimum required number of interactions per user
threshold = 3
# Interactions count per user
vc = initial_clicks_db.user_id.value_counts()
# Keeping ids above interactions threshold
vc = vc[vc>=threshold]
# converting series to list
ids_with_at_least_n_clicks = vc.index.tolist()

In [4]:
import random
# Create filtered base df
filtered_initial_clicks_db = initial_clicks_db[initial_clicks_db.user_id.isin(ids_with_at_least_n_clicks)]
filtered_user_ids = filtered_initial_clicks_db.user_id.unique().tolist()
# Split data randomly : 
excluded_user_ids = random.sample(filtered_user_ids, len(filtered_user_ids)//2 )
leftover_data = filtered_initial_clicks_db[filtered_initial_clicks_db.user_id.isin(excluded_user_ids)]
filtered_initial_clicks_db = filtered_initial_clicks_db[~filtered_initial_clicks_db.user_id.isin(excluded_user_ids)]
# Recreate dependant tables : 
filtered_clicks_db = filtered_initial_clicks_db.set_index('user_id').sort_index(ascending=True)
filtered_dup_list = filtered_initial_clicks_db.duplicated(subset=['user_id','click_article_id'])

Vérification des dimensions des 2 tableaux obtenus : 

In [5]:
leftover_data.shape

(1393039, 12)

In [6]:
filtered_initial_clicks_db.shape

(1392286, 12)

Niveaux de consommations des utilisateurs et doublons, pour information - résultat actuel sur données non filtrées : 

In [22]:
article_consumptions = initial_clicks_db.loc[:,['user_id','click_article_id']].groupby('user_id').agg('count')
article_consumptions = pd.DataFrame(article_consumptions.value_counts()).rename(columns={0: 'Number_of_users_having_consumed_n_articles'})
article_consumptions.index.names = ['Articles_consumed (n)']
article_consumptions

Unnamed: 0_level_0,Number_of_users_having_consumed_n_articles
Articles_consumed (n),Unnamed: 1_level_1
6,4843
8,3616
5,3201
7,3166
9,2846
...,...
264,1
263,1
257,1
255,1


In [11]:
dup_list

0          False
1          False
2          False
3          False
4          False
           ...  
2988176    False
2988177    False
2988178    False
2988179    False
2988180    False
Length: 2988181, dtype: bool

<br><a id='partie_2'></a><br>
## II - Classe d'évaluation des algorithmes de recommandation par le recall : <br>

In [112]:
from sklearn.model_selection import train_test_split
import random
from IPython.display import clear_output

EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

def get_items_interacted(user_id, clicks_db):
    interacted_items = clicks_db.loc[user_id]['click_article_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

class ModelEvaluation:
    
    def __init__(self, clicks_df_full, trainset, testset, article_features_df): 
        self.user_indexed_full_df = clicks_df_full
        self.articles_df=article_features_df
        self.user_indexed_train_df = trainset
        self.user_indexed_test_df = testset
        
    def get_not_interacted_items_sample(self, user_id, sample_size, seed=42):
        interacted_items = get_items_interacted(user_id, self.user_indexed_full_df)
        all_items = set(self.articles_df.index)
        non_interacted_items = all_items - interacted_items
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, top_n):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, top_n))
            return hit, index

    def evaluate_model_for_user(self, model, user_id):
        #Getting the items in test set
        interacted_values_testset = self.user_indexed_test_df.loc[user_id]
        if type(interacted_values_testset['click_article_id']) == pd.Series:
            user_interacted_items_testset = set(interacted_values_testset['click_article_id'])
        else:
            user_interacted_items_testset = set([int(interacted_values_testset['click_article_id'])])  
        interacted_items_count_testset = len(user_interacted_items_testset)

        #Getting a ranked recommendation list from a model for a given user
        user_recs_df = self.articles_df.loc[model.article_selection(user_id, top_n=20),:]
        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in user_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(user_id, 
                                                                          sample_size=100, 
                                                                          seed=item_id%(2**32))
            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))
            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = user_recs_df[user_recs_df.index.isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df.index.tolist()
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        user_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return user_metrics

    def evaluate_model(self, model, user_sample_size=None):
        #print('Running evaluation for users')
        people_metrics = []
        if user_sample_size:
            for idx, user_id in enumerate(random.sample(list(self.user_indexed_test_df.index.unique().values), user_sample_size)):
                print(f'now processing user {idx} of {user_sample_size}')
                clear_output(wait=True)
                user_metrics = self.evaluate_model_for_user(model, user_id) 
                user_metrics['_user_id'] = user_id
                people_metrics.append(user_metrics) 
        else : 
            for idx, user_id in enumerate(list(self.user_indexed_test_df.index.unique().values, )):
                print(f'now processing user {idx} of {len(self.user_indexed_test_df.index.unique().values)}')
                clear_output(wait=True)
                user_metrics = self.evaluate_model_for_user(model, user_id) 
                user_metrics['_user_id'] = user_id
                people_metrics.append(user_metrics)       
        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.model_name,
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df

<br><a id='partie_3'></a><br>
### III - Evaluation du recommender content-based : <br>

Train/test split : 

In [8]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(filtered_initial_clicks_db,
                                   stratify=filtered_initial_clicks_db['user_id'], 
                                   test_size=0.15,
                                   random_state=42)
user_indexed_train_df = train_df.set_index('user_id')
user_indexed_test_df = test_df.set_index('user_id')

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity


class CBF_Recommender():

    def __init__(self, initial_clicks_db, user_indexed_clicks_db, article_features):
        self.model_name = 'Content-based filtering'
        self.initial_clicks_db = initial_clicks_db
        self.clicks_db = user_indexed_clicks_db
        self.article_features = article_features
        self.dup_list = self.initial_clicks_db.duplicated(subset=['user_id','click_article_id']) 
    
    def cb_article_weights(self):
        clicks_db = self.initial_clicks_db.copy()
        clicks_db.loc[:,'cbr_weight'] = 1
        clicks_db.cbr_weight[self.dup_list] = clicks_db[self.dup_list].loc[:,'cbr_weight'].apply(lambda x : x + 2)
        return clicks_db.loc[:,'cbr_weight']

    def build_users_profile(self, user_id):
        clicks_weights = self.initial_clicks_db.copy()
        clicks_weights.loc[:,'weights'] = self.cb_article_weights()
        user_clicks_df = clicks_weights[clicks_weights.loc[:,'user_id'] == user_id]
        user_item_profiles = self.article_features.loc[user_clicks_df.drop_duplicates(subset='click_article_id')['click_article_id'],:]
        # print(f'user item profiles shape : {user_item_profiles.shape}')
        user_weights = np.array(user_clicks_df.groupby('click_article_id').agg('sum').loc[:,'weights']).reshape(-1,1)
        # print(f'user weights shape : {user_weights.shape}')
        weighted_user_item_profiles = user_item_profiles.multiply(user_weights)
        user_profile_norm = pd.DataFrame(normalize(weighted_user_item_profiles), index = [user_item_profiles.index], columns=user_item_profiles.columns)
        return user_profile_norm

    def build_users_profiles(self): 
        clicks_db = self.clicks_db.copy()
        interactions_indexed_df = clicks_db[clicks_db['click_article_id'] \
                                                       .isin(self.article_features.index)]
        user_profiles = {}
        for user_id in clicks_db.index.unique():
            user_profiles[user_id] = self.build_users_profile(user_id)
        return user_profiles

    def article_selection(self, user_id, top_n=50, verbose=False):
        user_profile = self.build_users_profile(user_id)
        cosine_similarities = cosine_similarity(user_profile, self.article_features)
        selected_indices = cosine_similarities.argsort().flatten()
        selection = sorted([(i, cosine_similarities[0,i]) for i in selected_indices if i not in user_profile.index.tolist()], key=lambda x: -x[1])        
        if verbose : 
            display(pd.DataFrame(selection))
        return [i[0] for i in selection[top_n:]]

    def display_selection_meta(self, user_selection, meta):
        return meta[meta.loc[:,'article_id'].isin(user_selection)]

Instanciation du recommender : 

In [35]:
reduced_features = article_features.iloc[:,:251]
cbf_recommender = CBF_Recommender(initial_clicks_db=train_df, user_indexed_clicks_db=user_indexed_train_df, article_features=reduced_features)

In [37]:
random_user = random.sample(train_df.user_id.unique().tolist(), 1)
delete_list = cbf_recommender.article_selection(random_user[0], top_n=50, verbose=True)

Unnamed: 0,0,1
0,337743,0.921630
1,337743,0.921630
2,337743,0.921630
3,336538,0.918965
4,336538,0.918965
...,...,...
1092127,37644,-0.273018
1092128,37644,-0.273018
1092129,79085,-0.310578
1092130,79085,-0.310578


Evaluation sur le jeu de test : 

In [39]:

model_evaluator = ModelEvaluation(clicks_db, user_indexed_train_df, user_indexed_test_df, reduced_features)

In [40]:
content_based_scores = model_evaluator.evaluate_model(cbf_recommender, 50)

now processing user 49 of 50


Score du recommender sur tous utilisateurs : 

In [40]:
content_based_scores[0]

{'modelName': 'Content-based filtering',
 'recall@5': 0.05040075202849792,
 'recall@10': 0.05040075202849792}

Score du recommender sur données seuillées à 3 interactions :  

In [41]:
content_based_scores[0]

{'modelName': 'Content-based filtering',
 'recall@5': 0.07482993197278912,
 'recall@10': 0.08843537414965986}

Score du recommender sur données seuillées à 5 interactions :  

In [144]:
model_evaluator.evaluate_model(cbf_recommender, 50)

({'modelName': 'Content-based filtering',
  'recall@5': 0.09375,
  'recall@10': 0.09375},
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
 41             0              0                 8  0.000000   0.000000   
 38             1              1                 8  0.125000   0.125000   
 49             0              0                 6  0.000000   0.000000   
 9              0              0                 6  0.000000   0.000000   
 23             0              0                 6  0.000000   0.000000   
 26             1              1                 5  0.200000   0.200000   
 7              0              0                 5  0.000000   0.000000   
 16             0              0                 5  0.000000   0.000000   
 1              0              0                 4  0.000000   0.000000   
 25             2              2                 4  0.500000   0.500000   
 5              0              0                 4  0.000000   0.000000   
 48       

In [39]:
pd.DataFrame(content_based_scores[1])

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_user_id
59,0,0,180,0.000000,0.000000,5890
2514,0,0,141,0.000000,0.000000,73574
2220,0,0,130,0.000000,0.000000,15867
934,2,2,112,0.017857,0.017857,15275
474,1,1,102,0.009804,0.009804,2151
...,...,...,...,...,...,...
43680,1,1,1,1.000000,1.000000,1447
43679,1,1,1,1.000000,1.000000,52266
43677,0,0,1,0.000000,0.000000,16885
43676,0,0,1,0.000000,0.000000,13178


Sauvegarde et lecture des prédictions à partir d'un fichier texte : 

In [68]:
import json

def save_user_prediction(user_id, top_n, out_dir):
    user_recs = cbf_recommender.article_selection(user_id, top_n=top_n)
    with open (f'{out_dir+"/"}user_{user_id}_predictions.txt', 'w') as outfile:
        json.dump(user_recs.tolist(), outfile)

save_user_prediction(15, 100, 'C:/users/Lewin/Downloads/OC/Projet_9/predictions')

In [79]:
def read_user_predictions(user_id, range_limits, files_dir):
    filepath = f'{files_dir+"/"}user_{user_id}_predictions.txt'
    with open (f'{files_dir+"/"}user_{user_id}_predictions.txt', 'r') as pred_file:
        pred_list = json.load(pred_file)
    return pred_list[range_limits[0]:range_limits[1]]

In [81]:
read_user_predictions(15, (0,5), 'C:/users/Lewin/Downloads/OC/Projet_9/predictions')

[96202, 96428, 96785, 96312, 97001]

Préparation des recommandations des 5000 premiers utilisateurs pour le démonstrateur : 

In [94]:
idxrange = train_df.user_id.unique()[0:5000]
out_dir = 'C:/users/Lewin/Downloads/OC/Projet_9/predictions'
for i, user in enumerate(idxrange):
    user_recs = cbf_recommender.article_selection(user, top_n=100)
    with open (f'{out_dir+"/"}user_{i}_predictions.txt', 'w') as outfile:
        json.dump(user_recs.tolist(), outfile)
    clear_output(wait=True)
    print(f'{i+1} users of {len(idxrange)} processed')  

5000 users of 5000 processed


<br><a id='partie_4'></a><br>
## IV - Evaluation du collaborative filtering : 

Classe CF_recommender : 

In [49]:
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np


class CF_Recommender():
    
    def __init__(self, clicks_data):
        self.model_name = 'Collaborative filtering'
        self.reader = Reader()
        self.clicks_data = clicks_data
        self.dup_list = self.clicks_data.duplicated(subset=['user_id','click_article_id'])
        self.cf_clicks_db = None
        self.data = None
        self.trainset = []
        self.algo = SVD()
        
    def cf_user_weights(self):
        clicks_db=self.clicks_data
        clicks_db.loc[:,'cf_weight'] = 1
#         clicks_db[self.dup_list]['cf_weight'] += 2
        clicks_db.cf_weight[self.dup_list] = clicks_db[self.dup_list].loc[:,'cf_weight'].apply(lambda x : x + 2)
#         clicks_db.cf_weight[clicks_db['click_country'].isin(clicks_db.loc[[user_id], 'click_country'])] = clicks_db.cf_weight[
#             clicks_db['click_country'].isin(clicks_db.loc[[user_id], 'click_country'])].apply(lambda x : x + 1)
#         clicks_db.cf_weight[clicks_db['click_region'].isin(clicks_db.loc[[user_id], 'click_region'])] = clicks_db.cf_weight[
#             clicks_db['click_region'].isin(clicks_db.loc[[user_id], 'click_region'])].apply(lambda x : x + 1)
        return clicks_db
    
    def fit_evaluate(self):
        self.cf_clicks_db = self.cf_user_weights(self.clicks_data)
        # merged = self.cf_clicks_db.merge(pd.DataFrame(self.dup_list, columns=['duplicate']), how = 'left', left_index=True, right_index=True)
        self.cf_clicks_db = self.cf_clicks_db.groupby(['user_id', 'click_article_id'])['cf_weight'].sum().reset_index()
        self.data = Dataset.load_from_df(self.cf_clicks_db, self.reader)
        self.trainset = self.data.build_full_trainset()
        self.algo.fit(self.trainset)     
        return self.algo

    def predict_for_user(self, user_id):
        predictions={}
        article_ids = []
        already_consumed = self.clicks_data[self.clicks_data['user_id']==user_id].loc[:,'click_article_id']
        known_users = [self.trainset.to_raw_uid(i) for i in self.trainset.all_users()]
        if user_id in known_users:
            for i in self.trainset.all_items():
                if self.trainset.to_raw_iid(i) not in already_consumed.values:
                    article_ids.append(self.trainset.to_raw_iid(i))
                    predictions[self.trainset.to_raw_iid(i)] = self.algo.predict(user_id, self.trainset.to_raw_iid(i), verbose=False)
        else:
            # use popularity filtering
            print('Unknown_ID, can not make prediction')
        return predictions
    
    def article_selection(self, user_id, top_n=20):
        hits = []
        predictions = self.predict_for_user(user_id)
        for i in predictions.keys():
            hits.append((predictions[i].iid, predictions[i].est))   
        hits.sort(key=lambda x: x[1], reverse=True)
        selection = [hits[i][0] for i in range(len(hits))]
        return selection[:top_n]
        

In [50]:
# from Collaborative_filtering_recommender import CF_Recommender
cf_recommender = CF_Recommender(train_df)

- Sans filtrage : 

In [51]:
 _ , rmse = cf_recommender.fit_evaluate()

RMSE: 0.3781


- Evaluation sur 50 utilisateurs, données filtrées à 3 interactions minimum : 

In [55]:
cf_model_evaluator = ModelEvaluation(clicks_db, user_indexed_train_df, user_indexed_test_df, article_features)

In [56]:
cf_scores = cf_model_evaluator.evaluate_model(cf_recommender, 50)

now processing user 49 of 50


In [57]:
cf_scores

({'modelName': 'Collaborative filtering', 'recall@5': 0.0, 'recall@10': 0.0},
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
 36             0              0                39       0.0        0.0   
 27             0              0                39       0.0        0.0   
 45             0              0                33       0.0        0.0   
 0              0              0                31       0.0        0.0   
 43             0              0                28       0.0        0.0   
 10             0              0                27       0.0        0.0   
 13             0              0                27       0.0        0.0   
 24             0              0                24       0.0        0.0   
 34             0              0                23       0.0        0.0   
 47             0              0                21       0.0        0.0   
 21             0              0                21       0.0        0.0   
 1              0     

- Sans filtrage des données : 

In [80]:
cf_scores

({'modelName': 'Collaborative filtering', 'recall@5': 0.0, 'recall@10': 0.0},
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
 9              0              0                51       0.0        0.0   
 19             0              0                32       0.0        0.0   
 13             0              0                32       0.0        0.0   
 1              0              0                30       0.0        0.0   
 32             0              0                28       0.0        0.0   
 28             0              0                27       0.0        0.0   
 49             0              0                26       0.0        0.0   
 11             0              0                24       0.0        0.0   
 31             0              0                23       0.0        0.0   
 27             0              0                22       0.0        0.0   
 20             0              0                21       0.0        0.0   
 29             0     

- Avec filtrage à 5 interactions minimum : 

In [152]:
cf_recommender.fit_evaluate()

RMSE: 0.4863


(<surprise.prediction_algorithms.matrix_factorization.SVD at 0x182b2faf8d0>,
 0.48633107645281565)

In [153]:
cf_model_evaluator = A_ModelEvaluation(clicks_db, user_indexed_train_df, user_indexed_test_df, article_features)

In [154]:
cf_scores = cf_model_evaluator.evaluate_model(cf_recommender, 50)

now processing user 49 of 50


In [155]:
cf_scores

({'modelName': 'Collaborative filtering', 'recall@5': 0.0, 'recall@10': 0.0},
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
 32             0              0                10       0.0        0.0   
 26             0              0                 8       0.0        0.0   
 5              0              0                 6       0.0        0.0   
 27             0              0                 5       0.0        0.0   
 6              0              0                 5       0.0        0.0   
 15             0              0                 5       0.0        0.0   
 36             0              0                 5       0.0        0.0   
 1              0              0                 4       0.0        0.0   
 21             0              0                 4       0.0        0.0   
 38             0              0                 4       0.0        0.0   
 18             0              0                 4       0.0        0.0   
 16             0     

<br><a id='partie_5'></a><br>
La méthode SVD ne permet pas d'obtenir un recall mesurable à 5 ou 10 propositions. L'indice d'affinité construit par simple interaction est peut être simplement non pertinent pour notre problème - ou alors les articles consommés dans le testset ne sont pas les plus pertinents au vu de nos données d'apprentissage...

## V - Popularity rating : 

In [58]:
from Popularity_based_recommender import PopularityRecommender

In [80]:
class PopularityRecommender:
    
    def __init__(self, clicks_df, items_df=None):
        self.model_name = 'Popularity'
        self.clicks_df = clicks_df
        self.popularity_df = clicks_df.groupby('click_article_id')['click_country'].size().sort_values(ascending=False).reset_index()
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def article_selection(self, user_id, top_n=10):
        already_consumed = self.clicks_df[self.clicks_df.loc[:,'user_id'] == user_id]
        already_consumed = already_consumed.loc[:,'click_article_id']
        recommendations_df = self.popularity_df[~self.popularity_df['click_article_id'].isin(already_consumed)].head(top_n)
        return recommendations_df.click_article_id.tolist()

In [None]:
# train_data, test_data = train_test_split(initial_clicks_db,
#                                    stratify=lo_initial_clicks_db['user_id'], 
#                                    test_size=0.5,
#                                    random_state=42)
# user_indexed_train_data = train_data.set_index('user_id')
# user_indexed_test_data = test_data.set_index('user_id')

In [92]:
pbr = PopularityRecommender(train_df, article_features)

In [94]:
pbr_model_evaluator = ModelEvaluation(clicks_db, user_indexed_train_df, user_indexed_test_df, article_features)

In [96]:
pbr_scores = pbr_model_evaluator.evaluate_model(pbr, 5000)

now processing user 4999 of 5000


In [97]:
pbr_scores[0]

{'modelName': 'Popularity',
 'recall@5': 0.021988089784699953,
 'recall@10': 0.021988089784699953}

In [98]:
pd.DataFrame(pbr_scores[1])

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_user_id
2690,0,0,518,0.000000,0.000000,4568
2085,3,3,200,0.015000,0.015000,6344
1364,1,1,196,0.005102,0.005102,376
726,4,4,167,0.023952,0.023952,17402
934,5,5,164,0.030488,0.030488,4343
...,...,...,...,...,...,...
270,0,0,3,0.000000,0.000000,286908
1086,0,0,3,0.000000,0.000000,179233
3240,0,0,3,0.000000,0.000000,274168
137,0,0,2,0.000000,0.000000,42775


La méthode génère quelques hits mais son score reste faible.
#### Quelques essais en plus - d'abord sur les données résiduelles : 

In [81]:
from sklearn.model_selection import train_test_split
# Filter only users with at most {threshold} interactions : 
threshold = 3
vc = initial_clicks_db.user_id.value_counts()
vc = vc[vc<=threshold]
ids_with_at_most_n_clicks = vc.index.tolist()
lo_initial_clicks_db = initial_clicks_db[initial_clicks_db.user_id.isin(ids_with_at_most_n_clicks)]
indexed_clicks = lo_initial_clicks_db.set_index('user_id').sort_index(ascending=True)
leftover_dup_list = lo_initial_clicks_db.duplicated(subset=['user_id','click_article_id'])
# Split data into train and test sets : 
train_data, test_data = train_test_split(lo_initial_clicks_db,
                                   stratify=lo_initial_clicks_db['user_id'], 
                                   test_size=0.5,
                                   random_state=42)
user_indexed_train_data = train_data.set_index('user_id')
user_indexed_test_data = test_data.set_index('user_id')

In [88]:
pbr = PopularityRecommender(train_data, article_features)

In [89]:
pbr_model_evaluator = A_ModelEvaluation(indexed_clicks, user_indexed_train_data, user_indexed_test_data, article_features)

In [90]:
pbr_scores = pbr_model_evaluator.evaluate_model(pbr)

now processing user 124087 of 124088


In [91]:
pbr_scores[0]

{'modelName': 'Popularity',
 'recall@5': 0.17626903365874552,
 'recall@10': 0.17626903365874552}

#### En seuillant les filtrage à 5 interactions max : 

In [99]:

# Filter only users with at most {threshold} interactions : 
threshold = 5
vc = initial_clicks_db.user_id.value_counts()
vc = vc[vc<=threshold]
ids_with_less_than_n_clicks = vc.index.tolist()
lo_initial_clicks_db = initial_clicks_db[initial_clicks_db.user_id.isin(ids_with_less_than_n_clicks)]
indexed_clicks = lo_initial_clicks_db.set_index('user_id').sort_index(ascending=True)
leftover_dup_list = lo_initial_clicks_db.duplicated(subset=['user_id','click_article_id'])
# Split data into train and test sets : 
train_data, test_data = train_test_split(lo_initial_clicks_db,
                                   stratify=lo_initial_clicks_db['user_id'], 
                                   test_size=0.5,
                                   random_state=42)
user_indexed_train_data = train_data.set_index('user_id')
user_indexed_test_data = test_data.set_index('user_id')

In [100]:
pbr = PopularityRecommender(train_data, article_features)

In [101]:
pbr_model_evaluator = A_ModelEvaluation(indexed_clicks, user_indexed_train_data, user_indexed_test_data, article_features)

In [103]:
pbr_scores = pbr_model_evaluator.evaluate_model(pbr, 5000)

now processing user 4999 of 5000


In [104]:
pbr_scores[0]

{'modelName': 'Popularity',
 'recall@5': 0.16718179263838157,
 'recall@10': 0.16718179263838157}

#### Sans filtrer les données : 

In [107]:

# Filter only users with at most {threshold} interactions : 
threshold = 200000
vc = initial_clicks_db.user_id.value_counts()
vc = vc[vc<=threshold]
ids_with_less_than_n_clicks = vc.index.tolist()
lo_initial_clicks_db = initial_clicks_db[initial_clicks_db.user_id.isin(ids_with_less_than_n_clicks)]
indexed_clicks = lo_initial_clicks_db.set_index('user_id').sort_index(ascending=True)
leftover_dup_list = lo_initial_clicks_db.duplicated(subset=['user_id','click_article_id'])
# Split data into train and test sets : 
train_data, test_data = train_test_split(lo_initial_clicks_db,
                                   stratify=lo_initial_clicks_db['user_id'], 
                                   test_size=0.2,
                                   random_state=42)
user_indexed_train_data = train_data.set_index('user_id')
user_indexed_test_data = test_data.set_index('user_id')

In [108]:
pbr = PopularityRecommender(train_data, article_features)

In [109]:
pbr_model_evaluator = A_ModelEvaluation(indexed_clicks, user_indexed_train_data, user_indexed_test_data, article_features)

In [110]:
pbr_scores = pbr_model_evaluator.evaluate_model(pbr, 5000)

now processing user 4999 of 5000


In [111]:
pbr_scores[0]

{'modelName': 'Popularity',
 'recall@5': 0.14110566205974143,
 'recall@10': 0.14110566205974143}

In [105]:
pbr_scores[1]

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_user_id
4022,1,1,3,0.333333,0.333333,136775
3453,0,0,3,0.000000,0.000000,99373
4332,1,1,3,0.333333,0.333333,268090
1183,1,1,3,0.333333,0.333333,186626
1185,0,0,3,0.000000,0.000000,21013
...,...,...,...,...,...,...
2679,0,0,1,0.000000,0.000000,222363
2680,0,0,1,0.000000,0.000000,185956
2682,0,0,1,0.000000,0.000000,248535
2683,0,0,1,0.000000,0.000000,143261


Le filtrage par popularité obtient de bons scores pour les utilisateurs ayant peu d'interactions mais des scores faibles sur les utilisateurs ayant des interactions plus nombreuses - Il serait intéressant de définir un système hybride si ce modèle donne de bons résultats sur le grand nombre d'utilisateurs ayant eu moins de 3 ou 5 interactions. 