In [262]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import random
import os

In [263]:
articles = pd.read_csv("shared_articles.csv", delimiter= ',')
interactions = pd.read_csv("users_interactions.csv", delimiter= ',')

articles.head(10)
interactions.head(10)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [264]:
#1
#Pre-processing, labeling event ratings

#function that creates eventRatings
def rank_events(data, exists=True):
    if exists:
        data = pd.read_csv ('df.csv', delimiter= ',')
        return(data)
    else: 
        data['eventRating'] = ''

        for session in data['sessionId']:
            df1= data[data['sessionId']==session]
            for content in df1['contentId']:
                df = df1[df1['contentId']==content]
                if df.empty:
                    continue
                idx = df.index.values.tolist()
                if (df['eventType'].str.contains("LIKE")).any():
                    data.loc[idx, 'eventRating'] = 5.0
                elif ((df['eventType'].str.contains("FOLLOW")).any() & (df['eventType'].str.contains("BOOKMARK")).any()):
                    data.loc[idx, 'eventRating'] = 4.0
                elif ((df['eventType'].str.contains("FOLLOW")).any() | (df['eventType'].str.contains("BOOKMARK")).any() & ~((df['eventType'].str.contains("FOLLOW")).any() & (df['eventType'].str.contains("BOOKMARK")).any())):
                    data.loc[idx, 'eventRating'] = 3.0
                elif ((df['eventType'].str.contains("COMMENT CREATED")).any() & ~((df['eventType'].str.contains("BOOKMARK")).any() | (df['eventType'].str.contains("FOLLOW")).any() | (df['eventType'].str.contains("LIKE")).any())):
                    data.loc[idx, 'eventRating'] = 2.0
                elif ((df['eventType'].str.contains("VIEW")).any() & ~((df['eventType'].str.contains("COMMENT CREATED")).any() | (df['eventType'].str.contains("BOOKMARK")).any() | (df['eventType'].str.contains("FOLLOW")).any() | (df['eventType'].str.contains("LIKE")).any())):
                    data.loc[idx,'eventRating'] = 1.0
        data.to_csv('df.csv')            
        return(data)                       

#set to False if eventRatings has not been created yet
interactions = rank_events(interactions, os.path.isfile("df.csv"))
interactions["eventRating"].head(10)

0    1.0
1    1.0
2    5.0
3    5.0
4    1.0
5    5.0
6    1.0
7    1.0
8    5.0
9    1.0
Name: eventRating, dtype: float64

In [265]:
#1

#function that filters dataset according to wanted number of user and article interactions and computes popularity ranking of given set 
def filter_df(data, user_interactions_at_least, article_interactions_at_least, unique=True):

    #Selecting articles and users with enough interactions
    users_interactions_count = data.groupby(['personId', 'contentId']).size().groupby('personId').size()
    print('number of users:', len(users_interactions_count))
    users_with_enough_interactions_df = users_interactions_count[users_interactions_count >= user_interactions_at_least].reset_index()[['personId']]

    article_interactions_count = data.groupby(['personId', 'contentId']).size().groupby('contentId').size()
    articles_with_enough_interactions_df = article_interactions_count[article_interactions_count >= article_interactions_at_least].reset_index()[['contentId']]

    print("articles with at least", article_interactions_at_least, "interactions:", len(articles_with_enough_interactions_df))
    print("users with at least", user_interactions_at_least, "interactions:", len(users_with_enough_interactions_df))
    print('number of interactions:', len(data))
    interactions_from_selected_users_df = data.merge(users_with_enough_interactions_df, 
                how = 'right',
                left_on = 'personId',
                right_on = 'personId')
    print('number of interactions from users with at least 3 interactions:', len(interactions_from_selected_users_df))


    interactions_from_selected_users_df = interactions_from_selected_users_df.merge(articles_with_enough_interactions_df, 
                how = 'right',
                left_on = 'contentId',
                right_on = 'contentId')
    print("number of interactions users with at least 3 interactions and with articles of at least 2 interactions", len(interactions_from_selected_users_df))


    #select unique item/user interactions
    if unique:
        interactions_filtered = interactions_from_selected_users_df.groupby(['personId', 'contentId'])['eventRating'].sum().reset_index() #-->0.32
    else:
        interactions_filtered = interactions_from_selected_users_df #--> 0.13

    print('number of of unique user/item interactions:', len(interactions_filtered))

    #compute item popularity ranking
    item_popularity = interactions_selected.groupby('contentId')['eventRating'].sum().sort_values(ascending=False).reset_index()

    return(interactions_filtered, item_popularity)

interactions_selected, item_popularity = filter_df(data=interactions, user_interactions_at_least=3, article_interactions_at_least=2, unique=True)
interactions_selected.head(10)        

number of users: 1895
articles with at least 2 interactions: 2744
users with at least 3 interactions: 1400
number of interactions: 72312
number of interactions from users with at least 3 interactions: 71244
number of interactions users with at least 3 interactions and with articles of at least 2 interactions 70849
number of of unique user/item interactions: 39754


Unnamed: 0,personId,contentId,eventRating
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,8.0
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,2.0


In [266]:
#indexing by personId
pre_processed_data_index = interactions_selected.set_index('personId')

#train-test split
train_set, test_set = train_test_split(interactions_selected,  stratify=interactions_selected['personId'], train_size=0.80)
train_set_index = train_set.set_index('personId')
test_set_index = test_set.set_index('personId')

print('Train set size:', len(train_set))
print('Test set size:', len(test_set))

Train set size: 31803
Test set size: 7951


In [267]:
#2
class Item_based:
    
    
    MODEL_NAME = 'Item-based model'
        
    def __init__(self, popularity, items=None):
            self.item_popularity = popularity
            self.items = items
            
    def get_model_name(self):
            return self.MODEL_NAME
            
    def recommend_items(self, user_id, items_to_ignore=[], topn=101, verbose=False):
        recommendations = self.item_popularity[~self.item_popularity['contentId'].isin(items_to_ignore)].sort_values('eventRating', ascending = False).head(topn)

        if verbose:
            if self.items is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations = recommendations.merge(self.items, how = 'left', left_on = 'contentId', right_on = 'contentId')[['eventRating', 'contentId', 'title', 'url', 'lang']]


        return recommendations


item_based_model = Item_based(item_popularity, articles)
item_popularity.head(10)

Unnamed: 0,contentId,eventRating
0,-4029704725707465084,984.0
1,-2358756719610361882,673.0
2,2857117417189640073,665.0
3,-1633984990770981161,655.0
4,8224860111193157980,650.0
5,-6783772548752091658,644.0
6,1854874463930846880,615.0
7,-8208801367848627943,609.0
8,-1297580205670251233,606.0
9,-6843047699859121724,605.0


In [301]:
class Recall_n:

    def get_items_interacted(self, person_id, interactions_df):
        # Get the user's data and merge in the movie information.
        interacted_items = interactions_df.loc[person_id]['contentId']
        return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

    def get_not_interacted_items_sample(self, person_id, sample_size, seed=12):
        interacted_items = self.get_items_interacted(person_id, pre_processed_data_index)
        all_items = set(articles['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(list(non_interacted_items), sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = test_set_index.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=self.get_items_interacted(person_id, train_set_index), topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=101, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(test_set_index.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
recall_n_evaluator = Recall_n() 

In [295]:
class NDCG:

    def dcg_at_k(self, r, k, method=0):
        """Score is discounted cumulative gain (dcg)
        Relevance is positive real values.  Can use binary
        as the previous methods.
        Example from
        http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
        >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
        >>> dcg_at_k(r, 1)
        3.0
        >>> dcg_at_k(r, 1, method=1)
        3.0
        >>> dcg_at_k(r, 2)
        5.0
        >>> dcg_at_k(r, 2, method=1)
        4.2618595071429155
        >>> dcg_at_k(r, 10)
        9.6051177391888114
        >>> dcg_at_k(r, 11)
        9.6051177391888114
        Args:
            r: Relevance scores (list or numpy) in rank order
                (first element is the first item)
            k: Number of results to consider
            method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                    If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
        Returns:
            Discounted cumulative gain
        """
        r = np.asfarray(r)[:k]
        if r.size:
            if method == 0:
                return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
            elif method == 1:
                return np.sum(r / np.log2(np.arange(2, r.size + 2)))
            else:
                raise ValueError('method must be 0 or 1.')
        return 0.


    def ndcg_at_k(self, r, k, method=0):
        """Score is normalized discounted cumulative gain (ndcg)
        Relevance is positive real values.  Can use binary
        as the previous methods.
        Example from
        http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
        >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
        >>> ndcg_at_k(r, 1)
        1.0
        >>> r = [2, 1, 2, 0]
        >>> ndcg_at_k(r, 4)
        0.9203032077642922
        >>> ndcg_at_k(r, 4, method=1)
        0.96519546960144276
        >>> ndcg_at_k([0], 1)
        0.0
        >>> ndcg_at_k([1], 2)
        1.0
        Args:
            r: Relevance scores (list or numpy) in rank order
                (first element is the first item)
            k: Number of results to consider
            method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                    If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
        Returns:
            Normalized discounted cumulative gain
        """
        dcg_max = self.dcg_at_k(sorted(r, reverse=True), k, method)
        if not dcg_max:
            return 0.
        return self.dcg_at_k(r, k, method) / dcg_max
    
    def get_items_interacted(self, person_id, interactions_df):
        # Get the user's data and merge in the movie information.
        interacted_items = interactions_df.loc[person_id]['contentId']
        return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

    def evaluate_model_for_user(self, model, person_id, k):
        #Getting the items in test set
        interacted_values_testset = test_set_index.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user

        personal_recommendations = model.recommend_items(person_id, items_to_ignore=self.get_items_interacted(person_id, train_set_index), topn=10000)
        ndcg = self.ndcg_at_k(personal_recommendations['eventRating'], k)

        random.seed(12)
        personal_recommendations_rand = random.sample(list(interactions_selected['eventRating']), 1000)
        ndcg_rand = self.ndcg_at_k(personal_recommendations_rand, k)

        person_metrics = {'ndcg':ndcg,
                          'ndcg_rand': ndcg_rand,
                          'comp_rand': (ndcg-ndcg_rand),
                          'interacted_count': interacted_items_count_testset}
        return person_metrics

    def evaluate_model(self, model, k):
            #print('Running evaluation for users')
            people_metrics = []
            for idx, person_id in enumerate(list(test_set_index.index.unique().values)):
                person_metrics = self.evaluate_model_for_user(model, person_id, k)  
                person_metrics['_person_id'] = person_id
                people_metrics.append(person_metrics)
            print('%d users processed' % idx)

            detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)
            ndcg = detailed_results_df['ndcg'].sum() / len(detailed_results_df)
            ndcg_comp = detailed_results_df['comp_rand'].sum() / len(detailed_results_df)
            global_metrics = {'modelName': model.get_model_name(),
                            'ndcg': ndcg,
                            'ndcg_comp': ndcg_comp}    
            return global_metrics, detailed_results_df    

recall_ndcg_evaluator = NDCG() 

In [300]:
#Evaluating Item-based model

print('Item-based recommendation model: Recall_N:')
pop_global_metrics, pop_detailed_results_df = recall_n_evaluator.evaluate_model(item_based_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Item-based recommendation model: Recall_N:
1333 users processed

Global metrics:
{'modelName': 'Item-based model', 'recall@5': 0.20953339202616023, 'recall@10': 0.9437806565211923}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
22,22,190,190,0.115789,1.0,3609194402293569455
27,10,133,133,0.075188,1.0,-2626634673110551643
13,10,127,127,0.07874,1.0,-1032019229384696495
32,9,115,115,0.078261,1.0,-1443636648652872475
189,11,86,86,0.127907,1.0,-2979881261169775358
196,10,80,80,0.125,1.0,-3596626804281480007
0,13,73,73,0.178082,1.0,1116121227607581999
70,17,69,69,0.246377,1.0,692689608292948411
191,9,68,68,0.132353,1.0,3636910968448833585
246,14,67,67,0.208955,1.0,-9016528795238256703


In [304]:
pop_global_metrics, pop_detailed_results_df = recall_ndcg_evaluator.evaluate_model(item_based_model, k=101)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

1333 users processed

Global metrics:
{'modelName': 'Item-based model', 'ndcg': 1.0, 'ndcg_comp': 0.8626244060016229}


Unnamed: 0,ndcg,ndcg_rand,comp_rand,interacted_count,_person_id
22,1.0,0.137376,0.862624,190,3609194402293569455
27,1.0,0.137376,0.862624,133,-2626634673110551643
13,1.0,0.137376,0.862624,127,-1032019229384696495
32,1.0,0.137376,0.862624,115,-1443636648652872475
189,1.0,0.137376,0.862624,86,-2979881261169775358
196,1.0,0.137376,0.862624,80,-3596626804281480007
0,1.0,0.137376,0.862624,73,1116121227607581999
70,1.0,0.137376,0.862624,69,692689608292948411
191,1.0,0.137376,0.862624,68,3636910968448833585
246,1.0,0.137376,0.862624,67,-9016528795238256703
