In [127]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk import edit_distance as ed
from scipy.spatial.distance import cosine
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import FastText
import torch
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [128]:
articles = pd.read_csv("shared_articles.csv", delimiter= ',')
interactions = pd.read_csv("users_interactions.csv", delimiter= ',')

articles.head(10)
interactions.head(10)



Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [129]:
#1
#Pre-processing, labeling event ratings

def pre_process(data, exists=True):
    if exists:
        data = pd.read_csv ('df.csv', delimiter= ',')
        return(data)
    else: 
        data['eventRating'] = ''

        for session in data['sessionId']:
            df1= data[interactions['sessionId']==session]
            for content in df1['contentId']:
                df = df1[df1['contentId']==content]
                if df.empty:
                    continue
                idx = df.index.values.tolist()
                if (df['eventType'].str.contains("LIKE")).any():
                    data.loc[idx, 'eventRating'] = 5.0
                elif ((df['eventType'].str.contains("FOLLOW")).any() & (df['eventType'].str.contains("BOOKMARK")).any()):
                    data.loc[idx, 'eventRating'] = 4.0
                elif ((df['eventType'].str.contains("FOLLOW")).any() | (df['eventType'].str.contains("BOOKMARK")).any() & ~((df['eventType'].str.contains("FOLLOW")).any() & (df['eventType'].str.contains("BOOKMARK")).any())):
                    data.loc[idx, 'eventRating'] = 3.0
                elif ((df['eventType'].str.contains("COMMENT CREATED")).any() & ~((df['eventType'].str.contains("BOOKMARK")).any() | (df['eventType'].str.contains("FOLLOW")).any() | (df['eventType'].str.contains("LIKE")).any())):
                    data.loc[idx, 'eventRating'] = 2.0
                elif ((df['eventType'].str.contains("VIEW")).any() & ~((df['eventType'].str.contains("COMMENT CREATED")).any() | (df['eventType'].str.contains("BOOKMARK")).any() | (df['eventType'].str.contains("FOLLOW")).any() | (df['eventType'].str.contains("LIKE")).any())):
                    data.loc[idx,'eventRating'] = 1.0
        interactions.to_csv('df.csv')            
        return(data)                       

#set to False if eventRatings has not been created yet
interactions = pre_process(interactions, os.path.isfile("df.csv"))
print(interactions.head(1))
interactions["eventRating"].head(10)
np.mean(interactions["eventRating"])

   Unnamed: 0   timestamp eventType            contentId             personId  \
0           0  1465413032      VIEW -3499919498720038879 -8845298781299428018   

             sessionId userAgent userRegion userCountry  eventRating  
0  1264196770339959068       NaN        NaN         NaN          1.0  


2.073971125124461

In [130]:
#1

user_interactions_at_least = 3
article_interactions_at_least = 2

#Selecting articles and users with enough interactions

users_interactions_count = interactions.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('# users: %d' % len(users_interactions_count))
users_with_enough_interactions_df = users_interactions_count[users_interactions_count >= user_interactions_at_least].reset_index()[['personId']]

article_interactions_count = interactions.groupby(['personId', 'contentId']).size().groupby('contentId').size()
articles_with_enough_interactions_df = article_interactions_count[article_interactions_count >= article_interactions_at_least].reset_index()[['contentId']]

print("articles with at least", article_interactions_at_least, "interactions:", len(articles_with_enough_interactions_df))
print("users with at least", user_interactions_at_least, "interactions:", len(users_with_enough_interactions_df))
pre_processed_data = interactions[(interactions.index.isin(users_with_enough_interactions_df.index)) & interactions.index.isin(articles_with_enough_interactions_df.index)]
print(len(pre_processed_data))

# users: 1895
articles with at least 2 interactions: 2744
users with at least 3 interactions: 1400
1400


In [131]:

print('# of interactions: %d' % len(interactions))
interactions_from_selected_users_df = interactions.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from users with at least 3 interactions: %d' % len(interactions_from_selected_users_df))

interactions_full_df = interactions_from_selected_users_df.groupby(['personId', 'contentId'])['eventRating'].sum().reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

articles_full =  articles.merge(articles_with_enough_interactions_df, 
               how = 'right',
               left_on = 'contentId',
               right_on = 'contentId')
print('# of interactions from users with at least 3 interactions and articles with at least 2 interactions: %d' % len(articles_full))

# of interactions: 72312
# of interactions from users with at least 3 interactions: 71244
# of unique user/item interactions: 39995
# of interactions from users with at least 3 interactions and articles with at least 2 interactions: 2760


In [132]:
#Indexing by personId
pre_processed_data_index = interactions_full_df.set_index('personId')
train_set, test_set = train_test_split(interactions_full_df,  stratify=interactions_full_df['personId'], train_size=0.80)
print(test_set["eventRating"])
train_set_index = train_set.set_index('personId')
test_set_index = test_set.set_index('personId')

print('# interactions on Train set: %d' % len(train_set))
print('# interactions on Test set: %d' % len(test_set))

37513    1.0
23752    1.0
20616    1.0
14249    2.0
723      2.0
        ... 
18599    2.0
8993     1.0
28091    3.0
38568    4.0
29632    1.0
Name: eventRating, Length: 7999, dtype: float64
# interactions on Train set: 31996
# interactions on Test set: 7999


In [133]:
#Computes the most popular items
item_popularity = pre_processed_data.groupby('contentId')['eventRating'].sum().sort_values(ascending=False).reset_index()
item_popularity.head(10)

Unnamed: 0,contentId,eventRating
0,1738052593226421681,138.0
1,1854874463930846880,101.0
2,2285214528595997209,97.0
3,310515487419366995,95.0
4,8657408509986329668,69.0
5,880612740433495828,62.0
6,-4205346868684833897,59.0
7,-5148591903395022444,57.0
8,6850500272809381909,45.0
9,1862503310075246782,45.0


In [134]:
#2
class PopularityRecommender:
    MODEL_NAME = 'Popularity'
        
    def __init__(self, popularity_df, items_df=None):
            self.popularity_df = popularity_df
            self.items_df = items_df
            
    def get_model_name(self):
            return self.MODEL_NAME
            
    def recommend_items(self, user_id, items_to_ignore=[], topn=101, verbose=False):
            # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)].sort_values('eventRating', ascending = False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                            left_on = 'contentId', 
                                                            right_on = 'contentId')[['eventRating', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity, articles_full)

In [135]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])


In [136]:
import random

EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=12):
        interacted_items = get_items_interacted(person_id, pre_processed_data_index)
        all_items = set(articles_full['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(list(non_interacted_items), sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = test_set_index.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, train_set_index), topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(test_set_index.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [137]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
1399 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.07975996999624953, 'recall@10': 0.18014751843980498}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@101,_person_id
108,5,20,192,0.026042,0.104167,3609194402293569455
45,22,39,134,0.164179,0.291045,-2626634673110551643
10,28,41,130,0.215385,0.315385,-1032019229384696495
150,7,23,117,0.059829,0.196581,-1443636648652872475
31,2,6,87,0.022989,0.068966,-2979881261169775358
26,2,13,80,0.025,0.1625,-3596626804281480007
223,4,9,73,0.054795,0.123288,1116121227607581999
52,6,16,69,0.086957,0.231884,692689608292948411
59,1,11,68,0.014706,0.161765,-9016528795238256703
95,7,19,68,0.102941,0.279412,3636910968448833585
