In [25]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [26]:
articles_df = pd.read_csv("shared_articles.csv")
interactions_df = pd.read_csv('users_interactions.csv')

In [27]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3122 entries, 0 to 3121
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   timestamp        3122 non-null   int64 
 1   eventType        3122 non-null   object
 2   contentId        3122 non-null   int64 
 3   authorPersonId   3122 non-null   int64 
 4   authorSessionId  3122 non-null   int64 
 5   authorUserAgent  680 non-null    object
 6   authorRegion     680 non-null    object
 7   authorCountry    680 non-null    object
 8   contentType      3122 non-null   object
 9   url              3122 non-null   object
 10  title            3122 non-null   object
 11  text             3122 non-null   object
 12  lang             3122 non-null   object
dtypes: int64(4), object(9)
memory usage: 317.2+ KB


let`s remove 3 columns with a missing values:

In [28]:
articles_df.drop(['authorUserAgent','authorRegion','authorCountry'], axis=1, inplace=True)

In [29]:
articles_df.head(3)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [30]:
articles_df['eventType'].value_counts()

eventType
CONTENT SHARED     3047
CONTENT REMOVED      75
Name: count, dtype: int64

will keep shared content only and also drop columns 'eventType' since it become redundant

In [31]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']

In [32]:
articles_df.drop('eventType', axis=1, inplace=True)

In [33]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3047 entries, 1 to 3121
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   timestamp        3047 non-null   int64 
 1   contentId        3047 non-null   int64 
 2   authorPersonId   3047 non-null   int64 
 3   authorSessionId  3047 non-null   int64 
 4   contentType      3047 non-null   object
 5   url              3047 non-null   object
 6   title            3047 non-null   object
 7   text             3047 non-null   object
 8   lang             3047 non-null   object
dtypes: int64(4), object(5)
memory usage: 238.0+ KB


let`s look at interactions DataFrame:

In [34]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   timestamp    72312 non-null  int64 
 1   eventType    72312 non-null  object
 2   contentId    72312 non-null  int64 
 3   personId     72312 non-null  int64 
 4   sessionId    72312 non-null  int64 
 5   userAgent    56918 non-null  object
 6   userRegion   56907 non-null  object
 7   userCountry  56918 non-null  object
dtypes: int64(4), object(4)
memory usage: 4.4+ MB


time to merge both dataset. will not include columns like 'userAgent','userRegion','userCountry' 
because of the NaN values.

Doing merging on 'contentId' column

In [35]:
df = pd.merge(interactions_df[['contentId','personId','eventType']], articles_df[['contentId','title']],
             how = 'inner', on = 'contentId')

In [36]:
df

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
...,...,...,...,...
72264,-7108012586837980940,-4028919343899978105,VIEW,The Ultimate Digital Clean-Up Checklist: Are Y...
72265,7526977287801930517,-3643155458357242906,VIEW,Renewing Medium's focus
72266,-282629989972409543,5660542693104786364,VIEW,Santander Brasil gera 21% do resultado da matr...
72267,-6468782714472551646,5660542693104786364,VIEW,"Santander Brasil tem lucro gerencial de R$ 1,9..."


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72269 entries, 0 to 72268
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   contentId  72269 non-null  int64 
 1   personId   72269 non-null  int64 
 2   eventType  72269 non-null  object
 3   title      72269 non-null  object
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


let`s see what type of users interactions with articles we got:

In [38]:
df['eventType'].value_counts()

eventType
VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: count, dtype: int64

let`s give this interactions numeric points. where View only is 1.0 point, and following is 5.0

In [39]:
event_type_strength = {
    'VIEW': 1.0,
    'LIKE':2.0,
    'BOOKMARK':3.0,
    'COMMENT CREATED': 4.0,
    'FOLLOW':5.0,
}

In [40]:
df['eventStrength'] = df['eventType'].apply(lambda x: event_type_strength[x])

let`s look what we have:

In [41]:
df.head(5)

Unnamed: 0,contentId,personId,eventType,title,eventStrength
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem,1.0
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0


at first two raws we could actually see duplicates. We need to check if there is another duplicates and if so we need to remove them

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72269 entries, 0 to 72268
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   contentId      72269 non-null  int64  
 1   personId       72269 non-null  int64  
 2   eventType      72269 non-null  object 
 3   title          72269 non-null  object 
 4   eventStrength  72269 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.8+ MB


In [43]:
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50910 entries, 0 to 72268
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   contentId      50910 non-null  int64  
 1   personId       50910 non-null  int64  
 2   eventType      50910 non-null  object 
 3   title          50910 non-null  object 
 4   eventStrength  50910 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.3+ MB


In [44]:
grouped_df = df.groupby(['personId','contentId','title']).sum().reset_index()

Now let`s see and sum up all interactions in groups for one person id, one content and title:

In [45]:
grouped_df.sample(5)

Unnamed: 0,personId,contentId,title,eventType,eventStrength
2945,-8051903121006324833,-692972306229904743,Blockchain won't kill banks: Bitcoin pioneer,VIEW,1.0
7558,-5444946123447491034,5843362068320522769,ClienteSA - Gente - Empatia e tecnologia,VIEW,1.0
23262,1120069409160402054,5439554346835640135,"Consórcio: Venda de novas cotas cai 13,5% no 1...",VIEW,1.0
12766,-2979537012405607453,7734121175534200554,"""Consultorias promovem a desvalorização do nos...",VIEW,1.0
4481,-7240735065448254654,-377975173223377441,"DARPA Goes ""Meta"" with Machine Learning for Ma...",VIEW,1.0


In [46]:
grouped_df.dtypes

personId           int64
contentId          int64
title             object
eventType         object
eventStrength    float64
dtype: object

let`s convert types into categories:

In [47]:
grouped_df['title'] = grouped_df['title'].astype('category')
grouped_df['personId'] = grouped_df['personId'].astype('category')
grouped_df['contentId'] = grouped_df['contentId'].astype('category')
grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes

In [48]:
grouped_df.dtypes

personId         category
contentId        category
title            category
eventType          object
eventStrength     float64
person_id           int16
content_id          int16
dtype: object

In [49]:
grouped_df.head(5)

Unnamed: 0,personId,contentId,title,eventType,eventStrength,person_id,content_id
0,-9223121837663643404,-8949113594875411859,"No Brasil, '25% dos celulares ainda são 'Burro...",VIEW,1.0,0,65
1,-9223121837663643404,-8377626164558006982,Bad Writing Is Destroying Your Company's Produ...,VIEW,1.0,0,159
2,-9223121837663643404,-8208801367848627943,Ray Kurzweil: The world isn't getting worse - ...,VIEW,1.0,0,187
3,-9223121837663643404,-8187220755213888616,Organizing for digital acceleration: Making a ...,VIEW,1.0,0,195
4,-9223121837663643404,-7423191370472335463,"Espresso Intents: não é magia, é tecnologia! -...",VIEW,1.0,0,313


for further model training let`s create sparse matrices from grouped df as follows:

In [50]:
len(grouped_df['content_id'].unique())

2979

In [51]:
len(grouped_df['person_id'].unique())

1895

In [52]:
sparse_content_person = sparse.csr_matrix((grouped_df['eventStrength'].astype(float),
                                          (grouped_df['content_id'], grouped_df['person_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['eventStrength'].astype(float),
                                          (grouped_df['person_id'], grouped_df['content_id'])))
print(sparse_content_person.shape)
print(sparse_person_content.shape)

(2979, 1895)
(1895, 2979)


In [53]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

  check_blas_config()


In [54]:
# scaling the data
alpha = 15
data = (sparse_person_content * alpha).astype('double')

In [55]:
# Fit the model
model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

let`s try to find 10 similar articles to content with id 544:

In [56]:
content_id = 544
n_similar = 10

# These factors represent the learned user preferences
person_vecs = model.user_factors
# These factors represent the learned item characteristics
content_vecs = model.item_factors
# This calculates the Euclidean norm (or length) of each item's vector in the latent space. 
# These norms are used to normalize the similarity scores
content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))
# similarity scores between the target content item and all other items
scores = content_vecs.dot(content_vecs[content_id]) / content_norms
# Finding Top Similar Items
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

let`s print the similar articles to id=544

In [57]:
for content in similar:
    idx, score = content
    print(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])

Incrível, Skype se lembrou que o Linux existe!
Firebase Test Lab for Android
Automate Docker with the Remote API and Ruby
Windows 10 Anniversary SDK is bringing exciting opportunities to developers
Código Google: Introdução da próxima geração do Google Tag Manager e do Tag Manager 360 para Mobile Apps
New JavaScript library brings Java to browsers without applets
Java Tools and Technologies Landscape Report 2016 | zeroturnaround.com
Novidades do CDI 2.0
The Best Linux Distros of 2016
Dries Buytaert: Advancing Drupal's web services


In [59]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
#     get the interactions scores fromm the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()
#     Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
#     make already interacted articles equel zero
    person_interactions[person_interactions > 1] = 0
#     Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
#     Scale this recommendatiion vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:,0]                                                     
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    titles = []
    scores = []
    
    for idx in content_idx:
        # Append titles and scores to the list
        titles.append(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])
        scores.append(recommend_vector[idx])
    
    recommendations = pd.DataFrame({'title':titles, 'score': scores})
    
    return recommendations    

In [60]:
# get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
print(person_vecs.shape)

(1895, 20)


In [61]:
content_vecs = sparse.csr_matrix(model.item_factors)
print(content_vecs.shape)

(2979, 20)


In [62]:
# Create recommendations for person with id 76

person_id = 76

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)
print(recommendations)

                                               title     score
0  NodeMCU (ESP8266) o módulo que desbanca o Ardu...  1.000000
1  Governo define cronograma para plano nacional ...  0.939335
2  Par de alianças impede que seu parceiro assist...  0.933121
3  Pinterest's Real-World Pins Let In-Store Shopp...  0.930370
4  Google anuncia Android Things, plataforma para...  0.902849
5  Cannes Lions 2016: Tecido tecnológico do Googl...  0.885714
6  Lentes de contato inteligentes serão o wearabl...  0.876232
7  Dez habilidades profissionais mais valorizadas...  0.856778
8  This Battery-Free Computer Sucks Power Out Of ...  0.845792
9                         How to Get a Job at Google  0.844930


In [63]:
grouped_df.loc[grouped_df['person_id'] == 69].sort_values(by=['eventStrength'], ascending=False)[['title', 'eventStrength', 'person_id']]

Unnamed: 0,title,eventStrength,person_id
2299,Seria Stranger Things uma obra de arte do algo...,3.0,69
2298,New settings for 2-Step Verification,1.0,69


## Evaluating recommended system

In [64]:
import random

In [66]:
def make_train(ratings, pct_test=0.2):
    test_set = ratings.copy() # Make a copy os the origina set to be the test set
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy()
    
    nonzero_inds = training_set.nonzero() #finding the indices in the raatings dara with nonzero interactions
    #Zip these pairs 
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))
    
    random.seed(0)
    #round the number of samples needed
    num_samples = int(np.ceil(pct_test * len(nonzero_pairs)))
    #sample a random number item-user pairs without replacement
    samples = random.sample(nonzero_pairs, num_samples)
    
    #Get the item row indices
    content_inds = [index[0] for index in samples]
    
    #get the user column indices
    person_inds = [index[1] for index in samples]
    
    #assign all of the randomly chosen user-item pairs to zero
    training_set[content_inds, person_inds] = 0
    #get rid of zeros in sparse array atorage after update to save space
    training_set.eliminate_zeros()
    
    return training_set, test_set, list(set(person_inds))

In [67]:
content_train, content_test, сontent_persons_altered = make_train(sparse_content_person, pct_test =0.2)

In [68]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [69]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = []
    popularity_auc = []
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1)
    content_vecs = predictions[1]
    for person in altered_persons:
        training_column = training_set[:, person].toarray().reshape(-1)
        zero_inds = np.where(training_column == 0)
        
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0, zero_inds].reshape(-1)
        
        actual = test_set[:, person].toarray()[zero_inds, 0].reshape(-1)
        
        pop = pop_contents[zero_inds]
        
        store_auc.append(auc_score(pred, actual))
        
        popularity_auc.append(auc_score(pop, actual))
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [70]:
calc_mean_auc(content_train, сontent_persons_altered, [person_vecs, content_vecs.T], content_test)

(0.981, 0.819)