In [3]:
# Third-party Library Imports
import pandas as pd                  # Data processing
import numpy as np                   # Math
from typing import List, Tuple, Dict # Type hinting
import ast                           # Literal evaluation

# Model creation - PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Prediction making
from sklearn.metrics.pairwise import cosine_similarity
import math

In [5]:
articles_df = pd.read_csv("..//..//..//data/shared_articles.csv")
articles_df = articles_df[articles_df["eventType"] == "CONTENT SHARED"]
articles_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [12]:
interactions_df = pd.read_csv("..//..//..//data/users_interactions.csv")
interactions_df.head(10)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [13]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

In [14]:
interactions_df.head(10)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR,1.0
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,,1.0
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,1.0
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,,1.0
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,,1.0


In [15]:
users_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1895
# users with at least 5 interactions: 1140


In [16]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 72312
# of interactions from users with at least 5 interactions: 69868


In [17]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventStrength'].sum() \
                    .apply(smooth_user_preference).reset_index()


unique_person_ids   = interactions_full_df['personId'].unique()
unique_article_ids  = interactions_full_df['contentId'].unique()

person_id_mapping   = {person_id: index for index, person_id in enumerate(unique_person_ids)}
article_id_mapping  = {article_id: index for index, article_id in enumerate(unique_article_ids)}

# Map the personIds in the DataFrame to integers
interactions_full_df['mapped_personId']  = interactions_full_df['personId'].map(person_id_mapping)
interactions_full_df['mapped_contentId'] = interactions_full_df['contentId'].map(article_id_mapping)

print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(100)

# of unique user/item interactions: 39106


Unnamed: 0,personId,contentId,eventStrength,mapped_personId,mapped_contentId
0,-9223121837663643404,-8949113594875411859,1.000000,0,0
1,-9223121837663643404,-8377626164558006982,1.000000,0,1
2,-9223121837663643404,-8208801367848627943,1.000000,0,2
3,-9223121837663643404,-8187220755213888616,1.000000,0,3
4,-9223121837663643404,-7423191370472335463,3.169925,0,4
...,...,...,...,...,...
95,-9172914609055320039,1999466695269209462,1.000000,6,91
96,-9172914609055320039,2081512200574495983,1.000000,6,92
97,-9172914609055320039,2470587255066232267,1.000000,6,93
98,-9172914609055320039,3375733407275108717,1.000000,6,94


In [18]:
interactions_full_df.shape

(39106, 5)

In [19]:
NUM_USERS    = interactions_full_df["personId"].nunique()
NUM_ARTICLES = interactions_full_df["contentId"].nunique() 

print(f"Num users: {NUM_USERS}")
print(f"Num articles: {NUM_ARTICLES}")

Num users: 1140
Num articles: 2984


In [20]:
users_items_pivot_matrix_df = interactions_full_df.pivot(index='mapped_personId', 
                                                          columns='mapped_contentId', 
                                                          values='eventStrength').fillna(0)

users_items_pivot_matrix_df.head(10)

mapped_contentId,0,1,2,3,4,5,6,7,8,9,...,2974,2975,2976,2977,2978,2979,2980,2981,2982,2983
mapped_personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,3.169925,1.0,1.0,1.0,1.0,1.584963,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
class CollaborativeFilteringModel(nn.Module):
    def __init__(self, num_users, num_articles, embedding_dim=10):
        super(CollaborativeFilteringModel, self).__init__()
        self.user_embeddings    = nn.Embedding(num_users,    embedding_dim)
        self.article_embeddings = nn.Embedding(num_articles, embedding_dim)

    def forward(self, user_id, article_id):
        #user_id = user_id.item()
        #article_id = article_id.item()

        user_embed    = self.user_embeddings(user_id.to(torch.long))
        article_embed = self.article_embeddings(article_id.to(torch.long))
        
        dot_product = torch.sum(user_embed * article_embed)
        return dot_product

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu" # Port models and data to GPU, if possible, for faster processing

model = CollaborativeFilteringModel(NUM_USERS, NUM_ARTICLES).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [23]:
dataset = interactions_full_df.to_numpy()

In [15]:
num_epochs = 30
min_loss = 1e18

for epoch in range(num_epochs):
    ind = 0
    total_loss = 0
    for _, _, true_rating, user_id, article_id in dataset:
        #print(user_id, article_id, true_rating)
        user_id     = torch.tensor(user_id,     dtype=torch.float32).to(device)
        article_id  = torch.tensor(article_id,  dtype=torch.float32).to(device)
        true_rating = torch.tensor(true_rating, dtype=torch.float32).to(device)

        predicted_rating = model(user_id, article_id)        
        loss = criterion(predicted_rating, true_rating)
        total_loss += loss.item()
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (ind + 1) % 1000 == 0:
            print(ind + 1)
        ind += 1
        
    average_loss = total_loss / len(dataset)
    print(f"Epoch: #{epoch + 1: 5} Average loss: {average_loss}") 
    if average_loss < min_loss:
        torch.save(model.state_dict(), "model_testing_30epochs.pth") # Save model weights, in the end, so it can be used within the project
        print("MODEL SAVED")
        min_loss = average_loss

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
Epoch: #    1 Average loss: 10.465104723580202
MODEL SAVED
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
Epoch: #    2 Average loss: 6.855135979772426
MODEL SAVED
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
Epoch: #    3 Average loss: 4.949099679670946
MODEL SAVED
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
270

In [25]:
# Extract embeddings for articles and topics
user_embeddings = model.user_embeddings.weight.data.cpu().detach().numpy()
article_embeddings = model.article_embeddings.weight.data.cpu().detach().numpy()

# Normalize articles, so cosine similarity makes sense
#article_embeddings = article_embeddings / np.linalg.norm(article_embeddings, axis=1).reshape((-1, 1))

resulting_model = np.dot(user_embeddings, np.transpose(article_embeddings))

In [26]:
rdf = pd.DataFrame(resulting_model)

In [27]:
rdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2974,2975,2976,2977,2978,2979,2980,2981,2982,2983
0,1.054464,0.876797,1.307073,0.907233,1.249704,1.024968,1.520579,1.198120,1.583396,1.079303,...,1.598085,0.419656,-1.079966,1.750176,2.102481,0.237168,1.254380,-0.688821,0.536602,1.884646
1,1.481542,-0.657410,-1.076364,-0.586395,-0.368905,3.819562,-1.815341,0.565637,-0.200701,2.139812,...,1.507141,5.796224,8.697722,-0.687497,-2.147519,0.841394,-3.991184,-1.515902,6.721435,2.036528
2,0.771269,1.094787,1.775126,2.701546,1.518520,4.762135,0.224839,-1.685221,0.792877,-1.272486,...,0.874870,0.562683,-4.712303,3.318353,2.644733,-5.525280,2.873781,-0.096134,1.950646,5.435910
3,1.484099,-1.738583,1.581800,1.508416,1.914668,0.800640,1.532558,2.466308,1.249068,-0.756018,...,2.267398,-1.069869,-3.990031,0.054581,2.262521,-3.623005,3.602432,-1.902282,1.469932,3.088486
4,0.986859,0.871540,-0.080395,-1.022042,0.500456,0.952192,0.718923,0.198774,-1.103331,1.855404,...,-1.755862,2.781156,4.305604,4.750012,5.217992,-2.143749,-0.944856,1.794032,2.456340,0.753987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,1.029901,3.320011,-0.006196,-0.240406,-0.284682,3.375339,-0.101850,1.686689,1.059582,6.378237,...,1.366256,4.055940,4.819284,0.005436,4.076486,5.315945,-1.761547,-0.646860,4.342858,-0.338733
1136,1.677655,3.331847,1.429009,0.940324,0.796298,2.642996,1.346674,-1.214647,1.026742,1.076069,...,0.836312,2.153101,0.967990,5.886027,2.178553,0.637744,-0.651013,0.312865,0.775120,2.305364
1137,2.505403,2.260804,1.809022,0.413138,1.199643,3.442667,2.193805,-1.157429,1.321569,-0.720028,...,2.035621,1.842251,0.867094,3.279088,2.597172,0.602639,0.771448,-4.107899,1.971070,1.434357
1138,3.070556,0.835383,2.269547,1.011176,1.605198,-0.119949,2.532102,1.329278,1.881140,-2.395207,...,3.880168,-0.588613,-2.706857,3.336327,-0.970470,0.018538,1.850685,-4.126478,-0.366974,1.318870


In [28]:
with torch.no_grad():
    user_id    = torch.tensor(0).to(device) 
    article_id = torch.tensor(4).to(device)
    inference = model(user_id, article_id)

    print(f"[{user_id}, {article_id}]: {inference}")

[0, 4]: 1.2497038841247559


In [33]:
target_user = 9
row_values = rdf.iloc[target_user]
target_row = {col: value for col, value in enumerate(row_values)}
sorted_row = [(value, col) for col, value in sorted(target_row.items(), reverse=True, key=lambda item: item[1])]

print(sorted_row)

[(4.9432549476623535, 2205), (4.638181209564209, 1959), (4.543590068817139, 2616), (4.226729393005371, 738), (4.108290672302246, 2006), (4.050051212310791, 2526), (4.024713516235352, 2900), (3.904984951019287, 2937), (3.8405213356018066, 1044), (3.79921555519104, 2607), (3.788303852081299, 1992), (3.7125425338745117, 1860), (3.707636833190918, 1099), (3.6434621810913086, 426), (3.5556414127349854, 2327), (3.5305275917053223, 1952), (3.5139236450195312, 2036), (3.489546537399292, 294), (3.441159963607788, 2598), (3.4211394786834717, 691), (3.407433271408081, 2894), (3.404578924179077, 216), (3.3369736671447754, 914), (3.30329966545105, 2392), (3.3004331588745117, 2095), (3.2690935134887695, 776), (3.2575013637542725, 1725), (3.2330806255340576, 2682), (3.228691577911377, 1349), (3.1902267932891846, 2788), (3.179422378540039, 1406), (3.1759042739868164, 1320), (3.1579864025115967, 1812), (3.154757261276245, 1251), (3.1426053047180176, 1478), (3.1358835697174072, 1821), (3.121173858642578

In [34]:
TOP_N = 5 
top_articles = sorted_row[:TOP_N]
for rating, mapped_contentId in top_articles:
    print(rating, mapped_contentId)
    selected = interactions_full_df[interactions_full_df["mapped_contentId"] == mapped_contentId]["contentId"].unique()[0]
    found = articles_df[articles_df["contentId"] == selected]
    print(list(found["url"]))

4.9432549476623535 2205
['http://techcrunch.com/2016/04/28/plug-the-fathom-neural-compute-stick-into-any-usb-device-to-make-it-smarter/']
4.638181209564209 1959
['http://drauziovarella.com.br/drauzio/ai-que-preguica/']
4.543590068817139 2616
['http://gizmodo.com/darpa-wants-to-give-radio-waves-ai-to-stretch-bandwidth-1767678812']
4.226729393005371 738
['http://mashable.com/2016/06/12/fake-ted-talk/']
4.108290672302246 2006
['http://www.coindesk.com/us-judge-bitcoin-money-coin-mx/']
