### Global params

In [2]:
%load_ext autoreload
import pandas as pd
# Neighbourhood size for the user-based method: passed as the last argument
# to collaborative_filtering_user_based in the Method cell.
k_closest_users = 2   # number of closest users

### Pre-process

In [3]:
%%time
%autoreload 2
from ratingmatrix import rating_matrix
from user_keywords import load_content_data, load_events_data, merge_on_docID, get_user_counts, get_user_profile

df_content = load_content_data('data/content_refine')
df_content = df_content.rename(columns={"id": "documentId"})
# TODO: using num_days for testing
df_events = load_events_data('data/active1000', num_days=None)
df_events = df_events.drop(['title', 'publishtime'], axis=1)
df_merged = merge_on_docID(df_events, df_content)
df_user_frequencies = get_user_counts(df_merged)
df_user_profile = get_user_profile(df_user_frequencies)
df_user_profile.to_pickle('data/matrix-df/user_profiles.pkl')
df_user_item = rating_matrix(df_events)
df_user_item.to_pickle('data/matrix-df/user_item.pkl')


Wall time: 1min 9s


### Load preprocessed matrices and split into train/test

In [4]:
%autoreload 2
from ratingmatrix import rating_matrix_train_test_split

df_user_profile = pd.read_pickle('data/matrix-df/user_profiles.pkl')
df_user_profile.head()

df_user_item = pd.read_pickle('data/matrix-df/user_item.pkl')
df_user_item.head()

index = df_user_item.index
columns = df_user_item.columns
train, test = rating_matrix_train_test_split(df_user_item.to_numpy(), fraction=0.2)

# Train and test are disjoint (in terms of the 1-values)
train = pd.DataFrame(data=train, index=index, columns=columns)
test = pd.DataFrame(data=test, index=index, columns=columns)

## Method: user-based collaborative filtering


In [5]:
%%time
from recommender_methods import collaborative_filtering_user_based

# List with users and their recommended articles
recommendations = []

# The number of requests for recommendations
num_recommendations = 0

# Returns a list with recommended articles
#for user in range(len(df_user_profile.index)):
for user in range(len(df_user_profile.index)):
    num_recommendations += 1
    recommended = collaborative_filtering_user_based(df_user_profile, train, train.index[user], k_closest_users)
    recommendations.append(recommended)

predict = test.copy(deep=True)
predict[:] = 0.0
for user in recommendations:
    for article in user['articles']:
        predict.at[user['user_id'], article] = 1.0

predict.head()


Wall time: 8min 35s


Unnamed: 0,9f3999bd1a1a8d67bcb073ad54840f15cb30f014,ae167d304a4ef49e874389abcba68636b0011f85,735628ed2428f2a7b2a1b78f95f3c6d777e1865b,ca1952721582ff9e2b6d3555c26ee81a3f3f8fdb,7b98a1ddfc682d87d9f2f867a27406252773d548,70a19fd7c9f6827feb3eb4f3df95121664491fa7,2a00c43cf84f7c433431027845505a0fdc77a55d,7dbbd7f3a7ec287bdcbdaf8b8f042732074bf2bd,d9670bfb1b63a9a13f1cae2b962671a9b00bfbdc,9d615dd08d92c8e9670fb72b5c78cbc6b52501c4,...,ecbabf102a768c656b0f3c5311473502e8c0af46,6b72989c492637215c966b0697137a63169e3d8e,f9d69c516c52bf21c6ea6392074028460d344dd2,8672d7148b5e2879a49e7d30032b5775a7b3b67c,43135f601138a5402191191a208d27c53c3da371,e175a205d1f88d5d1500af335a15602fa90249ed,1d1f28e9bf0c9559c15afc4c26b2bc9b5935c55b,3c302729ddf1deffb4562816ee27c40e19de650f,0a471cc4f12cb9247ce6f4ad6a3b03e27254ac0c,39574aa90590c97905445de3001ece8068493a93
cx:il0sdznsjgg9uxgy:3bi2ksost85yi,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cx:ijtjdxpz93t5f8m5:2iz9n4nuh22ky,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cx:ep1jmztcea6w2dtv4kax775yy:1sm7epvktvk6k,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cx:i2i1zmdjyfqb4ugf:2x51wx1f20jgz,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cx:ik8tj30jnl47bzl6:25hcdqs8lvpwa,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
from evaluation import evaluate

# Evaluate the recommendations against the test matrix at each cutoff in turn ...
for cutoff in (5, 10, 20, 100):
    evaluate(recommendations, test, cutoff)
# ... and finally once without a cutoff argument.
evaluate(recommendations, test)






Average recall@5 is 0.0030
Normal recall@5 is 0.3970
ARHR@5 is 0.2111
CTR@5 is 7.9400%

Average recall@10 is 0.0051
Normal recall@10 is 0.6660
ARHR@10 is 0.2450
CTR@10 is 6.6600%

Average recall@20 is 0.0096
Normal recall@20 is 1.2410
ARHR@20 is 0.2830
CTR@20 is 6.2050%

Average recall@100 is 0.0474
Normal recall@100 is 6.1770
ARHR@100 is 0.3821
CTR@100 is 6.1770%

Average recall is 0.4272
Normal recall is 58.6590
ARHR is 0.5192
CTR is 6.2282%
