### Global params

In [15]:
%load_ext autoreload
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Pre-process

In [16]:
%%time
%autoreload 2
from ratingmatrix import rating_matrix
from user_keywords import load_content_data, load_events_data, merge_on_docID, get_user_counts, get_user_profile

df_content = load_content_data('data/content_refine')
df_content = df_content.rename(columns={"id": "documentId"})
# TODO: using num_days for testing
df_events = load_events_data('data/active1000', num_days=None)
df_events = df_events.drop(['title', 'publishtime'], axis=1)
df_merged = merge_on_docID(df_events, df_content)
df_user_frequencies = get_user_counts(df_merged)
df_user_profile = get_user_profile(df_user_frequencies)
df_user_profile.to_pickle('data/matrix-df/user_profiles.pkl')
df_user_item = rating_matrix(df_events)
df_user_item.to_pickle('data/matrix-df/user_item.pkl')




Wall time: 1min 9s


### Load preprocessed matrices and split into train/test

In [17]:
%autoreload 2
from ratingmatrix import rating_matrix_train_test_split

df_user_profile = pd.read_pickle('data/matrix-df/user_profiles_day1.pkl')
df_user_profile.head()

df_user_item = pd.read_pickle('data/matrix-df/user_item_day1.pkl')
df_user_item.head()

index = df_user_item.index
columns = df_user_item.columns
train, test = rating_matrix_train_test_split(df_user_item.to_numpy(), fraction=0.2)

# Train and test are disjoint (in terms of the 1-values)
train = pd.DataFrame(data=train, index=index, columns=columns)
test = pd.DataFrame(data=test, index=index, columns=columns)


## Method


In [25]:
from recommender_methods import collaborative_filtering_user_based

# How many users, taken from the top of the training matrix, to recommend for.
NUM_USERS = 10

# One entry per user; in the captured output each entry is a dict with
# 'user_id' and 'articles' keys.
recommendations = []

# Slicing the index yields at most NUM_USERS labels, so a training matrix with
# fewer rows no longer raises IndexError the way train.index[i] did.
for user_id in train.index[:NUM_USERS]:
    recommended = collaborative_filtering_user_based(df_user_profile, train, user_id)
    recommendations.append(recommended)

print(recommendations)

cx:il0sdznsjgg9uxgy:3bi2ksost85yi
cx:ijtjdxpz93t5f8m5:2iz9n4nuh22ky
cx:ep1jmztcea6w2dtv4kax775yy:1sm7epvktvk6k
cx:i2i1zmdjyfqb4ugf:2x51wx1f20jgz
cx:ik8tj30jnl47bzl6:25hcdqs8lvpwa
cx:iehbk5kiehv57o82:3rh910bbwo2tz
cx:1almjawqb0fbo2t4dtic5ok78f:290w4prdbspe4
cx:1a14ikdig4lse1yr7x4flb13dd:fo6lrl3k2wob
cx:iebz69i5797mzlpt:2y78jwtw1dq2i
cx:i1tnz3yptk0g12ah:1nb6465z87hxq
[{'user_id': 'cx:il0sdznsjgg9uxgy:3bi2ksost85yi', 'articles': Index(['0023cf8c8637599ee493463a429f4af62817cf3a',
       '007a9c6d33d6b95488e903b551f7ee28cd6155db',
       '00f7e3503795ef0fd7934458523c9fd3f26f99ce',
       '0234696b40bc493104a87a85d0491e90df44b9da',
       '0344bf639aa0e15862c33294c06bcae9eafa352e',
       '05d316f7e6fe9f8431099908e3b8419bcc7a4c0c',
       '06d79ebeba7d9b02486b56ada761790b275995eb',
       '06e6a0fd13a28a32278a0ead6166d1e7433a1f5c',
       '0867dbb33bb90970ae48592057be34246a0124ac',
       '08a17a4aad811ae8a09fc9e1356909cd3632bc60',
       ...
       'f576e58c4f1c1a25b50c122e432080e7ec9a3aff'