### Global params

In [86]:
%load_ext autoreload
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Pre-process

In [87]:
%%time
%autoreload 2
import os

from ratingmatrix import rating_matrix
from user_keywords import load_content_data, load_events_data, merge_on_docID, get_user_counts, get_user_profile

# Create the output directory up front: DataFrame.to_pickle does not create
# missing parent directories, and failing at the very end would throw away
# the whole pre-processing run.
os.makedirs('data/matrix-df', exist_ok=True)

# Content metadata; "id" is renamed to "documentId"
# (presumably the key merge_on_docID joins on -- verify against user_keywords).
df_content = load_content_data('data/content_refine')
df_content = df_content.rename(columns={"id": "documentId"})
# TODO: num_days=1 is only a shortcut for quick testing; revisit before a full run.
df_events = load_events_data('data/active1000', num_days=1)
df_events = df_events.drop(['title', 'publishtime'], axis=1)

# Build the user profiles from the merged events/content frame and persist them.
df_merged = merge_on_docID(df_events, df_content)
df_user_frequencies = get_user_counts(df_merged)
df_user_profile = get_user_profile(df_user_frequencies)
df_user_profile.to_pickle('data/matrix-df/user_profiles.pkl')

# Build the user-item rating matrix from the events alone and persist it.
df_user_item = rating_matrix(df_events)
df_user_item.to_pickle('data/matrix-df/user_item.pkl')




Wall time: 20.2 s


### Load preprocessed matrices and create the train/test split

In [88]:
%autoreload 2
from ratingmatrix import rating_matrix_train_test_split

# Reload the matrices written by the pre-processing cell.
# NOTE: pickle can execute arbitrary code on load -- only read files produced by this project.
# (The former mid-cell `.head()` calls were removed: a notebook only displays
# the last expression of a cell, so they never showed anything. To preview a
# frame, put `.head()` as the last line of its own cell.)
df_user_profile = pd.read_pickle('data/matrix-df/user_profiles.pkl')
df_user_item = pd.read_pickle('data/matrix-df/user_item.pkl')

# Keep the row/column labels so the NumPy arrays returned by the split can be
# wrapped back into labelled DataFrames.
index = df_user_item.index
columns = df_user_item.columns
train, test = rating_matrix_train_test_split(df_user_item.to_numpy(), fraction=0.2)

# Train and test are disjoint (in terms of the 1-values)
train = pd.DataFrame(data=train, index=index, columns=columns)
test = pd.DataFrame(data=test, index=index, columns=columns)


## Method


In [93]:
from recommender_methods import collaborative_filtering_user_based

# Recommend articles for one user with user-based collaborative filtering.
# In the recorded run the result printed as a pandas Index of article
# identifiers rather than a plain Python list.
target_user_id = "cx:il0sdznsjgg9uxgy:3bi2ksost85yi"
recommended = collaborative_filtering_user_based(
    df_user_profile,
    df_user_item,
    target_user_id,
)
print(recommended)

K CLOSEST  userId
cx:1eovabziwdszs3chxvp1uafjc3:1x98l0bfqg1oo    0.976385
cx:hxxfp6tfxyb2n748:3gf1m281mpv6g              0.956058
dtype: float64
Index(['0867dbb33bb90970ae48592057be34246a0124ac',
       '2a00c43cf84f7c433431027845505a0fdc77a55d',
       '70a19fd7c9f6827feb3eb4f3df95121664491fa7',
       '71e7c18e889b7e19352cfb63a15bd8aa32c51630',
       '9a876c7f372136b2e51b9b98ad578c773b002c94',
       '9d615dd08d92c8e9670fb72b5c78cbc6b52501c4',
       'b16b516eefb647edec256ad2f9b2c7a897b9785b',
       'b28e7c163c39941aa1cbd0b7b3a821576771f893',
       'e1f0d81ed8ccb738db28fdfaa51ad3a6b3fc2b8e'],
      dtype='object')
