### Global params

In [1]:
%load_ext autoreload
# The autoreload extension is loaded here; later cells run `%autoreload 2`
# before importing the project modules.
import pandas as pd
# Tunable parameters shared by the cells below.
k = 2   # Number of closest users; passed to collaborative_filtering_user_based and to evaluate
n = 8   # Number of articles we recommend; passed to collaborative_filtering_user_based

### Pre-process

In [3]:
%%time
%autoreload 2
from ratingmatrix import rating_matrix
from user_keywords import load_content_data, load_events_data, merge_on_docID, get_user_counts, get_user_profile

df_content = load_content_data('data/content_refine')
df_content = df_content.rename(columns={"id": "documentId"})
# TODO: using num_days for testing
df_events = load_events_data('data/active1000', num_days=None)
df_events = df_events.drop(['title', 'publishtime'], axis=1)
df_merged = merge_on_docID(df_events, df_content)
df_user_frequencies = get_user_counts(df_merged)
df_user_profile = get_user_profile(df_user_frequencies)
df_user_profile.to_pickle('data/matrix-df/user_profiles.pkl')
df_user_item = rating_matrix(df_events)
df_user_item.to_pickle('data/matrix-df/user_item.pkl')

Wall time: 1min 12s


### Load preprocessed matrices and create the train/test split

In [4]:
%autoreload 2
from ratingmatrix import rating_matrix_train_test_split

df_user_profile = pd.read_pickle('data/matrix-df/user_profiles.pkl')
df_user_profile.head()

df_user_item = pd.read_pickle('data/matrix-df/user_item.pkl')
df_user_item.head()

index = df_user_item.index
columns = df_user_item.columns
train, test = rating_matrix_train_test_split(df_user_item.to_numpy(), fraction=0.2)

# Train and test are disjoint (in terms of the 1-values)
train = pd.DataFrame(data=train, index=index, columns=columns)
test = pd.DataFrame(data=test, index=index, columns=columns)

## Method: user-based collaborative filtering


In [15]:
%%time
from recommender_methods import collaborative_filtering_user_based

# List with users and their recommended articles
recommendations = []

# The number of requests for recommendations
num_recommendations = 0

# Returns a list with recommended articles
for user in range(100):
    num_recommendations += 1
    recommended = collaborative_filtering_user_based(df_user_profile, train, train.index[user], k, n)
    recommendations.append(recommended)

predict = test.copy(deep=True)
predict[:] = 0.0
for user in recommendations:
    for article in user['articles']:
        predict.at[user['user_id'], article] = 1.0

predict.head()


Index(['004edd2345395b2b798b941e8b92ed9c550a749a',
       '005acce48b90f324748a3a92cbb33bd24bbfc622',
       '0066e790f51f2aeee04283fedc25869c4071e045',
       '00696652959c6bccb89750134333e25bc554f8e0',
       '00b8cc9941163db58d9502635a6b8b1230834fd0',
       '00e4d5f6a2005aea6d631b64bfaf3e290f01f597',
       '0158d19a8b9b0ab3fd273ee9fbc3caf361f4cd34',
       '0167289ea75706065201eee42d067d92663bcf8d'],
      dtype='object')
Index(['0006b2caef9ae801cfa0a48ec737544241ec4dca',
       '0023cf8c8637599ee493463a429f4af62817cf3a',
       '005acce48b90f324748a3a92cbb33bd24bbfc622',
       '009e6759a188566730262cd31de5932cb259de0d',
       '00d3e23239e37e676bc3479d114938214911570a',
       '00e4d5f6a2005aea6d631b64bfaf3e290f01f597',
       '012d166cad29fea07346dca965d0acb2909cb978',
       '016186e8e4bfc383165e5059c6211473d28cfa06'],
      dtype='object')
Index(['007f9e3194ee7202d34093cee81767519400a8b4',
       '00e4d5f6a2005aea6d631b64bfaf3e290f01f597',
       '01bc4cba7f428111ef1bacd8db4a

KeyboardInterrupt: 

In [7]:
from evaluation import evaluate
# Score the collected recommendations against the test matrix.
# NOTE(review): `k` is the number of closest users, whereas `n` is the number
# of recommended articles. The metrics are printed as "@2" (the value of k),
# so confirm that evaluate() really expects k here rather than n.
evaluate(recommendations, test, k, num_recommendations)

Recall@2 is 0.0010
ARHR@2 is 0.0000
CTR@2 is 1.0%
