## GP-TopFreq

GP-TopFreq – a combination of P-TopFreq and G-TopFreq that first uses
P-TopFreq to fill the basket, then uses G-TopFeq to fill any remaining slots.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

#%cd /content/drive/MyDrive/recsys

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
os.chdir("/content/drive/MyDrive/recsys")
!ls

checkpoints  metrics.py  __pycache__	      recanet_model.py
data	     preprocess  recanet_datasets.py


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.2)

from IPython.display import clear_output
%matplotlib inline

import pandas as pd
import plotly.express as px
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from sklearn.metrics import accuracy_score, roc_auc_score

## Dataloader+dataset

In [8]:
history_len=20
item_embed_size=128
user_embed_size=32

h1 = 128
h2 = 128
h3 = 128
h4 = 128
h5 = 128

dataset_name = 'dunnhumby_cj'

In [9]:
from metrics import recall_k, ndcg_k, repeat_score_item, repeat_score_user
from recanet_datasets import PreDataset, RCNDataset, DeviceDataLoader

In [12]:
path_train = f'data/{dataset_name}/baskets/train_baskets.csv'
path_test = f'data/{dataset_name}/baskets/test_baskets.csv'
path_val = f'data/{dataset_name}/baskets/valid_baskets.csv'

In [13]:
dataset = PreDataset(path_train,path_val, path_test, dataset=dataset_name, history_len=history_len,basket_count_min=3, min_item_count = 5)

number of test users: 2483
items: 91764
filtered items: 36963


In [14]:
train = pd.read_csv(path_train)
test_users = dataset.test_users

In [36]:
max_bucket_len = max(train.groupby('basket_id').item_id.count())
max_num_baskets = max(train.groupby('user_id').basket_id.nunique())

In [72]:
def gp_topfreq(train, test_users, n=max_num_baskets):
    result = {}
    top_popular = train.item_id.value_counts().index.tolist()[:max_bucket_len]
    train = train.sort_values(by='date')
    
    for user in tqdm(test_users):
        items = train[train.user_id==user]
        dates = items.date.unique()[-n:]
        top_personal = items[items.date.isin(dates)].item_id.value_counts().index.tolist()[:max_bucket_len]
        
        if len(top_personal)<max_bucket_len:
            gp_top = (top_personal + top_popular)[:max_bucket_len]
            result[user] = gp_top
        else:
            result[user] = top_personal
        
    return result

In [75]:
res = gp_topfreq(train, test_users, n=50)

  0%|          | 0/2483 [00:00<?, ?it/s]

In [76]:
test_baskets = pd.read_csv(path_test)
user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
user_test_baskets_dict = dict(zip( user_test_baskets_df['user_id'],user_test_baskets_df['item_id']))

user_predictions = res
final_users = set(dataset.test_users).intersection(set(list(user_test_baskets_dict.keys())))
print('predictions ready',len(user_predictions))
print('number of final test users:',len(final_users))
for k in [5,10,20,'B']:
    print(k)
    recall_scores = {}
    ndcg_scores = {}
    #zero = 0
    for user in final_users:

        top_items = []
        if user in user_predictions:
            top_items = user_predictions[user]
        else:
            zero+=1

        if k == 'B':
            recall_scores[user] = recall_k(user_test_baskets_dict[user],top_items,len(user_test_baskets_dict[user]))
            ndcg_scores[user] = ndcg_k(user_test_baskets_dict[user],top_items,len(user_test_baskets_dict[user]))
        else:
            recall_scores[user] = recall_k(user_test_baskets_dict[user],top_items,k)
            ndcg_scores[user] = ndcg_k(user_test_baskets_dict[user],top_items,k)
    #print(zero)
    print('recall:',np.mean(list(recall_scores.values())))
    print('ndcg:',np.mean(list(ndcg_scores.values())))

predictions ready 2483
number of final test users: 1243
5
recall: 0.1122471587550507
ndcg: 0.1710109936246384
10
recall: 0.14732749868705472
ndcg: 0.14012473594393066
20
recall: 0.20017823183717073
ndcg: 0.11050241486255191
B
recall: 0.12149068839735666
ndcg: 0.13985002689423645
