In [1]:
import tensorflow as tf
import pandas as pd
from collections import defaultdict
from pathlib import Path

import tophat.callbacks as cbks
from tophat.data import FeatureSource, InteractionsSource
from tophat.constants import FType, FGroup
from tophat.tasks.wrapper import FactorizationTaskWrapper
from tophat.core import TophatModel
from tophat.evaluation import Validator

# Directory holding the saved retail-rocket inputs that the later cells load.
DATA_DIR = Path('../../data/retail-rocket', 'saved')
# Handed to the summary callback as its log directory.
LOG_DIR = '/dev/null'

# Settings shared by both tasks: embedding size, batch size, and the epoch
# count passed to `fit`.
EMB_DIM, BATCH_SIZE, N_EPOCHS = 30, 128, 20

# Interactions

In [2]:
# Column layout and loader shared by both interaction sources.
# NOTE(review): `pd.read_msgpack` was deprecated in pandas 0.25 and removed in
# 1.0, so this cell needs an older pandas — confirm the pinned version.
_interaction_kwargs = dict(
    user_col='visitorid',
    item_col='itemid',
    activity_col='event',
    load_fn=pd.read_msgpack,
)

# Interactions restricted to add-to-cart and transaction events
# (consumed by the task wrappers below).
events_tsplit = InteractionsSource(
    name='events_tsplit',
    path=DATA_DIR.joinpath('events_tsplit.msg'),
    activity_filter_set={'addtocart', 'transaction'},
    **_interaction_kwargs
)

# Interactions restricted to transaction events only
# (consumed by the validators below).
events_vsplit = InteractionsSource(
    name='events_vsplit',
    path=DATA_DIR.joinpath('events_vsplit.msg'),
    activity_filter_set={'transaction'},
    **_interaction_kwargs
)

# Content Features

In [3]:
# Categorical item features read from the item-category table, keyed by item id.
taxon_feats = FeatureSource(
    path=DATA_DIR.joinpath('item_categories.msg'),
    load_fn=pd.read_msgpack,
    index_col='itemid',
    feature_type=FType.CAT,
    # Levels 2 and 3 are passed as a pair to concatenate ...
    concat_cols=[('categoryid_lvl2', 'categoryid_lvl3')],
    # ... while the individual level 3-5 columns are listed for dropping.
    drop_cols=['categoryid_lvl3', 'categoryid_lvl4', 'categoryid_lvl5'],
    name='taxonomy',
)

In [4]:
# Feature sources per group: users have none, items use the taxonomy features.
features = dict(user=[], item=[taxon_feats])

# Tasks
We will create two independent tasks: one trained with BPR and the other with WARP. Because we want each task to learn its own set of embeddings, we wrap each one in its own `tf.variable_scope`.

## BPR

In [5]:
# Constructor arguments for the BPR task, assembled up front so that the
# scoped section below contains only the construction itself.
_bpr_task_kwargs = dict(
    name='bpr',
    loss_fn='bpr',
    sample_method='uniform_verified',
    interactions=events_tsplit,
    group_features=features,
    embedding_map_kwargs=dict(embedding_dim=EMB_DIM),
    batch_size=BATCH_SIZE,
)

# Build the task inside its own variable scope.
with tf.variable_scope('bpr'):
    task_bpr = FactorizationTaskWrapper(**_bpr_task_kwargs)


INFO:root:Filtering on event in {'transaction', 'addtocart'}
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  CategoricalDtype(existing_fgroup_cats))
INFO:root:interactions_df shape:	(77248,4)
INFO:root:user features shape:	(32690,1)
INFO:root:item_features shape:	(21093,5)


INFO:tensorflow:Scale of 0 disables regularizer.


INFO:tensorflow:Scale of 0 disables regularizer.


INFO:tensorflow:Scale of 0 disables regularizer.


INFO:tensorflow:Scale of 0 disables regularizer.


### Validators

In [6]:
# Arguments common to both BPR validators.
# NOTE(review): -1 presumably means "no limit" for both settings — confirm.
_bpr_validator_kwargs = dict(
    parent_task_wrapper=task_bpr,
    limit_items=-1,
    n_users_eval=-1,
)

# Validator over the validation interactions with both cold flags switched off.
primary_validator = Validator(
    events_vsplit,
    name='userXitem',
    include_cold=False,
    cold_only=False,
    **_bpr_validator_kwargs
)

# Validator with both cold flags switched on; it is additionally given the
# feature sources and a per-feature switch that defaults to True for every key.
cold_validator = Validator(
    events_vsplit,
    name='userXcolditem',
    include_cold=True,
    cold_only=True,
    features_srcs=features,
    specific_feature=defaultdict(lambda: True),
    **_bpr_validator_kwargs
)


INFO:root:Filtering on event in {'transaction'}
INFO:root:warm interactions_df shape:	(381,4)
INFO:root:Evaluating on 21093 items
INFO:root:Already loaded
INFO:root:Already loaded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  CategoricalDtype(existing_fgroup_cats))
INFO:root:interactions_df shape:	(2218,4)
INFO:root:user features shape:	(1213,1)
INFO:root:item_features shape:	(1703,5)
INFO:root:Evaluating on 21666 items


### Callbacks

In [7]:

summary_cb = cbks.Summary(log_dir=LOG_DIR)

# One scorer per validator; each is handed the summary callback's writer and
# the same `freq` setting.
val_cb, cold_val_cb = [
    cbks.Scorer(validator, freq=5, summary_writer=summary_cb.summary_writer)
    for validator in (primary_validator, cold_validator)
]

callbacks = [summary_cb, val_cb, cold_val_cb]


In [8]:
# Fit a model containing only the BPR task, with the callbacks defined above.
model_bpr = TophatModel(tasks=[task_bpr])
model_bpr.fit(N_EPOCHS, verbose=0, callbacks=callbacks)

# Keep the score history collected by the primary (userXitem) scorer as a DataFrame.
score_hist_bpr = pd.DataFrame(data=val_cb.score_hists)

INFO:root:Scoring (userXitem):
100%|██████████| 50/50 [00:01<00:00, 34.95it/s]
INFO:root:(val)mapk = 0.01014627213835372 +/- 0.014307241400840342
INFO:root:(val)auc = 0.8000749945640564 +/- 0.06346800178289413
INFO:root:(val)tjurs = 0.07188010960817337 +/- 0.030427850782871246
INFO:root:(val)pm = 4.725895881652832 +/- 0.500525712966919
INFO:root:(val)nm = -0.00023694461560808122 +/- 0.08295296877622604
INFO:root:Scoring (userXcolditem):
100%|██████████| 68/68 [00:02<00:00, 28.69it/s]
INFO:root:(val)mapk = 0.0 +/- 0.0
INFO:root:(val)auc = 0.6301641464233398 +/- 0.05847698822617531
INFO:root:(val)tjurs = 0.32978567481040955 +/- 0.07504571974277496
INFO:root:(val)pm = 1.350817084312439 +/- 0.5178150534629822
INFO:root:(val)nm = 0.004955396056175232 +/- 0.0663822740316391
INFO:root:Scoring (userXitem):
100%|██████████| 50/50 [00:01<00:00, 43.34it/s]
INFO:root:(val)mapk = 0.0314435511944133 +/- 0.01886909172195286
INFO:root:(val)auc = 0.8069820404052734 +/- 0.0601748451590538
INFO:root:(val

## WARP
Again, we wrap the task in its own `tf.variable_scope` so that its embeddings don't collide with the BPR embeddings.

In [9]:
# Constructor arguments for the WARP task, assembled up front so that the
# scoped section below contains only the construction itself.
_warp_task_kwargs = dict(
    name='warp',
    loss_fn='kos',
    sample_method='adaptive_warp',
    interactions=events_tsplit,
    group_features=features,
    embedding_map_kwargs=dict(embedding_dim=EMB_DIM),
    batch_size=BATCH_SIZE,
)

# Build the task inside its own variable scope.
with tf.variable_scope('warp'):
    task_warp = FactorizationTaskWrapper(**_warp_task_kwargs)


INFO:root:Already loaded
INFO:root:Already loaded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  CategoricalDtype(existing_fgroup_cats))
INFO:root:interactions_df shape:	(77248,4)
INFO:root:user features shape:	(32690,1)
INFO:root:item_features shape:	(21093,5)


INFO:tensorflow:Scale of 0 disables regularizer.


INFO:tensorflow:Scale of 0 disables regularizer.


INFO:tensorflow:Scale of 0 disables regularizer.


INFO:tensorflow:Scale of 0 disables regularizer.


### Validators - WARP

In [10]:
# Arguments common to both WARP validators.
# NOTE(review): -1 presumably means "no limit" for both settings — confirm.
_warp_validator_kwargs = dict(
    parent_task_wrapper=task_warp,
    limit_items=-1,
    n_users_eval=-1,
)

# NOTE: `primary_validator` and `cold_validator` are rebound here, so from this
# cell onward they refer to the WARP task rather than the BPR one.

# Validator over the validation interactions with both cold flags switched off.
primary_validator = Validator(
    events_vsplit,
    name='userXitem',
    include_cold=False,
    cold_only=False,
    **_warp_validator_kwargs
)

# Validator with both cold flags switched on; it is additionally given the
# feature sources and a per-feature switch that defaults to True for every key.
cold_validator = Validator(
    events_vsplit,
    name='userXcolditem',
    include_cold=True,
    cold_only=True,
    features_srcs=features,
    specific_feature=defaultdict(lambda: True),
    **_warp_validator_kwargs
)


INFO:root:Already loaded
INFO:root:warm interactions_df shape:	(381,4)
INFO:root:Evaluating on 21093 items
INFO:root:Already loaded
INFO:root:Already loaded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  CategoricalDtype(existing_fgroup_cats))
INFO:root:interactions_df shape:	(2218,4)
INFO:root:user features shape:	(1213,1)
INFO:root:item_features shape:	(1703,5)
INFO:root:Evaluating on 21666 items


### Callbacks - WARP

In [11]:
# NOTE: this rebinds `summary_cb`, `val_cb`, `cold_val_cb` and `callbacks`;
# run the cells in order so each `fit` call sees the intended objects.
summary_cb = cbks.Summary(log_dir=LOG_DIR)

# One scorer per validator; each is handed the summary callback's writer and
# the same `freq` setting.
val_cb, cold_val_cb = [
    cbks.Scorer(validator, freq=5, summary_writer=summary_cb.summary_writer)
    for validator in (primary_validator, cold_validator)
]

callbacks = [summary_cb, val_cb, cold_val_cb]

In [12]:
# Fit a model containing only the WARP task, using whatever `callbacks`
# currently refers to.
model_warp = TophatModel(tasks=[task_warp])
model_warp.fit(N_EPOCHS, verbose=0, callbacks=callbacks)

# Keep the score history collected by the primary (userXitem) scorer as a DataFrame.
score_hist_warp = pd.DataFrame(data=val_cb.score_hists)

INFO:root:Scoring (userXitem):
100%|██████████| 50/50 [00:01<00:00, 34.37it/s]
INFO:root:(val)mapk = 0.06480414394379079 +/- 0.028669130089191825
INFO:root:(val)auc = 0.8409512042999268 +/- 0.05230003595352173
INFO:root:(val)tjurs = 0.1576167792081833 +/- 0.027170870453119278
INFO:root:(val)pm = 2.129396677017212 +/- 0.1612854301929474
INFO:root:(val)nm = -0.07475711405277252 +/- 0.022180631756782532
INFO:root:Scoring (userXcolditem):
100%|██████████| 68/68 [00:02<00:00, 29.75it/s]
INFO:root:(val)mapk = 0.0010069442392264991 +/- 0.0014847102808881586
INFO:root:(val)auc = 0.6531161069869995 +/- 0.051502589136362076
INFO:root:(val)tjurs = 0.3437824249267578 +/- 0.054778192192316055
INFO:root:(val)pm = 0.740329384803772 +/- 0.24799583852291107
INFO:root:(val)nm = -0.09676805138587952 +/- 0.033008329570293427
INFO:root:Scoring (userXitem):
100%|██████████| 50/50 [00:01<00:00, 37.74it/s]
INFO:root:(val)mapk = 0.10291136709799278 +/- 0.05239266177958909
INFO:root:(val)auc = 0.842957019805908