In [1]:
import os
import shutil
import pandas as pd
import numpy as np

import cudf
import cupy
import nvtabular as nvt
import transformers4rec as tr
from transformers4rec import preprocess as pp

In [2]:
# Input/output locations. The defaults are the original hard-coded paths; set the
# YOOCHOOSE_DATA_FOLDER / YOOCHOOSE_OUTPUT_FOLDER environment variables to run the
# notebook on another machine without editing this cell.
DATA_FOLDER = os.environ.get("YOOCHOOSE_DATA_FOLDER", "/romeyn/data/yoochoose/")
FILENAME_PATTERN = 'yoochoose-clicks.dat'
DATA_PATH = os.path.join(DATA_FOLDER, FILENAME_PATTERN)

OUTPUT_FOLDER = os.environ.get("YOOCHOOSE_OUTPUT_FOLDER", "/romeyn/data/yoochoose_transformed")
# When False, an already existing OUTPUT_FOLDER is reused instead of re-running
# the preprocessing workflow.
OVERWRITE = False
# Sessions with fewer interactions than this are filtered out.
MINIMUM_SESSION_LENGTH = 2

### Loading data

In [3]:
# Read the raw click log: column names are supplied explicitly through `names`
# and `timestamp` is parsed as a datetime column.
click_columns = ['session_id', 'timestamp', 'item_id', 'category']
interactions_df = cudf.read_csv(
    DATA_PATH,
    sep=',',
    names=click_columns,
    parse_dates=['timestamp'],
)
# NOTE(review): judging by its name, this drops consecutive repeated
# interactions within a session -- confirm against the `pp` module.
interactions_df = pp.remove_consecutive_interactions(interactions_df)

# Earliest timestamp observed for each item, stored as `itemid_ts_first`.
first_seen = interactions_df.groupby('item_id').agg({'timestamp': 'min'})
first_seen = first_seen.reset_index()
items_first_ts_df = first_seen.rename(columns={'timestamp': 'itemid_ts_first'})

# Left-join so that every interaction row carries its item's first-seen timestamp.
interactions_merged_df = interactions_df.merge(
    items_first_ts_df,
    on=['item_id'],
    how='left',
)

interactions_merged_df.head()

Count with in-session repeated interactions: 33003944
Count after removed in-session repeated interactions: 28971543


Unnamed: 0,session_id,timestamp,item_id,category,itemid_ts_first
0,623,2014-04-06 20:07:04.303,214839373,0,2014-04-01 03:41:31.325
1,624,2014-04-06 12:40:39.889,214637025,0,2014-04-02 17:40:22.149
2,624,2014-04-06 12:44:12.485,214636355,0,2014-04-02 15:17:18.195
3,626,2014-04-07 19:11:17.562,214827007,0,2014-04-01 10:36:38.889
4,626,2014-04-07 19:11:47.407,214826925,0,2014-04-03 07:10:23.818


### Transformations

In [4]:
# Reusable op bundle built from a LogOp and a Normalize op.
# NOTE(review): with auto_renaming=True the outputs appear to get
# "/LogOp/Normalize"-style name suffixes -- confirm against pp.Ops.
LogNormalize = pp.Ops(
    nvt.ops.LogOp(),
    nvt.ops.Normalize(),
    auto_renaming=True,
)

In [5]:
# Feature-engineering graph for the session data. `features` starts with the two
# raw columns and is extended in place with each derived feature group below.
features = ['session_id', 'timestamp']

# Temporal features
# ItemRecency is parameterised with "itemid_ts_first" (the per-item first-seen
# timestamp merged in when the data was loaded); its output is then fed through
# the LogNormalize op bundle.
# NOTE(review): add=True presumably keeps the un-normalised column as well -- confirm.
features += LogNormalize(["timestamp"] >> pp.ItemRecency("itemid_ts_first"), add=True)
# NOTE(review): presumably derives the calendar columns from "timestamp", with
# sin/cos variants because add_cycled=True, and the "ts" column used for sorting
# below -- confirm against pp.TimestampFeatures.
features += pp.TimestampFeatures(add_cycled=True, delimiter="/")(["timestamp"])

# Categorical features
# item_id and category are passed through NVTabular's Categorify op.
categorical = ['item_id', 'category'] >> nvt.ops.Categorify()
features += categorical

# Group-by session
# Rows are grouped by session_id and sorted by "ts". The aggregation spec comes
# from pp.create_session_aggs, extended with a count of item_id, first/last of
# ts and first of timestamp; "timestamp" is passed in to_ignore.
# name_sep="/" is the separator used in the aggregated column names
# (e.g. "item_id/count", which is renamed right below).
session_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    sort_cols=["ts"],
    aggs=pp.create_session_aggs(features, extra_aggs=dict(item_id="count", ts=["first", "last"], timestamp="first"), to_ignore=["timestamp"]),
    name_sep="/"
)
# Rename "item_id/count" to "session_size"; every other column name is returned unchanged.
rename_cols = {"item_id/count": "session_size"} 
session_features = session_features >> nvt.ops.Rename(lambda col: rename_cols.get(col, col))
# SessionDay is applied to the "timestamp/first" column only.
# NOTE(review): presumably this yields the day index used for time-based splits -- confirm.
session_features += pp.SessionDay()(session_features.filter("timestamp/first"))

# Keep only sessions with at least MINIMUM_SESSION_LENGTH interactions.
filtered_sessions = session_features >> nvt.ops.Filter(f=lambda df: df["session_size"] >= MINIMUM_SESSION_LENGTH)

In [6]:
# Reuse the workflow saved in OUTPUT_FOLDER when that folder already exists and
# OVERWRITE is off; otherwise fit the workflow on the interactions, export the
# transformed data as time-based splits and save the fitted workflow.
if os.path.exists(OUTPUT_FOLDER) and not OVERWRITE:
    workflow = nvt.Workflow.load(OUTPUT_FOLDER)
else:
    workflow = nvt.Workflow(filtered_sessions)
    dataset = nvt.Dataset(interactions_merged_df, cpu=False)
    workflow.fit(dataset)
    transformed_dataset = workflow.transform(dataset)
    pp.save_time_based_splits(transformed_dataset, OUTPUT_FOLDER)
    workflow.save(OUTPUT_FOLDER)

# Display the output column names of the (fitted or loaded) workflow.
workflow.column_group.columns

['item_id/list',
 'timestamp/hour_cos/list',
 'ts/first',
 'timestamp/first',
 'timestamp/year/list',
 'timestamp/month/list',
 'timestamp/hour/list',
 'timestamp/age_days/list',
 'ts/list',
 'ts/last',
 'timestamp/weekday_cos/list',
 'timestamp/day/list',
 'timestamp/weekday_sin/list',
 'session_id',
 'timestamp/age_days/LogOp/Normalize/list',
 'session_size',
 'timestamp/hour_sin/list',
 'category/list',
 'timestamp/weekday/list',
 'day_idx']

### Train model

In [None]:
# Data configuration: read from the preprocessing output folder, take the
# feature configuration from the workflow, and use the "item_id/list" column
# as the label.
# NOTE(review): the time-window indices (1..15) and the 4-digit folder padding
# presumably match the folders written by the time-based split step -- confirm.
data_config = dict(
    data_path=OUTPUT_FOLDER,
    feature_config=workflow,
    label_name="item_id/list",
    start_time_window_index=1,
    final_time_window_index=15,
    time_window_folder_pad_digits=4,
)
data_args = tr.DataArguments(**data_config)

# Model arguments are left at their defaults; the training arguments receive
# "/tmp/" as first positional argument and enable attention-weight logging.
model_args = tr.ModelArguments()
training_args = tr.TrainingArguments("/tmp/", log_attention_weights=True)

model_and_trainer = tr.get_model_and_trainer(model_args, data_args, training_args)
rec_model, trainer = model_and_trainer

trainer.reset_lr_scheduler()
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mromeyn[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
