# DATA PREPROCESSING and FEATURE ENGINEERING

## 1. Import Libraries and Define Data Input and Output Paths

In [1]:
import os
import shutil
import pandas as pd
import numpy as np

import cudf
import cupy
import nvtabular as nvt
from nvtabular.column import Column
from nvtabular.column_group import ColumnGroup, Tag

from preprocess import remove_consecutive_interactions, save_time_based_splits, create_session_aggs

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
# Location of the raw yoochoose click log.
DATA_FOLDER = "/workspace/yoochoose-data/"
FILENAME_PATTERN = 'yoochoose-clicks.dat'
DATA_PATH = os.path.join(DATA_FOLDER, FILENAME_PATTERN)

# Destination for the fitted workflow, the schema and the transformed data.
# Derived from DATA_FOLDER so the two paths cannot drift apart.
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "yoochoose_transformed")
# Presumably controls whether existing output may be replaced -- TODO confirm intended use.
OVERWRITE = False
# NOTE(review): not referenced by any other visible cell; confirm whether a
# minimum-session-length filter is still meant to be applied.
MINIMUM_SESSION_LENGTH = 2

In [3]:
# Read the raw click log into a cuDF DataFrame. Column names are supplied
# explicitly and the `timestamp` column is parsed as a datetime.
interactions_df = cudf.read_csv(
    DATA_PATH,
    sep=',',
    names=['session_id', 'timestamp', 'item_id', 'category'],
    parse_dates=['timestamp'],
)

## 2. Load and clean raw data

1. Remove repeated interactions within the same session.
2. Compute, for each item, the timestamp at which it was first seen.

In [4]:
# Step 1: drop repeated interactions inside a session (the helper reports the
# row counts before and after).
interactions_df = remove_consecutive_interactions(
    interactions_df,
    session_id_col="session_id",
    item_id_col="item_id",
    timestamp_col="timestamp",
)

# Step 2: earliest timestamp observed for every item.
items_first_ts_df = (
    interactions_df
    .groupby('item_id')
    .agg({'timestamp': 'min'})
    .reset_index()
    .rename(columns={'timestamp': 'itemid_ts_first'})
)

# Attach the first-seen timestamp to every interaction; the left join keeps all rows.
interactions_merged_df = interactions_df.merge(
    items_first_ts_df,
    how='left',
    on=['item_id'],
)
interactions_merged_df.head()

Count with in-session repeated interactions: 33003944
Count after removed in-session repeated interactions: 28971543


Unnamed: 0,session_id,timestamp,item_id,category,itemid_ts_first
0,4547,2014-04-06 15:55:47.055,214821277,0,2014-04-01 03:34:33.782
1,4547,2014-04-06 15:58:15.621,214821292,0,2014-04-01 03:44:49.530
2,4547,2014-04-06 15:59:18.978,214820383,0,2014-04-01 05:46:04.863
3,4547,2014-04-06 16:00:29.551,214821277,0,2014-04-01 03:34:33.782
4,4547,2014-04-06 16:02:41.959,214821290,0,2014-04-01 03:25:12.166


## 3. Generate session-based features: 

In [5]:
# Columns passed through as-is: the grouping key and the raw event time.
features = ColumnGroup(["session_id", "timestamp"])

# Categorical features: item_id is tagged as the item identifier and category as a
# multi-class target; both go through Categorify.
features += [
    Column("item_id", tags=[Tag.ITEM_ID, Tag.ITEM]),
    Column("category", tags=Tag.TARGETS_MULTI_CLASS),
] >> nvt.ops.Categorify()

# Time feature: the timestamp as an integer, exposed under the name "ts".
# Assumes the parsed timestamps are nanosecond-resolution (the 13-digit `ts`
# values in the output suggest epoch milliseconds after the division).
# Integer floor division is used instead of `/ 1e6`: converting int64 nanoseconds
# (~1.4e18) to float64 rounds to a multiple of 256 ns, and truncating the float
# quotient can then land one millisecond too low.
sessionTime = ColumnGroup(['timestamp'])
sessionTime_timestamp = (
    sessionTime >>
    nvt.ops.LambdaOp(lambda col: (col.astype(int) // 1_000_000).astype(int)) >>
    nvt.ops.Rename(f = lambda col: "ts")
)
features+=sessionTime_timestamp


# Create session-level features: one row per session, events ordered by "ts".
# NOTE(review): `day_index` is requested in extra_aggs although it is only created
# after this groupby, and no `day_index/last` column appears in the fitted schema;
# confirm create_session_aggs ignores columns that are not in `features`.
session_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    sort_cols=["ts"],
    aggs=create_session_aggs(features, extra_aggs=dict(item_id="count",  day_index="last",
                                                       ts=["first", "last"], timestamp="first"), to_ignore=["timestamp"]),
    name_sep="/"
)
# Give the per-session item count a readable name; other columns keep theirs.
rename_cols = {"item_id/count": "session_size"}
session_features = session_features >> nvt.ops.Rename(f = lambda col: rename_cols.get(col, col))

# Create day index: whole days between a session's first event and the latest
# session start, plus one (so the most recent day gets index 1).
day =  (session_features['timestamp/first'] >>   nvt.ops.LambdaOp(lambda col: (col.max() - col).dt.days + 1) >>
    nvt.ops.Rename(f = lambda col: "day_index")
)
session_features += day

# Zero-padded string version of the day index (width 4), used as the partition key when saving.
day_idx_padded = day >> (lambda col: col.astype(str).str.pad(4,fillchar='0')) >> nvt.ops.Rename(f = lambda col: "day_idx_padded")

# Trim sequences to first 20 items (the model below is built with max_seq_length=20).
non_sequence_cols = [col for col in session_features.columns.names() if 'list' not in col]
groupby_features_trim = ((session_features - non_sequence_cols)) >> nvt.ops.ListSlice(20) >> nvt.ops.Rename(postfix = '_trim')

# Final graph: session-level columns, their trimmed list variants and the padded day index.
processing = session_features + groupby_features_trim + day_idx_padded

In [6]:
# Wrap the column-group graph in a Workflow and fit it on the merged interactions.
# The Dataset is created with cpu=False from the cuDF frame built above.
workflow = nvt.Workflow(processing)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)

In [7]:
# Inspect the workflow's output columns together with their tags.
workflow.column_group.columns

[Column(name='session_id', tags=['groupby_col'], properties={}),
 Column(name='timestamp/first', tags=['first'], properties={}),
 Column(name='session_size', tags=['count'], properties={}),
 Column(name='item_id/list', tags=[<DefaultTags.ITEM: ['item']>, 'categorical', <DefaultTags.ITEM_ID: ['item', 'item_id']>, 'list'], properties={}),
 Column(name='category/list', tags=['target', 'categorical', 'multi_class', 'list'], properties={}),
 Column(name='ts/last', tags=['last'], properties={}),
 Column(name='ts/first', tags=['first'], properties={}),
 Column(name='ts/list', tags=['list'], properties={}),
 Column(name='day_index', tags=None, properties={}),
 Column(name='item_id/list_trim', tags=[<DefaultTags.ITEM: ['item']>, 'categorical', <DefaultTags.ITEM_ID: ['item', 'item_id']>, 'list'], properties={}),
 Column(name='category/list_trim', tags=['target', 'categorical', 'multi_class', 'list'], properties={}),
 Column(name='ts/list_trim', tags=['list'], properties={}),
 Column(name='day_id

In [8]:
# Apply the fitted workflow to the same dataset it was fitted on.
data = workflow.transform(dataset)

In [9]:
# Preview the first transformed sessions.
# NOTE(review): compute() presumably materializes the whole result just to show
# five rows -- confirm this is acceptable for the full dataset.
data.compute().head()

Unnamed: 0,session_id,timestamp/first,session_size,item_id/list,category/list,ts/last,ts/first,ts/list,day_index,item_id/list_trim,category/list_trim,ts/list_trim,day_idx_padded
0,1,2014-04-07 10:54:09.868,3,"[7340, 19523, 15499]","[1, 1, 1]",1396868220305,1396868049868,"[1396868049868, 1396868086998, 1396868220305]",176,"[7340, 19523, 15499]","[1, 1, 1]","[1396868049868, 1396868086998, 1396868220305]",176
1,2,2014-04-07 13:56:37.614,5,"[1968, 8039, 5178, 9410, 605]","[1, 1, 1, 1, 1]",1396879356889,1396878997614,"[1396878997614, 1396879117446, 1396879190710, ...",176,"[1968, 8039, 5178, 9410, 605]","[1, 1, 1, 1, 1]","[1396878997614, 1396879117446, 1396879190710, ...",176
2,3,2014-04-02 13:17:46.940,3,"[81, 2161, 2342]","[1, 1, 1]",1396445412318,1396444666940,"[1396444666940, 1396445162515, 1396445412318]",181,"[81, 2161, 2342]","[1, 1, 1]","[1396444666940, 1396445162515, 1396445412318]",181
3,4,2014-04-07 12:09:10.948,2,"[2463, 5660]","[1, 1]",1396873585416,1396872550948,"[1396872550948, 1396873585416]",176,"[2463, 5660]","[1, 1]","[1396872550948, 1396873585416]",176
4,6,2014-04-06 16:58:20.848,2,"[3212, 206]","[1, 1]",1396803746976,1396803500848,"[1396803500848, 1396803746976]",177,"[3212, 206]","[1, 1]","[1396803500848, 1396803746976]",177


## 4. Save NVTabular Workflow and Processed Data

In [10]:
# Persist the fitted workflow under OUTPUT_FOLDER.
workflow.save(OUTPUT_FOLDER)

- A protobuf schema file is generated that stores the ColumnGroup structure together with its tags.

In [11]:
# Check that the schema file is present in the output folder.
!ls $OUTPUT_FOLDER'/schema.pbtxt'

/workspace/yoochoose-data/yoochoose_transformed/schema.pbtxt


- Save the data, partitioned by the `day_idx_padded` column.

In [12]:
# Write the transformed sessions to disk, split by `day_idx_padded`.
# Later cells read these files (e.g. <OUTPUT_FOLDER>/0002/train.parquet), so the
# export must have run at least once. It is skipped when partition folders
# already exist, unless OVERWRITE is set.
# Assumes partition folders are named after the zero-padded day index (all
# digits), as the path used later suggests -- TODO confirm.
partitions_exist = os.path.isdir(OUTPUT_FOLDER) and any(
    entry.is_dir() and entry.name.isdigit() for entry in os.scandir(OUTPUT_FOLDER)
)
if OVERWRITE or not partitions_exist:
    save_time_based_splits(workflow.transform(dataset), OUTPUT_FOLDER, partition_col='day_idx_padded')

# MODEL BUILDING and TRAINING

## 1. Import Libraries and Methods

In [13]:
from dataclasses import dataclass
import yaml 

from torch import nn 

from recsys_data import get_nvtabular_dataloader

from feature_process import get_feature_process 
from mask_sequence import MLM, CLM, PLM, RTD, get_masking_task
from tower_model import TowerModel 
from prediction_head import ItemPrediction
from meta_model import MetaModel

from training import TrainingArguments, DataArguments, ModelArguments
warnings.filterwarnings('ignore')

## 2. Define Feature_Map

To test the outputs of the meta-model's submodules, we use the feature map produced by the preprocessing of the yoochoose dataset:
- Features can be loaded in one of three ways: from the tagging schema, from the protobuf text file, or from a YAML config file.

In [14]:
# Feature-group description: a label plus the YAML file holding its feature map.
feature_group_configs = dict(
    name='session_based_features_itemid',
    feature_map="./datasets/session_based_features_itemid.yaml",
)

## 3. Set Data Arguments 

In [15]:
# Point the data arguments at the transformed dataset and its feature map.
# The values are assigned on the DataArguments class itself, not on an instance.
DataArguments.data_path = '/workspace/yoochoose-data/yoochoose_transformed/'  
DataArguments.feature_config = feature_group_configs['feature_map'] 
DataArguments.data_loader_engine = 'nvtabular'  # also supporting petastorm and pyarrow 

## 4. Set Time-window for training 

In [16]:
# Training window bounds, expressed as time-window indices (the training log
# below reports "time indices:1-3" for these values).
DataArguments.start_time_window_index = 1
DataArguments.final_time_window_index = 3

## 5. Set Training Hyper-Parameters

In [17]:
# Training hyper-parameters, assigned on the TrainingArguments class itself.
TrainingArguments.n_gpu = 1
TrainingArguments.train_batch_size = 512 * 8  # 4096 samples per batch
TrainingArguments.learning_rate = 1e-3
TrainingArguments.num_train_epochs = 1

## 6. Load a batch of the Processed Data

In [18]:
# Parse the feature-map YAML. The file is plain configuration data, so
# safe_load is used: it builds only basic Python objects and refuses
# Python-specific tags.
with open(feature_group_configs['feature_map']) as yaml_file:
    feature_map = yaml.safe_load(yaml_file)
feature_map

{'item_id/list_trim': {'dtype': 'categorical',
  'cardinality': 52740,
  'is_seq_label': True,
  'is_itemid': True,
  'emb_table': 'item_id-list',
  'log_with_preds_as_metadata': True}}

In [19]:
# Load one batch from the training split of time window 0002.
# The path is built from DataArguments.data_path rather than repeating the
# dataset root as a literal, so the loader follows the configured location.
data_paths = [os.path.join(DataArguments.data_path, '0002', 'train.parquet')]
loader = get_nvtabular_dataloader(DataArguments, TrainingArguments, feature_map, data_paths, TrainingArguments.train_batch_size)
it = iter(loader)
# `first` is reused later for the forward pass and for the Triton inference request.
first = next(it)
first

{'item_id/list_trim': tensor([[ 6106, 12442,  7888,  ...,     0,     0,     0],
         [  156,     0,     0,  ...,     0,     0,     0],
         [  897,  5558,  6555,  ...,     0,     0,     0],
         ...,
         [ 6804,  8359, 14987,  ...,     0,     0,     0],
         [ 1452,     0,     0,  ...,     0,     0,     0],
         [ 4973,  2758,  3932,  ...,     0,     0,     0]], device='cuda:0')}

## 7. Instantiate an End-to-End Meta-Model

- Define a meta-model for next-item prediction

In [20]:
# Build the end-to-end model: an XLNet tower with the 'mlm' masking task.
# max_seq_length=20 matches the ListSlice(20) trimming applied during preprocessing.
meta_model = MetaModel(feature_group_config=[feature_group_configs], 
                       model_type='xlnet', 
                       masking_task='mlm',
                       max_seq_length=20,
                       n_head=4,
                       n_layer=2)

In [21]:
# Display the meta-model's layer structure.
meta_model

MetaModel(
  (feature_group): FeatureGroupProcess(
    (aggregate): SequenceAggregator(
      (aggregator): ElementwiseSum()
    )
  )
  (mask_task): MLM()
  (tower_model): TowerModel(
    (model): XLNetModel(
      (word_embedding): Embedding(1, 128)
      (layer): ModuleList(
        (0): XLNetLayer(
          (rel_attn): XLNetRelativeAttention(
            (layer_norm): LayerNorm((128,), eps=0.03, elementwise_affine=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (ff): XLNetFeedForward(
            (layer_norm): LayerNorm((128,), eps=0.03, elementwise_affine=True)
            (layer_1): Linear(in_features=128, out_features=512, bias=True)
            (layer_2): Linear(in_features=512, out_features=128, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (1): XLNetLayer(
          (rel_attn): XLNetRelativeAttention(
            (layer_norm): LayerNorm((128,), 

- Generate the output for the first batch

In [22]:
# Forward pass on the first batch; the batch dict is unpacked into keyword arguments.
output = meta_model(training=True, **first)

In [23]:
# List the entries returned by the forward pass.
output.keys()

dict_keys(['loss', 'labels', 'predictions', 'pred_metadata', 'model_outputs'])

In [24]:
# Loss for this batch.
# NOTE(review): the displayed value is negative although grad_fn is NllLossBackward,
# and the predictions shown below contain positive values; this suggests the loss
# receives raw scores rather than log-probabilities -- confirm in the prediction head.
output['loss']

tensor(-0.0043, device='cuda:0', grad_fn=<NllLossBackward>)

In [25]:
# Prediction scores produced for this batch.
output['predictions']

tensor([[-0.0667,  0.0909, -0.0206,  ..., -0.0252, -0.0784,  0.0009],
        [-0.0078,  0.0419, -0.0154,  ...,  0.0145,  0.0219,  0.0413],
        [-0.0409, -0.0108, -0.0398,  ..., -0.0663,  0.0204, -0.0548],
        ...,
        [-0.0382, -0.0014,  0.0032,  ..., -0.0135,  0.0194, -0.0348],
        [ 0.0426,  0.0629,  0.0010,  ..., -0.0189,  0.0489, -0.0416],
        [-0.1488, -0.0173,  0.0454,  ..., -0.1199, -0.0235, -0.0948]],
       device='cuda:0', grad_fn=<AddmmBackward>)

## 8. Train the Model

In [26]:
# Instantiate the RecSysTrainer, which manages training and evaluation
from transformers4rec.recsys_trainer import RecSysTrainer, DatasetType
# The argument classes configured in the cells above are passed directly
# (as classes, not instances).
trainer = RecSysTrainer(
    model=meta_model,
    args=TrainingArguments,
    model_args=ModelArguments,
    data_args=DataArguments,
)

In [27]:
from training import fit_and_evaluate

# Train on the configured time window, then evaluate. The bounds are read from
# DataArguments instead of being repeated as literals, so the window is defined
# in one place only.
# NOTE(review): in the recorded output the eval metrics are identical to the
# train metrics; confirm the evaluation really runs on the held-out window.
fit_and_evaluate(
    trainer,
    start_time_window_index=DataArguments.start_time_window_index,
    final_time_window_index=DataArguments.final_time_window_index,
)



************* Training (time indices:1-3) *************




Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[1, 2, 3])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [1, 2, 3])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = -954368
  eval_mem_cpu_peaked_delta = 90005504
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3772212224
  train_avg_precision@10 = 3.71969992758952e-05
  train_avg_precision@1000 = 0.0011366652809859563
  train_avg_precision@20 = 0.0007463916467208441
  train_loss = -0.05457285439802541
  train_ndcg@10 = 8.258292614805719e-05
  train_ndcg@1000 = 0.01546501052669353
  train_ndcg@20 = 0.0024290756967578395
  train_precision@10 = 2.441406286379788e-05
  train_precision@1000 = 0.0001207953593823024
  train_precision@20 = 0.0004523383283109676
  train_recall@10 = 0.000244140625
  train_recall@1000 = 0.12079535590277778
  train_recall@20 = 0.009046766493055556
  train_runtime = 15.5062
  train_samples_per_second = 5283.054

Evaluating on test set (time index:4)....

***** eval results (time index): 4)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 3.71969992758952e-05
	  eval_avg_pre

## 9. Incremental training 

In [28]:
from training import incremental_fit_and_evaluate

# Train and evaluate one time window at a time over the configured range. The
# bounds are read from DataArguments instead of being repeated as literals.
# NOTE(review): this reuses the trainer that was already fitted in the previous
# section, and the recorded eval metrics equal the train metrics; confirm both
# are intended.
incremental_fit_and_evaluate(
    trainer,
    start_time_window_index=DataArguments.start_time_window_index,
    final_time_window_index=DataArguments.final_time_window_index,
)



************* Training (time indices:1-1) *************




Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[1])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [1])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = 1384448
  eval_mem_cpu_peaked_delta = 84787200
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3763692544
  train_avg_precision@10 = 0.015532613359391689
  train_avg_precision@1000 = 0.01710147727280855
  train_avg_precision@20 = 0.015566132962703705
  train_loss = -0.13807205557823182
  train_ndcg@10 = 0.016115390695631504
  train_ndcg@1000 = 0.048832619190216066
  train_ndcg@20 = 0.0162384457886219
  train_precision@10 = 0.0018115234561264515
  train_precision@1000 = 0.0002700683602597564
  train_precision@20 = 0.000930175743997097
  train_recall@10 = 0.018115234375
  train_recall@1000 = 0.270068359375
  train_recall@20 = 0.018603515625
  train_runtime = 3.6751
  train_samples_per_second = 22290.6

Evaluating on test set (time index:2)....

***** eval results (time index): 2)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.015532613359391689
	  eval_avg_precision@1000 = 0.0171014772

Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[2])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [2])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = 5423104
  eval_mem_cpu_peaked_delta = 81014784
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3761792000
  train_avg_precision@10 = 0.007889375323429704
  train_avg_precision@1000 = 0.012157281395047903
  train_avg_precision@20 = 0.010199231095612049
  train_loss = -0.2331487573683262
  train_ndcg@10 = 0.008895107661373913
  train_ndcg@1000 = 0.05885584093630314
  train_ndcg@20 = 0.016506997868418694
  train_precision@10 = 0.0011901855177711695
  train_precision@1000 = 0.00037261964462231845
  train_precision@20 = 0.002017211925704032
  train_recall@10 = 0.01190185546875
  train_recall@1000 = 0.37261962890625
  train_recall@20 = 0.04034423828125
  train_runtime = 3.2536
  train_samples_per_second = 25178.593

Evaluating on test set (time index:3)....

***** eval results (time index): 3)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.007889375323429704
	  eval_avg_precision@1000 = 0.

Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[3])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [3])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = 4964352
  eval_mem_cpu_peaked_delta = 80162816
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3765396480
  train_avg_precision@10 = 0.0023144918626972605
  train_avg_precision@1000 = 0.0037628881566758665
  train_avg_precision@20 = 0.0024774001046483007
  train_loss = -0.29610590849603924
  train_ndcg@10 = 0.004306757490017584
  train_ndcg@1000 = 0.03707056971532958
  train_ndcg@20 = 0.004945235992116588
  train_precision@10 = 0.0011056082855377878
  train_precision@1000 = 0.00027518135695053
  train_precision@20 = 0.0006835937633046082
  train_recall@10 = 0.011056082589285714
  train_recall@1000 = 0.27518136160714285
  train_recall@20 = 0.013671875
  train_runtime = 5.1086
  train_samples_per_second = 16035.833

Evaluating on test set (time index:4)....

***** eval results (time index): 4)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.0023144918626972605
	  eval_avg_precision@1000

## 10. Serving the Model with Triton

In [29]:
# External dependencies
import os
from time import time
import warnings 

from tritonclient.utils import *
import tritonclient.grpc as grpcclient
import nvtabular
import cudf
from timeit import default_timer as timer
from datetime import timedelta
warnings.filterwarnings('ignore')

In [30]:
# Location of the trained model, built from the trainer's output directory and model name.
# NOTE(review): output_dir and model_name are not set in any visible cell --
# presumably class defaults; confirm.
MODEL_PATH = os.path.join(TrainingArguments.output_dir, TrainingArguments.model_name)
# Same folder as DATA_FOLDER defined at the top of the notebook.
INPUT_DATA_DIR = "/workspace/yoochoose-data/"

- Verify that Triton is running correctly

In [31]:
import tritonhttpclient

# Create the HTTP client and check that the server answers.
# Building the client object alone does not show that Triton is reachable
# (presumably no request is sent until the first call -- confirm for the
# installed client version), so liveness is queried explicitly.
try:
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
    if not triton_client.is_server_live():
        print("channel creation failed: Triton server at localhost:8000 is not live")
except Exception as e:
    print("channel creation failed: " + str(e))

client created.




- Load the trained model 

In [32]:
# Ask Triton to load the model by name.
# NOTE(review): the name is hard-coded; confirm it matches the
# TrainingArguments.model_name used to build MODEL_PATH.
triton_client.load_model(model_name='yoochoose_xlnet')

model loaded


- Get predictions for the batch of data stored in `first`

In [33]:
from inference import get_inference
# Send the batch loaded earlier (`first`) to the Triton server through the client.
get_inference(triton_client, first) 

Computing output for input data:
	 {'item_id/list_trim': tensor([[ 6106, 12442,  7888,  ...,     0,     0,     0],
        [  156,     0,     0,  ...,     0,     0,     0],
        [  897,  5558,  6555,  ...,     0,     0,     0],
        ...,
        [ 6804,  8359, 14987,  ...,     0,     0,     0],
        [ 1452,     0,     0,  ...,     0,     0,     0],
        [ 4973,  2758,  3932,  ...,     0,     0,     0]], device='cuda:0')}
result:
 [[ 0.01365081  0.03654207  0.03806695 ... -0.00918404  0.12739913
  -0.08028503]
 [-0.05451949  0.0035963   0.03438552 ...  0.03322484  0.04080514
  -0.11493868]
 [-0.06734393  0.05595984  0.02335898 ...  0.03544397  0.04862121
  -0.08146254]
 ...
 [ 0.00092665  0.03755305  0.086696   ... -0.05103946  0.09755504
  -0.1181016 ]
 [-0.04559753  0.05380863 -0.07840014 ... -0.06347984  0.01734107
  -0.05125437]
 [-0.07515392 -0.04484848 -0.00732597 ...  0.05014362 -0.00692284
  -0.0051174 ]]
