# DATA PREPROCESSING and FEATURE ENGINEERING

## 1. Import Libraries and Define Data Input and Output Paths

In [1]:
import os
import shutil
import pandas as pd
import numpy as np

import cudf
import cupy
import nvtabular as nvt
from nvtabular.column import Column
from nvtabular.column_group import ColumnGroup, Tag

from preprocess import remove_consecutive_interactions, save_time_based_splits, create_session_aggs

import warnings
warnings.filterwarnings('ignore')

In [2]:
DATA_FOLDER = "/workspace/yoochoose-data/"
FILENAME_PATTERN = 'yoochoose-clicks.dat'
DATA_PATH = os.path.join(DATA_FOLDER, FILENAME_PATTERN)

OUTPUT_FOLDER = "/workspace/yoochoose-data/yoochoose_transformed"
OVERWRITE = False
MINIMUM_SESSION_LENGTH = 2

In [3]:
interactions_df = cudf.read_csv(DATA_PATH, sep=',', 
                                names=['session_id','timestamp', 'item_id', 'category'], 
                                parse_dates=['timestamp'])

## 2. Load and clean raw data : 
    1- Remove repeated interactions within the same session 
    2- Create date when item was seen for the first-time  

In [4]:
interactions_df = remove_consecutive_interactions(interactions_df)
items_first_ts_df = interactions_df.groupby('item_id').agg({'timestamp': 'min'}).reset_index().rename(columns={'timestamp': 'itemid_ts_first'})
interactions_merged_df = interactions_df.merge(items_first_ts_df, on=['item_id'], how='left')
interactions_merged_df.head()

Count with in-session repeated interactions: 33003944
Count after removed in-session repeated interactions: 28971543


Unnamed: 0,session_id,timestamp,item_id,category,itemid_ts_first
0,1626,2014-04-02 20:38:38.904,214644313,0,2014-04-01 06:47:53.138
1,1626,2014-04-02 20:39:17.414,214821011,0,2014-04-01 03:30:29.541
2,1626,2014-04-02 20:40:06.533,214835445,0,2014-04-01 09:36:04.606
3,1627,2014-04-05 23:09:14.658,214826844,0,2014-04-01 03:28:33.582
4,1628,2014-04-02 21:13:37.579,214819552,0,2014-04-01 03:08:47.161


## 3. Generate session-based features: 

In [5]:
features = ColumnGroup(["session_id", "timestamp"])

# Categorical features: Tag item-id column as Target
features += [
    Column("item_id", tags=[Tag.ITEM_ID]),
    Column("category", tags=Tag.ITEM),
] >> nvt.ops.Categorify()

# create time features
sessionTime = ColumnGroup(['timestamp'])
sessionTime_timestamp = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: (col.astype(int) / 1e6).astype(int)) >> 
    nvt.ops.Rename(f = lambda col: "ts")
)
features+=sessionTime_timestamp


# Create session-level feature : list columns 
session_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    sort_cols=["ts"],
    aggs=create_session_aggs(features, extra_aggs=dict(item_id="count",  day_index="last", 
                                                       ts=["first", "last"], timestamp="first"), to_ignore=["timestamp"]),
    name_sep="/"
)
rename_cols = {"item_id/count": "session_size"} 
session_features = session_features >> nvt.ops.Rename(lambda col: rename_cols.get(col, col))

# Create day index 
day =  (session_features['timestamp/first'] >>   nvt.ops.LambdaOp(lambda col: (col.max() - col).dt.days + 1) >> 
    nvt.ops.Rename(f = lambda col: "day_index")
)  
session_features += day 

day_idx_padded = day >> (lambda col: col.astype(str).str.pad(4,fillchar='0')) >> nvt.ops.Rename(f = lambda col: "day_idx_padded")

# Trim sequences to first 20 items 
non_sequence_cols = [col for col in session_features.columns.names() if 'list' not in col]
groupby_features_trim = ((session_features - non_sequence_cols)) >> nvt.ops.ListSlice(20) >> nvt.ops.Rename(postfix = '_trim') 

processing = session_features + groupby_features_trim + day_idx_padded

In [6]:
session_features.columns

[Column(name='session_id', tags=['groupby_col'], properties={}),
 Column(name='timestamp/first', tags=['first'], properties={}),
 Column(name='session_size', tags=['count'], properties={}),
 Column(name='item_id/list', tags=[<DefaultTags.ITEM_ID: ['item', 'item_id']>, 'list', 'categorical'], properties={}),
 Column(name='category/list', tags=['item', 'list', 'categorical'], properties={}),
 Column(name='ts/first', tags=['first'], properties={}),
 Column(name='ts/list', tags=['list'], properties={}),
 Column(name='ts/last', tags=['last'], properties={}),
 Column(name='day_index', tags=None, properties={})]

In [7]:
workflow = nvt.Workflow(processing)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)

In [8]:
workflow.column_group.columns

[Column(name='session_id', tags=['groupby_col'], properties={}),
 Column(name='timestamp/first', tags=['first'], properties={}),
 Column(name='session_size', tags=['count'], properties={}),
 Column(name='item_id/list', tags=[<DefaultTags.ITEM_ID: ['item', 'item_id']>, 'list', 'categorical'], properties={}),
 Column(name='category/list', tags=['item', 'list', 'categorical'], properties={}),
 Column(name='ts/first', tags=['first'], properties={}),
 Column(name='ts/list', tags=['list'], properties={}),
 Column(name='ts/last', tags=['last'], properties={}),
 Column(name='day_index', tags=None, properties={}),
 Column(name='item_id/list_trim', tags=[<DefaultTags.ITEM_ID: ['item', 'item_id']>, 'list', 'categorical'], properties={}),
 Column(name='category/list_trim', tags=['item', 'list', 'categorical'], properties={}),
 Column(name='ts/list_trim', tags=['list'], properties={}),
 Column(name='day_idx_padded', tags=None, properties={})]

In [9]:
data = workflow.transform(dataset)

In [10]:
data.compute().head()

Unnamed: 0,session_id,timestamp/first,session_size,item_id/list,category/list,ts/first,ts/list,ts/last,day_index,item_id/list_trim,category/list_trim,ts/list_trim,day_idx_padded
0,1,2014-04-07 10:54:09.868,3,"[7340, 19523, 15499]","[1, 1, 1]",1396868049868,"[1396868049868, 1396868086998, 1396868220305]",1396868220305,176,"[7340, 19523, 15499]","[1, 1, 1]","[1396868049868, 1396868086998, 1396868220305]",176
1,2,2014-04-07 13:56:37.614,5,"[1968, 8039, 5178, 9410, 605]","[1, 1, 1, 1, 1]",1396878997614,"[1396878997614, 1396879117446, 1396879190710, ...",1396879356889,176,"[1968, 8039, 5178, 9410, 605]","[1, 1, 1, 1, 1]","[1396878997614, 1396879117446, 1396879190710, ...",176
2,3,2014-04-02 13:17:46.940,3,"[81, 2161, 2342]","[1, 1, 1]",1396444666940,"[1396444666940, 1396445162515, 1396445412318]",1396445412318,181,"[81, 2161, 2342]","[1, 1, 1]","[1396444666940, 1396445162515, 1396445412318]",181
3,4,2014-04-07 12:09:10.948,2,"[2463, 5660]","[1, 1]",1396872550948,"[1396872550948, 1396873585416]",1396873585416,176,"[2463, 5660]","[1, 1]","[1396872550948, 1396873585416]",176
4,6,2014-04-06 16:58:20.848,2,"[3212, 206]","[1, 1]",1396803500848,"[1396803500848, 1396803746976]",1396803746976,177,"[3212, 206]","[1, 1]","[1396803500848, 1396803746976]",177


## 4. Save NVTabular Workflow and Processed Data

In [11]:
workflow.save(OUTPUT_FOLDER)

- save the data by partitioning based on the day_idx_padded

In [12]:
# save_time_based_splits(workflow.transform(dataset), OUTPUT_FOLDER, partition_col='day_idx_padded')

# MODEL BUILDING and TRAINING

## 1. Import Libraries and Methods

In [13]:
from dataclasses import dataclass
import yaml 

from torch import nn 

from recsys_data import get_nvtabular_dataloader

from feature_process import get_feature_process 
from mask_sequence import MLM, CLM, PLM, RTD, get_masking_task
from tower_model import TowerModel 
from prediction_head import ItemPrediction
from meta_model import MetaModel

from training import TrainingArguments, DataArguments, ModelArguments

## 2. Define Feature_Map

To test the outputs of Meta-model submodules, we'll consider the feature_map related to the pre-processing of yoochoose dataset :

In [14]:
feature_group_configs = {'name' : 'session_based_features_itemid',
                        'feature_map' : "./datasets/session_based_features_itemid.yaml"}

## 3. Set Data Arguments 

In [15]:
DataArguments.data_path = '/workspace/yoochoose-data/yoochoose_transformed/'  
DataArguments.feature_config = feature_group_configs['feature_map'] 
DataArguments.data_loader_engine = 'nvtabular'  # also supporting petastorm and pyarrow 

## 4. Set Time-window for training 

In [16]:
DataArguments.start_time_window_index = 1
DataArguments.final_time_window_index = 3

## 5. Set Training Hyper-Parameters

In [17]:
TrainingArguments.n_gpu = 1
TrainingArguments.train_batch_size = 512 * 8
TrainingArguments.learning_rate = 1e-3
TrainingArguments.num_train_epochs = 1

## 6. Load a batch of the Processed Data¶

In [18]:
with open(feature_group_configs['feature_map']) as yaml_file:
    feature_map = yaml.load(yaml_file, Loader=yaml.FullLoader)
feature_map

{'item_id/list_trim': {'dtype': 'categorical',
  'cardinality': 52740,
  'is_seq_label': True,
  'is_itemid': True,
  'emb_table': 'item_id-list',
  'log_with_preds_as_metadata': True}}

In [19]:
data_paths = ['/workspace/yoochoose-data/yoochoose_transformed/0002/train.parquet']
loader = get_nvtabular_dataloader(DataArguments, TrainingArguments, feature_map, data_paths, TrainingArguments.train_batch_size)
it = iter(loader)
first = next(it)
first

{'item_id/list_trim': tensor([[ 6106, 12442,  7888,  ...,     0,     0,     0],
         [  156,     0,     0,  ...,     0,     0,     0],
         [  897,  5558,  6555,  ...,     0,     0,     0],
         ...,
         [ 6804,  8359, 14987,  ...,     0,     0,     0],
         [ 1452,     0,     0,  ...,     0,     0,     0],
         [ 4973,  2758,  3932,  ...,     0,     0,     0]], device='cuda:0')}

## 7. Instantiate an End-to-End Meta-Model

-  Define a meta-class for next item prediction  

In [20]:
meta_model = MetaModel(feature_group_config=[feature_group_configs], 
                       model_type='xlnet', 
                       masking_task='mlm',
                       max_seq_length=20,
                       n_head=4,
                       n_layer=2)

In [21]:
# print out the meta_model' layers.
meta_model

MetaModel(
  (feature_group): FeatureGroupProcess(
    (aggregate): SequenceAggregator(
      (aggregator): ElementwiseSum()
    )
  )
  (mask_task): MLM()
  (tower_model): TowerModel(
    (model): XLNetModel(
      (word_embedding): Embedding(1, 128)
      (layer): ModuleList(
        (0): XLNetLayer(
          (rel_attn): XLNetRelativeAttention(
            (layer_norm): LayerNorm((128,), eps=0.03, elementwise_affine=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (ff): XLNetFeedForward(
            (layer_norm): LayerNorm((128,), eps=0.03, elementwise_affine=True)
            (layer_1): Linear(in_features=128, out_features=512, bias=True)
            (layer_2): Linear(in_features=512, out_features=128, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (1): XLNetLayer(
          (rel_attn): XLNetRelativeAttention(
            (layer_norm): LayerNorm((128,), 

- Generate the output for the first batch

In [22]:
output = meta_model(training=True, **first)

In [23]:
output.keys()

dict_keys(['loss', 'labels', 'predictions', 'pred_metadata', 'model_outputs'])

In [24]:
output['loss']

tensor(-0.0002, device='cuda:0', grad_fn=<NllLossBackward>)

In [25]:
output['predictions']

tensor([[ 0.0817, -0.0933,  0.0447,  ..., -0.0388,  0.0132, -0.0351],
        [-0.0051, -0.0956,  0.0231,  ..., -0.0080, -0.0022,  0.0210],
        [ 0.0709, -0.1023,  0.0860,  ..., -0.0767, -0.0267, -0.0515],
        ...,
        [ 0.1012, -0.1536,  0.0316,  ..., -0.1368,  0.0007, -0.0606],
        [ 0.1283, -0.0041,  0.0400,  ..., -0.1228,  0.0025, -0.0185],
        [ 0.0448, -0.0978, -0.0368,  ...,  0.0091,  0.0891, -0.0386]],
       device='cuda:0', grad_fn=<AddmmBackward>)

## 8. Train the Model

In [26]:
# Instantiate the RecSysTrainer, which manages training and evaluation
from transformers4rec.recsys_trainer import RecSysTrainer, DatasetType
trainer = RecSysTrainer(
    model=meta_model,
    args=TrainingArguments,
    model_args=ModelArguments,
    data_args=DataArguments,
)

In [27]:
from training import fit
fit(trainer, start_time_window_index=1, final_time_window_index=3)



************* Training (time indices:1-3) *************




Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[1, 2, 3])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [1, 2, 3])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = 4022272
  eval_mem_cpu_peaked_delta = 85684224
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3772212224
  train_avg_precision@10 = 0.00013894917153973237
  train_avg_precision@1000 = 0.0012684272175344329
  train_avg_precision@20 = 0.0007196772235652639
  train_loss = -0.054798177546925016
  train_ndcg@10 = 0.0002502151894279652
  train_ndcg@1000 = 0.012319817362974087
  train_ndcg@20 = 0.0025499717772213947
  train_precision@10 = 6.103515766476953e-05
  train_precision@1000 = 8.676486565188195e-05
  train_precision@20 = 0.0005059136318676691
  train_recall@10 = 0.0006103515625
  train_recall@1000 = 0.0867648654513889
  train_recall@20 = 0.010118272569444444
  train_runtime = 14.5612
  train_samples_per_second = 5625.901

Evaluating on test set (time index:4)....

***** eval results (time index): 4)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.00013894917153973237
	  eval_a

## 9. Incremental training 

In [28]:
from training import incremental_fit
incremental_fit(trainer, start_time_window_index=1, final_time_window_index=3)



************* Training (time indices:1-1) *************




Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[1])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [1])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = 909312
  eval_mem_cpu_peaked_delta = 83886080
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3763692544
  train_avg_precision@10 = 0.019070037454366685
  train_avg_precision@1000 = 0.02038167156279087
  train_avg_precision@20 = 0.019387409649789335
  train_loss = -0.13771139979362487
  train_ndcg@10 = 0.01982517819851637
  train_ndcg@1000 = 0.04873221516609192
  train_ndcg@20 = 0.02114473097026348
  train_precision@10 = 0.002250976557843387
  train_precision@1000 = 0.00025708007742650805
  train_precision@20 = 0.0014038086286745966
  train_recall@10 = 0.022509765625
  train_recall@1000 = 0.257080078125
  train_recall@20 = 0.028076171875
  train_runtime = 3.4692
  train_samples_per_second = 23613.311

Evaluating on test set (time index:2)....

***** eval results (time index): 2)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.019070037454366685
	  eval_avg_precision@1000 = 0.020381671

Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[2])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [2])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = -3121152
  eval_mem_cpu_peaked_delta = 89497600
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3761792000
  train_avg_precision@10 = 0.022967674769461155
  train_avg_precision@1000 = 0.025625487323850393
  train_avg_precision@20 = 0.023957342840731144
  train_loss = -0.23869337514042854
  train_ndcg@10 = 0.024389258585870266
  train_ndcg@1000 = 0.06621946208178997
  train_ndcg@20 = 0.027674706652760506
  train_precision@10 = 0.0029541016556322575
  train_precision@1000 = 0.0003478393628029153
  train_precision@20 = 0.002093505929224193
  train_recall@10 = 0.029541015625
  train_recall@1000 = 0.34783935546875
  train_recall@20 = 0.0418701171875
  train_runtime = 2.8294
  train_samples_per_second = 28953.247

Evaluating on test set (time index:3)....

***** eval results (time index): 3)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.022967674769461155
	  eval_avg_precision@1000 = 0.02

Step,Training Loss,Validation Loss


************* Evaluation *************


Evaluating on train set (time index:[3])....



Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


***** train results (time index): [3])*****

  epoch = 1.0
  eval_mem_cpu_alloc_delta = 1458176
  eval_mem_cpu_peaked_delta = 85053440
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 3765396480
  train_avg_precision@10 = 0.0015704996019069637
  train_avg_precision@1000 = 0.0033315447425203665
  train_avg_precision@20 = 0.0019161895948595234
  train_loss = -0.306142053433827
  train_ndcg@10 = 0.003178828828302877
  train_ndcg@1000 = 0.03575688228011131
  train_ndcg@20 = 0.004456988868436643
  train_precision@10 = 0.0008893694529043776
  train_precision@1000 = 0.0002638113801367581
  train_precision@20 = 0.0006992885194319699
  train_recall@10 = 0.008893694196428572
  train_recall@1000 = 0.26381138392857145
  train_recall@20 = 0.013985770089285714
  train_runtime = 4.856
  train_samples_per_second = 16869.803

Evaluating on test set (time index:4)....

***** eval results (time index): 4)*****

	  epoch = 1.0
	  eval_avg_precision@10 = 0.0015704996019069637
	  eval_avg_precis