In [24]:
import os
import torch 
import numpy
import pandas as pd 
import cudf
import cupy
import nvtabular as nvt

### Create random input data similar to pre-processed Yoochoose dataset structure 

In [25]:
# Build a synthetic interaction table that mimics the column layout of the
# pre-processed Yoochoose data: one row per (session, item) interaction.
# The generator is seeded so that Restart & Run All regenerates the same rows.
RANDOM_SEED = 42
numpy.random.seed(RANDOM_SEED)

NUM_ROWS = 200000
# Not referenced in this cell; presumably the per-session length cap that later
# cells spell out as the literal 20 - TODO confirm and reuse it there.
session_length = 20
# Column name -> NUM_ROWS random values. `randint` draws integers from
# [low, high), `uniform` draws floats from [low, high).
inputs = {
    'session_id': numpy.random.randint(70000, 80000, NUM_ROWS),
    'day': numpy.random.randint(1, 10, NUM_ROWS),
    'item_id': numpy.random.randint(1, 51996, NUM_ROWS),
    'category': numpy.random.randint(0, 332, NUM_ROWS),
    'timestamp/age_days': numpy.random.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin' : numpy.random.uniform(0, 1, NUM_ROWS),
    'purchase': numpy.random.randint(0, 2, NUM_ROWS)
    }
# Move the columns into a GPU dataframe and display it as the cell output.
random_data = cudf.DataFrame(inputs)
random_data

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin,purchase
0,77270,7,26757,321,0.508369,0.994320,0
1,70860,2,21548,118,0.264226,0.939826,1
2,75390,4,5289,69,0.049086,0.085271,1
3,75191,5,15107,81,0.249507,0.388882,1
4,75734,3,37267,67,0.757183,0.145604,1
...,...,...,...,...,...,...,...
199995,77126,3,37088,281,0.635821,0.561732,0
199996,72318,4,35382,267,0.448604,0.724329,1
199997,78772,8,8071,294,0.592731,0.980665,1
199998,78542,4,44769,91,0.868292,0.893382,0


### NVTabular workflow

- **TODO**: Update the workflow to use the tagging API once it is finalized.

In [26]:
# Stage 1 - Groupby workflow: collapse the interaction rows into one row per session.
# The input columns are read from `random_data` itself rather than from the
# `inputs` dict, because the name `inputs` is rebound later in this notebook and
# re-running this cell afterwards would otherwise fail.
groupby_features = list(random_data.columns) >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs={
        "item_id": ["list"],
        "category": ["list"],
        "day": ["first"],
        "purchase": ["first"],
        "timestamp/age_days": ["list"],
        'timestamp/weekday/sin': ["list"],
        },
    name_sep="-")
# Names of the list-valued outputs (those containing "-list"); these get trimmed below.
groupby_features_list = [col for col in groupby_features.output_columns.names if '-list' in col]

# Fit and apply the groupby on its own. The intermediate result gets its own
# name so the two stages are not hidden behind a single reused variable.
groupby_workflow = nvt.Workflow(groupby_features)
raw_dataset = nvt.Dataset(random_data, cpu=False)
groupby_workflow.fit(raw_dataset)
sessions_grouped_gdf = groupby_workflow.transform(raw_dataset).to_ddf().compute()

# Stage 2 - trim every list column to its first 20 items and add the result
# as new columns with a `_trim` postfix, alongside all stage-1 columns.
# TODO: merge this into stage 1 (a single workflow) once the bug related to
# adding two workflow nodes is fixed.
# NOTE(review): after this cell `workflow` is bound to this second, trim-only
# workflow - confirm that is the one later cells are meant to save.
groupby_features_trim = groupby_features_list >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix='_trim')
workflow = nvt.Workflow(list(sessions_grouped_gdf.columns) + groupby_features_trim)
dataset = nvt.Dataset(sessions_grouped_gdf, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()

In [27]:
# Preview the first three session rows of the pre-processed dataframe.
sessions_gdf.head(3)

Unnamed: 0,category-list_trim,timestamp/age_days-list_trim,item_id-list_trim,timestamp/weekday/sin-list_trim,session_id,category-list,purchase-first,timestamp/age_days-list,item_id-list,timestamp/weekday/sin-list,day-first
0,"[146, 228, 269, 262, 252, 144, 241, 139, 85, 2...","[0.8433147126150244, 0.36594029226833047, 0.48...","[35969, 7218, 4254, 43307, 24474, 46468, 17340...","[0.8425804078540428, 0.4314946368378889, 0.217...",70000,"[146, 228, 269, 262, 252, 144, 241, 139, 85, 2...",0,"[0.8433147126150244, 0.36594029226833047, 0.48...","[35969, 7218, 4254, 43307, 24474, 46468, 17340...","[0.8425804078540428, 0.4314946368378889, 0.217...",8
1,"[164, 131, 117, 294, 283, 207, 94, 8, 112, 232...","[0.09507837731997637, 0.5639704696082977, 0.67...","[163, 6506, 35424, 24708, 1004, 2914, 20464, 2...","[0.4388505546993573, 0.8208003416295997, 0.297...",70001,"[164, 131, 117, 294, 283, 207, 94, 8, 112, 232...",1,"[0.09507837731997637, 0.5639704696082977, 0.67...","[163, 6506, 35424, 24708, 1004, 2914, 20464, 2...","[0.4388505546993573, 0.8208003416295997, 0.297...",2
2,"[130, 323, 18, 328, 191, 224, 107, 44, 65, 63]","[0.2557564379821403, 0.37390035704678004, 0.07...","[40696, 31278, 1489, 16792, 34465, 17258, 4452...","[0.7795720554649077, 0.8084530571788069, 0.730...",70002,"[130, 323, 18, 328, 191, 224, 107, 44, 65, 63]",0,"[0.2557564379821403, 0.37390035704678004, 0.07...","[40696, 31278, 1489, 16792, 34465, 17258, 4452...","[0.7795720554649077, 0.8084530571788069, 0.730...",8


- We can save the workflow.

In [28]:
# Save the workflow under the path 'workflow_inference_test'.
# NOTE(review): `workflow` is whatever the pre-processing cell bound last -
# confirm it is the workflow needed at inference time.
workflow.save('workflow_inference_test')

### Export pre-processed data by day 

In [29]:
# requires cudf + cupy + nvtabular + dask_cudf
from transformers4rec.utils.processing_utils import save_time_based_splits
# Export the session dataframe under ./preproc_sessions_by_day_ts, partitioned
# by the 'day-first' column (later cells read <partition>/train.parquet and
# <partition>/valid.parquet from this directory).
# NOTE(review): `timestamp_col` is given the session id rather than a time
# column - presumably because the synthetic data has no timestamp; confirm.
save_time_based_splits(data=nvt.Dataset(sessions_gdf),
                       output_dir= "./preproc_sessions_by_day_ts",
                       partition_col='day-first',
                       timestamp_col='session_id', 
                      )

Creating time-based splits: 100%|██████████| 9/9 [00:03<00:00,  2.45it/s]


# Transformers4rec model  

In [30]:
import torch 
import transformers4rec.torch as torch4rec
from transformers4rec.torch import TabularSequenceFeatures, MLPBlock, SequentialBlock, Head, TransformerBlock

from transformers4rec.utils.schema import DatasetSchema
from transformers4rec.torch.head import NextItemPredictionTask
from transformers4rec.config import transformer
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt

- Manually set the schema 

In [31]:
# Load the dataset schema from the "schema.pb" file; the resulting object is
# passed to TabularSequenceFeatures.from_schema and to the Trainer.
# NOTE(review): "schema.pb" is a relative path and is not created anywhere in
# this notebook - it presumably must already exist in the working directory.
schema = DatasetSchema.from_schema("schema.pb")

### Define the sequential input module

Below we define our `input` block using the [`TabularSequenceFeatures` class](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/features/sequential.py). The `from_schema` method parses the schema directly and accepts categorical and continuous sequential inputs. It supports data augmentation, data aggregation (`sequential-concat` and `elementwise-sum` aggregations), the projection of the interaction embeddings, and the masking tasks.

`max_sequence_length` defines the maximum sequence length of our sequential input. If the `continuous_projection` argument is set, all numerical features are concatenated and projected by a number of MLP layers.

In [32]:
# Build the sequential input module from the schema.
# NOTE: this rebinds `inputs`, which up to this point held the dict of raw
# random columns; cells that need that dict must be run before this one.
# The literal 20 matches the list trimming length used during pre-processing.
inputs = TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=20,
        continuous_projection=64,
        d_output=100,
        masking="causal",
    )

### End-to-end session-based Transformer-based model for item prediction:

- LM task + HF Transformer architecture + Next item-prediction task. 
- We build a [T4RecConfig](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/config/transformer.py#L8) class to update the config class of the transformer architecture with the specified arguments, then load the related model. Here we use it to instantiate an XLNET model according to the  arguments (d_model, n_head, etc.), defining the model architecture.
- [TransformerBlock](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/block/transformer.py#L37) class is created to support HF Transformers for session-based and sequential-based recommendation models.
- [NextItemPredictionTask](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/head.py#L212) is the class to support next item prediction task.

In [33]:
#### Build the XLNet configuration, the model body, the head and the final model

# Hugging Face XLNet configuration, built through the T4Rec config helper.
transformer_config = transformer.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=20,
)

# Body: input features -> MLP projection -> transformer block.
# The blocks are created in the same order in which they are chained.
projection = torch4rec.MLPBlock([64])
transformer_block = torch4rec.TransformerBlock(
    transformer=transformer_config,
    masking=inputs.masking,
)
body = torch4rec.SequentialBlock(inputs, projection, transformer_block)

# Head: attaches the next-item prediction task to the body.
prediction_task = torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True)
head = torch4rec.Head(body, prediction_task, inputs=inputs)

# End-to-end model, created for the 'cuda' device.
model = torch4rec.Model(head, device='cuda')

# Training and evaluation

- **Set Training arguments**

In [34]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer

# Arguments for training and evaluation, one keyword per line.
# `output_dir="."` means checkpoints are written relative to the working
# directory, which is where later cells look for ./checkpoint-<step>.
train_args = T4RecTrainingArguments(
    local_rank=-1,
    dataloader_drop_last=True,
    data_loader_engine='nvtabular',
    report_to=[],
    debug=["r"],
    gradient_accumulation_steps=32,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=32,
    output_dir=".",
    use_legacy_prediction_loop=False,
    max_sequence_length=20,
)

- **Define paths to train and eval data**

In [35]:
import glob

# Collect the per-day train/valid parquet files written by the export step.
# `glob.glob` returns matches in arbitrary order, so the paths are sorted first;
# otherwise `[:-1]` would hold out an arbitrary day and the train and eval lists
# could end up in different orders. The sort is lexicographic on the path string.
train_data_paths = sorted(glob.glob("./preproc_sessions_by_day_ts/*/train.parquet"))[:-1]
eval_data_paths = sorted(glob.glob("./preproc_sessions_by_day_ts/*/valid.parquet"))[:-1]

- **Define Trainer**

In [36]:
# Instantiate the T4Rec Trainer, which manages training and evaluation
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
    train_dataset_or_path=train_data_paths,
    eval_dataset_or_path=eval_data_paths,
)

- **Train the model**  

In [37]:
# Reset the learning-rate scheduler, then run training on the trainer's
# configured training data. The returned TrainOutput is shown as the cell output.
trainer.reset_lr_scheduler()
trainer.train()

Step,Training Loss


TrainOutput(global_step=3, training_loss=5.827318827311198, metrics={'train_runtime': 6.673, 'train_samples_per_second': 0.45, 'total_flos': 0.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 131072, 'train_mem_gpu_alloc_delta': 42939904, 'train_mem_cpu_peaked_delta': 3162112, 'train_mem_gpu_peaked_delta': 7599885824})

- **Compute evaluation metrics**

In [38]:
# Evaluate on the validation files and print every metric, ordered by name.
eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths, metric_key_prefix='eval')
for key, value in sorted(eval_metrics.items()):
    print("  %s = %s" % (key, str(value)))

  epoch = 3.0
  eval_avgprecisionat_10 = 3.509510861476883e-05
  eval_avgprecisionat_20 = 5.472534030559473e-05
  eval_loss = 10.933452606201172
  eval_mem_cpu_alloc_delta = 12288
  eval_mem_cpu_peaked_delta = 2265088
  eval_mem_gpu_alloc_delta = 275456
  eval_mem_gpu_peaked_delta = 728570368
  eval_ndcgat_10 = 6.961845065234229e-05
  eval_ndcgat_20 = 0.00014665150956716388
  eval_recallat_10 = 0.00018951357924379408
  eval_recallat_20 = 0.0005053695640526712
  eval_runtime = 6.3765
  eval_samples_per_second = 145.534


- **Compute Train metrics**

In [39]:
# Run the evaluation loop on the training files (metrics are reported with the
# 'train' prefix) and print every metric, ordered by name.
train_metrics = trainer.evaluate(eval_dataset=train_data_paths, metric_key_prefix='train')
for key, value in sorted(train_metrics.items()):
    print("  %s = %s" % (key, str(value)))

  epoch = 3.0
  eval_mem_cpu_alloc_delta = 458752
  eval_mem_cpu_peaked_delta = 2932736
  eval_mem_gpu_alloc_delta = 1244160
  eval_mem_gpu_peaked_delta = 732026368
  train_avgprecisionat_10 = 1.2897899978270289e-05
  train_avgprecisionat_20 = 2.3556582164019346e-05
  train_loss = 10.931257247924805
  train_ndcgat_10 = 3.009509964613244e-05
  train_ndcgat_20 = 7.34374116291292e-05
  train_recallat_10 = 9.028529893839732e-05
  train_recallat_20 = 0.00027085590409114957
  train_runtime = 4.8739
  train_samples_per_second = 131.312


* **Save the model**

In [40]:
# Save the model and a checkpoint through the trainer.
# NOTE(review): this calls an underscore-prefixed (private) Trainer method;
# the reload cell expects the result under ./checkpoint-<global_step> - confirm.
trainer._save_model_and_checkpoint()

* **Reload model:**

In [41]:
# Reload the model and trainer states from the checkpoint directory whose name
# is derived from the trainer's current global step (./checkpoint-<global_step>).
trainer.load_model_trainer_states_from_checkpoint('./checkpoint-%s'%trainer.state.global_step)

- **Re-compute eval metrics of train data**

In [42]:
# After reloading the checkpoint, recompute the metrics on the training files
# and print them ordered by name, so they can be compared with the earlier run.
train_metrics = trainer.evaluate(eval_dataset=train_data_paths, metric_key_prefix='train')
for key, value in sorted(train_metrics.items()):
    print("  %s = %s" % (key, str(value)))

  epoch = 3.0
  eval_mem_cpu_alloc_delta = 765952
  eval_mem_cpu_peaked_delta = 2945024
  eval_mem_gpu_alloc_delta = 1365504
  eval_mem_gpu_peaked_delta = 732042752
  train_avgprecisionat_10 = 1.2897899978270289e-05
  train_avgprecisionat_20 = 2.3556582164019346e-05
  train_loss = 10.931257247924805
  train_ndcgat_10 = 3.009509964613244e-05
  train_ndcgat_20 = 7.34374116291292e-05
  train_recallat_10 = 9.028529893839732e-05
  train_recallat_20 = 0.00027085590409114957
  train_runtime = 5.7332
  train_samples_per_second = 111.63


* **Resume Training** 

In [43]:
# reset lr scheduler to train on new day data
trainer.reset_lr_scheduler()
# set new data from last day
trainer.train_dataset = train_data_paths[-1]
trainer.train(resume_from_checkpoint='./checkpoint-3')

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Step,Training Loss


TrainOutput(global_step=3, training_loss=0.0, metrics={'train_runtime': 0.4482, 'train_samples_per_second': 6.693, 'total_flos': 0, 'epoch': 3.0, 'train_mem_cpu_alloc_delta': 303104, 'train_mem_gpu_alloc_delta': 512, 'train_mem_cpu_peaked_delta': 3067904, 'train_mem_gpu_peaked_delta': 56577536})

- **Evaluate on last day**

In [44]:
# Evaluate on the last entry of `eval_data_paths` and print every metric,
# ordered by name.
eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths[-1], metric_key_prefix='eval')
for key, value in sorted(eval_metrics.items()):
    print("  %s = %s" % (key, str(value)))


  epoch = 3.0
  eval_avgprecisionat_10 = 0.0
  eval_avgprecisionat_20 = 3.010234831890557e-05
  eval_loss = 10.933640480041504
  eval_mem_cpu_alloc_delta = 0
  eval_mem_cpu_peaked_delta = 2207744
  eval_mem_gpu_alloc_delta = -231424
  eval_mem_gpu_peaked_delta = 717337088
  eval_ndcgat_10 = 0.0
  eval_ndcgat_20 = 0.00013706818572245538
  eval_recallat_10 = 0.0
  eval_recallat_20 = 0.0006020469591021538
  eval_runtime = 0.8499
  eval_samples_per_second = 112.953


### Incremental Training over a time window 

In [45]:
# Instantiate the T4Rec Trainer, which manages training and evaluation
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
)

In [50]:
# Incremental training: for each day d in [start, final) train on day d and
# evaluate on day d + 1.
start_time_window_index = 1
final_time_window_index = 7  # exclusive upper bound of the training days
for time_index in range(start_time_window_index, final_time_window_index):
    # Select the train file of the current day and the valid file of the next day
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = glob.glob(f"./preproc_sessions_by_day_ts/{time_index_train}/train.parquet")
    eval_paths = glob.glob(f"./preproc_sessions_by_day_ts/{time_index_eval}/valid.parquet")

    # Train on day related to time_index
    print('*'*20)
    print("Launch training for day %s are:" %time_index)
    print('*'*20 + '\n')
    trainer.train_dataset = train_paths
    trainer.reset_lr_scheduler()
    trainer.train()

    # Evaluate on the following day. The result is stored as `eval_metrics`
    # (not `train_metrics`) so that the train metrics computed in an earlier
    # cell are not overwritten with evaluation results.
    trainer.eval_dataset = eval_paths
    eval_metrics = trainer.evaluate(metric_key_prefix='eval')
    print('*'*20)
    print("Eval results for day %s are:\t" %time_index_eval)
    print('\n' + '*'*20 + '\n')
    for key in sorted(eval_metrics.keys()):
        print(" %s = %s" % (key, str(eval_metrics[key])))
    # Ask the trainer to release memory before the next time window.
    trainer.wipe_memory()

********************
Launch training for day 1 are:
********************



Step,Training Loss


********************
Eval results for day 2 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0003058103902731091
 eval_avgprecisionat_20 = 0.00040218696813099086
 eval_loss = 10.881290435791016
 eval_mem_cpu_alloc_delta = 0
 eval_mem_cpu_peaked_delta = 2203648
 eval_mem_gpu_alloc_delta = -80896
 eval_mem_gpu_peaked_delta = 694926336
 eval_ndcgat_10 = 0.0003858897543977946
 eval_ndcgat_20 = 0.0007094023167155683
 eval_recallat_10 = 0.0006116207805462182
 eval_recallat_20 = 0.0018348623998463154
 eval_runtime = 0.9769
 eval_samples_per_second = 98.27
********************
Launch training for day 2 are:
********************



Step,Training Loss


********************
Eval results for day 3 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 8.886857540346682e-05
 eval_loss = 10.90975570678711
 eval_mem_cpu_alloc_delta = 0
 eval_mem_cpu_peaked_delta = 2228224
 eval_mem_gpu_alloc_delta = -80896
 eval_mem_gpu_peaked_delta = 706768384
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0003137651947326958
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0012187690008431673
 eval_runtime = 1.0994
 eval_samples_per_second = 87.317
********************
Launch training for day 3 are:
********************



Step,Training Loss


********************
Eval results for day 4 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.90018367767334
 eval_mem_cpu_alloc_delta = 0
 eval_mem_cpu_peaked_delta = 2240512
 eval_mem_gpu_alloc_delta = -80896
 eval_mem_gpu_peaked_delta = 694238720
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.8968
 eval_samples_per_second = 107.052
********************
Launch training for day 4 are:
********************



Step,Training Loss


********************
Eval results for day 5 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.874518394470215
 eval_mem_cpu_alloc_delta = 0
 eval_mem_cpu_peaked_delta = 2207744
 eval_mem_gpu_alloc_delta = -80896
 eval_mem_gpu_peaked_delta = 698803200
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.9528
 eval_samples_per_second = 100.752
********************
Launch training for day 5 are:
********************



Step,Training Loss


********************
Eval results for day 6 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 4.256768443156034e-05
 eval_loss = 10.89849853515625
 eval_mem_cpu_alloc_delta = 0
 eval_mem_cpu_peaked_delta = 2199552
 eval_mem_gpu_alloc_delta = -80896
 eval_mem_gpu_peaked_delta = 725211136
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.00015253755555022508
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0005959475529380143
 eval_runtime = 0.9274
 eval_samples_per_second = 103.519
********************
Launch training for day 6 are:
********************



Step,Training Loss


********************
Eval results for day 7 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.00012040939327562228
 eval_avgprecisionat_20 = 0.00012040939327562228
 eval_loss = 10.92831802368164
 eval_mem_cpu_alloc_delta = 0
 eval_mem_cpu_peaked_delta = 2207744
 eval_mem_gpu_alloc_delta = -80896
 eval_mem_gpu_peaked_delta = 717337088
 eval_ndcgat_10 = 0.00023290354874916375
 eval_ndcgat_20 = 0.00023290354874916375
 eval_recallat_10 = 0.0006020469591021538
 eval_recallat_20 = 0.0006020469591021538
 eval_runtime = 0.9351
 eval_samples_per_second = 102.661
