In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Session-based Recommendation with XLNET

In this notebook, we build a session-based recommendation model with XLNet, then train and evaluate it using the NVTabular PyTorch dataloader.

In [2]:
import os
import torch 
import numpy
import pandas as pd 
import cudf
import cupy
import nvtabular as nvt

## Create Synthetic Input Data

In [3]:
# Seed NumPy's global RNG so the synthetic dataset -- and every split, model
# and metric derived from it -- is identical on each Restart & Run All.
SEED = 42
numpy.random.seed(SEED)

NUM_ROWS = 100000
# One row per interaction. `randint` excludes its upper bound, so `day` takes
# the values 1..9 and `purchase` is 0 or 1.
inputs = {
    'session_id': numpy.random.randint(70000, 80000, NUM_ROWS),
    'day': numpy.random.randint(1, 10, NUM_ROWS),
    'item_id': numpy.random.randint(1, 51996, NUM_ROWS),
    'category': numpy.random.randint(0, 332, NUM_ROWS),
    'timestamp/age_days': numpy.random.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin' : numpy.random.uniform(0, 1, NUM_ROWS),
    'purchase': numpy.random.randint(0, 2, NUM_ROWS)
    }
# Build the cudf DataFrame that the NVTabular workflow consumes.
random_data = cudf.DataFrame(inputs)
random_data.head()

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin,purchase
0,70658,2,12541,27,0.738148,0.024413,0
1,71982,2,27523,299,0.274989,0.435644,1
2,75009,9,30666,46,0.339352,0.164251,1
3,70424,3,17118,219,0.778682,0.985127,1
4,78554,8,32957,25,0.753821,0.736395,0


## Feature Engineering with NVTabular Workflow

In [4]:
# Define Groupby Workflow
# Columns fed into the session-level aggregation.
groupby_feats = ['session_id', 'day', 'item_id', 'category', 'timestamp/age_days', 'timestamp/weekday/sin', 'purchase']

# Aggregate per session_id: item_id, category and the two timestamp features
# are collected into lists (item_id is also counted), while day and purchase
# keep their first value. Output names are "<column>-<agg>", e.g. "item_id-count".
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    aggs={
        "item_id": ["list", "count"],
        "category": ["list"],     
        "day": ["first"],
        "purchase": ["first"],
        "timestamp/age_days": ["list"],
        'timestamp/weekday/sin': ["list"],
        },
    name_sep="-")

# Trim sessions to first 20 items 
# Names of the aggregated columns that are NOT list columns.
groupby_features_nonlist = [x for x in groupby_features.output_columns.names if '-list' not in x]

# Take only the list columns, slice each with ListSlice(0,20) and append the
# "_trim" postfix to their names (e.g. "item_id-list_trim").
groupby_features_trim = ((groupby_features - groupby_features_nonlist)) >> nvt.ops.ListSlice(0,20) >> nvt.ops.Rename(postfix = '_trim')

# Read by the Filter lambda below when it is called, so defining it here
# (after the groupby, before the filter is applied) is sufficient.
MINIMUM_SESSION_LENGTH = 2

# Scalar session columns plus the trimmed list columns.
selected_features = groupby_features[groupby_features_nonlist] + groupby_features_trim

# Keep only sessions whose item count is at least MINIMUM_SESSION_LENGTH.
filtered_sessions = (selected_features) >> nvt.ops.Filter(f=lambda df: df["item_id-count"] >= MINIMUM_SESSION_LENGTH)

# Fit the workflow on the synthetic data, then transform that same data and
# compute the resulting dask dataframe into a single in-memory frame.
# (presumably a cudf DataFrame, given cpu=False -- confirm)
workflow = nvt.Workflow(filtered_sessions)
dataset = nvt.Dataset(random_data, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()



In [5]:
# Preview the first three session rows produced by the workflow.
sessions_gdf.head(3)

Unnamed: 0,day-first,purchase-first,item_id-count,session_id,item_id-list_trim,timestamp/weekday/sin-list_trim,timestamp/age_days-list_trim,category-list_trim
0,3,0,6,70000,"[42968, 20258, 9094, 21342, 25064, 12432]","[0.8168488520400191, 0.6686634528158085, 0.326...","[0.4865734630505416, 0.7622569743055907, 0.029...","[292, 22, 252, 0, 202, 266]"
1,1,0,7,70001,"[10364, 21399, 8586, 13012, 24951, 4886, 7856]","[0.20090628414166323, 0.06770251893493051, 0.6...","[0.6204740594272689, 0.7052139557453468, 0.903...","[42, 326, 251, 58, 185, 74, 150]"
2,4,1,12,70002,"[11019, 1448, 42300, 10052, 24214, 34642, 2686...","[0.9203113024778837, 0.8309483352776478, 0.237...","[0.30113824260730027, 0.05405530256120705, 0.6...","[62, 41, 31, 247, 232, 271, 298, 219, 210, 205..."


- We can save the workflow.

In [6]:
# Save the fitted workflow under the path 'workflow_etl'.
workflow.save('workflow_etl')

### Export pre-processed data by day 

In [7]:
# requires cudf + cupy + nvtabular + dask_cudf
# Write the session data under ./preproc_sessions_by_day, partitioned on the
# 'day-first' column. Later cells read <output_dir>/<day>/train.parquet and
# <output_dir>/<day>/valid.parquet from this directory.
# NOTE(review): 'session_id' is passed as the timestamp column; the synthetic
# data has no raw timestamp, so this is presumably a stand-in -- confirm.
from transformers4rec.utils.gpu_preprocessing import save_time_based_splits
save_time_based_splits(data=nvt.Dataset(sessions_gdf),
                       output_dir= "./preproc_sessions_by_day",
                       partition_col='day-first',
                       timestamp_col='session_id', 
                      )

Creating time-based splits: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 10.54it/s]


## Build a DL model with Transformers4Rec library  

- import required libraries

In [8]:
import torch 
import transformers4rec.torch as torch4rec
from transformers4rec.torch import TabularSequenceFeatures, MLPBlock, SequentialBlock, Head, TransformerBlock

from transformers4rec.utils.schema import DatasetSchema
from transformers4rec.torch.model.head import NextItemPredictionTask
from transformers4rec.config import transformer
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt

- Manually set the schema 

In [9]:
# Load the schema object that is passed to TabularSequenceFeatures.from_schema
# and to the Trainer in later cells.
# NOTE(review): no cell visible in this notebook writes "schema.pb"; it
# apparently has to exist in the working directory already -- confirm.
schema = DatasetSchema.from_proto("schema.pb")

### Define the sequential input module

Below we define our `input` block using the [`TabularSequenceFeatures` class](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/features/sequential.py). Its `from_schema` method parses the schema directly and accepts categorical and continuous sequential inputs. It supports data augmentation, feature aggregation (`sequential-concat` and `elementwise-sum`), projection of the interaction embeddings, and the masking tasks.

`max_sequence_length` defines the maximum sequence length of our sequential input. If the `continuous_projection` argument is set, all numerical features are concatenated and projected by a number of MLP layers.

In [10]:
# Build the sequential input module from the schema, with causal masking.
# max_sequence_length=20 matches the ListSlice(0,20) trim applied in the ETL
# workflow and the sequence lengths given to the transformer config and the
# training arguments in later cells.
# Note: this rebinds `inputs`, which previously held the synthetic-data dict.
inputs = TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=20,
        continuous_projection=64,
        d_output=100,
        masking="causal",
    )
# The masking module's device is hard-coded to 'cuda'.
inputs.masking.device = 'cuda'

- LM task + HF Transformer architecture + Next item-prediction task. 
- We build a [T4RecConfig](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/config/transformer.py#L8) class to update the config class of the transformer architecture with the specified arguments, then load the related model. Here we use it to instantiate an XLNET model according to the  arguments (d_model, n_head, etc.), defining the model architecture.
- [TransformerBlock](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/block/transformer.py#L37) class is created to support HF Transformers for session-based and sequential-based recommendation models.
- [NextItemPredictionTask](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/head.py#L212) is the class to support next item prediction task.

In [11]:
# XLNet hyper-parameters, expressed through the T4Rec config builder.
transformer_config = transformer.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=20,
)

# Body of the model: input features -> MLP projection -> transformer block.
# The transformer block shares the masking module created with the inputs.
mlp_projection = torch4rec.MLPBlock([64])
xlnet_block = torch4rec.TransformerBlock(transformer_config, masking=inputs.masking)
body = torch4rec.SequentialBlock(inputs, mlp_projection, xlnet_block)

# Head: attach the next-item prediction task on top of the body.
next_item_task = torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True)
head = torch4rec.Head(body, next_item_task, inputs=inputs)

# Wrap the head into the end-to-end Model class.
model = torch4rec.Model(head)

### Train the model 

- **Set Training arguments**

In [12]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer

# Set arguments for training; one keyword per line so each can be tuned easily.
train_args = T4RecTrainingArguments(
    local_rank=-1,
    dataloader_drop_last=True,
    data_loader_engine='nvtabular',
    report_to=[],
    debug=["r"],
    gradient_accumulation_steps=32,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=32,
    output_dir=".",
    lr_scheduler_type='cosine',
    learning_rate_num_cosine_cycles_by_epoch=1.5,
    max_sequence_length=20,
    fp16=False,
    no_cuda=False,
)

- **Define paths to train and eval data**

In [13]:
import glob


def sorted_day_paths(pattern):
    """Return the files matching ``pattern`` ordered by their day directory.

    ``glob.glob`` makes no ordering guarantee, so slicing its raw result with
    ``[:-1]`` or ``[-1]`` selects an arbitrary day. Each path is expected to
    look like ``<root>/<day>/<file>``; purely numeric day directories are
    ordered numerically (so ``10`` follows ``9``), and any other directory
    names are placed after them in lexicographic order.

    Args:
        pattern: glob pattern, e.g. ``"./preproc_sessions_by_day/*/train.parquet"``.

    Returns:
        A list of matching paths sorted by day; empty if nothing matches.
    """
    def day_key(path):
        day = os.path.basename(os.path.dirname(path))
        return (0, int(day), day) if day.isdecimal() else (1, 0, day)

    return sorted(glob.glob(pattern), key=day_key)


# Hold out the final day: with the paths ordered chronologically, [:-1] keeps
# every day except the last one.
train_data_paths = sorted_day_paths("./preproc_sessions_by_day/*/train.parquet")[:-1]
eval_data_paths = sorted_day_paths("./preproc_sessions_by_day/*/valid.parquet")[:-1]

- **Define Trainer**

In [14]:
# Instantiate the T4Rec Trainer, which manages training and evaluation
# It is given the model, the training arguments, the schema, and the train /
# eval parquet path lists built in the previous cell; compute_metrics=True
# requests metric computation (the evaluate() calls below print NDCG, recall
# and average-precision values).
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
    train_dataset_or_path=train_data_paths,
    eval_dataset_or_path=eval_data_paths,
)

- **Train the model**  

In [15]:
# Reset the learning-rate scheduler, then train on the configured train paths.
trainer.reset_lr_scheduler()
trainer.train()

***** Running training *****
  Num examples = 8704
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3, training_loss=11.425968170166016, metrics={'train_runtime': 3.9482, 'train_samples_per_second': 6.079, 'train_steps_per_second': 0.76, 'total_flos': 0.0, 'train_loss': 11.425968170166016, 'epoch': 2.94})

- **Compute evaluation metrics**

In [16]:
# Score the model on the validation files and list every metric alphabetically.
eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths, metric_key_prefix='eval')
for metric_name, metric_value in sorted(eval_metrics.items()):
    print("  %s = %s" % (metric_name, str(metric_value)))

  epoch = 2.94
  eval_avgprecisionat_10 = 0.0
  eval_avgprecisionat_20 = 6.760639735148288e-06
  eval_loss = 10.928376197814941
  eval_ndcgat_10 = 0.0
  eval_ndcgat_20 = 2.9721029932261445e-05
  eval_recallat_10 = 0.0
  eval_recallat_20 = 0.00012845215678680688
  eval_runtime = 1.391
  eval_samples_per_second = 621.143
  eval_steps_per_second = 5.032


- **Compute Train metrics**

In [17]:
# Run the same evaluation on the training files, reporting under the 'train' prefix.
train_metrics = trainer.evaluate(eval_dataset=train_data_paths, metric_key_prefix='train')
for metric_name, metric_value in sorted(train_metrics.items()):
    print("  %s = %s" % (metric_name, str(metric_value)))

  epoch = 2.94
  train_avgprecisionat_10 = 0.0
  train_avgprecisionat_20 = 9.212599252350628e-06
  train_loss = 10.931198120117188
  train_ndcgat_10 = 0.0
  train_ndcgat_20 = 4.050030111102387e-05
  train_recallat_10 = 0.0
  train_recallat_20 = 0.00017503938579466194
  train_runtime = 1.0375
  train_samples_per_second = 616.867
  train_steps_per_second = 4.819


* **Save the model**

In [18]:
# Save the model and a checkpoint; the recorded run wrote to ./checkpoint-3,
# i.e. a directory named after the current global step.
# NOTE(review): this is a leading-underscore (private) Trainer method, so it
# may change between library versions -- confirm a public equivalent exists.
trainer._save_model_and_checkpoint(save_model_class=True)

Saving model checkpoint to ./checkpoint-3
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


* **Reload model:**

In [19]:
# Reload model and trainer state from the checkpoint directory named after the
# trainer's current global step (./checkpoint-<global_step>).
trainer.load_model_trainer_states_from_checkpoint('./checkpoint-%s'%trainer.state.global_step)

- **Re-compute eval metrics of train data**

In [20]:
# After reloading the checkpoint, recompute the train-set metrics; they should
# match the values printed before the save/reload round trip.
train_metrics = trainer.evaluate(eval_dataset=train_data_paths, metric_key_prefix='train')
for metric_name, metric_value in sorted(train_metrics.items()):
    print("  %s = %s" % (metric_name, str(metric_value)))

  epoch = 2.94
  train_avgprecisionat_10 = 0.0
  train_avgprecisionat_20 = 9.212599252350628e-06
  train_loss = 10.931198120117188
  train_ndcgat_10 = 0.0
  train_ndcgat_20 = 4.050030111102387e-05
  train_recallat_10 = 0.0
  train_recallat_20 = 0.00017503938579466194
  train_runtime = 1.1741
  train_samples_per_second = 545.121
  train_steps_per_second = 4.259


* **Resume Training** 

In [21]:
# reset lr scheduler to train on new day data
trainer.reset_lr_scheduler()
# Point the trainer at a single file: the last entry of train_data_paths.
# NOTE(review): train_data_paths was built with [:-1], so its last entry is
# not the final file returned by glob; whether it is really the "last day"
# also depends on glob's (unspecified) ordering -- confirm.
trainer.train_dataset = train_data_paths[-1]
# Resume from the checkpoint directory named after the current global step.
# NOTE(review): the recorded output for this cell reports "Will skip the first
# 3 epochs" and training_loss=0.0 with global_step still 3, which suggests no
# new optimization steps were run on resume -- confirm this is intended.
trainer.train(resume_from_checkpoint='./checkpoint-%s'%trainer.state.global_step)

Loading model from ./checkpoint-3).
***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 3
  Continuing training from global step 3
  Will skip the first 3 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…



Training completed. Do not forget to share your model on huggingface.co/models =)




Step,Training Loss


TrainOutput(global_step=3, training_loss=0.0, metrics={'train_runtime': 0.183, 'train_samples_per_second': 671.957, 'train_steps_per_second': 16.389, 'total_flos': 0.0, 'train_loss': 0.0, 'epoch': 2.94})

- **Evaluate on last day**

In [22]:
# Evaluate on a single validation file: the final entry of eval_data_paths.
last_eval_path = eval_data_paths[-1]
eval_metrics = trainer.evaluate(eval_dataset=last_eval_path, metric_key_prefix='eval')
for metric_name, metric_value in sorted(eval_metrics.items()):
    print("  %s = %s" % (metric_name, str(metric_value)))

  epoch = 2.94
  eval_avgprecisionat_10 = 0.0
  eval_avgprecisionat_20 = 6.760639735148288e-06
  eval_loss = 10.928376197814941
  eval_ndcgat_10 = 0.0
  eval_ndcgat_20 = 2.9721029932261445e-05
  eval_recallat_10 = 0.0
  eval_recallat_20 = 0.00012845215678680688
  eval_runtime = 2.0212
  eval_samples_per_second = 427.467
  eval_steps_per_second = 3.463


# Daily Fine-Tuning: Training over a time window

Here we do daily fine-tuning: we train on the first day and evaluate on the second day, then continue training the same model on the second day's data and evaluate on the third day, and so on.

In [23]:
# Instantiate the T4Rec Trainer, which manages training and evaluation
# No train/eval paths are passed here; the daily fine-tuning loop in the next
# cell assigns trainer.train_dataset and trainer.eval_dataset per day.
# Note: this reuses the same `model` object that was trained in the cells above.
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
)

In [24]:
# Daily fine-tuning: for each day N in [start, final), train on day N and
# evaluate on day N + 1, carrying the same model forward from day to day.
start_time_window_index = 1
final_time_window_index = 7
for time_index in range(start_time_window_index, final_time_window_index):
    # Set data
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = glob.glob(f"./preproc_sessions_by_day/{time_index_train}/train.parquet")
    eval_paths = glob.glob(f"./preproc_sessions_by_day/{time_index_eval}/valid.parquet")

    # Train on day related to time_index
    print('*'*20)
    print("Launch training for day %s:" % time_index_train)
    print('*'*20 + '\n')
    trainer.train_dataset = train_paths
    trainer.reset_lr_scheduler()
    trainer.train()
    # NOTE(review): manual bump of the global step after each day's training;
    # the reason is not evident from this cell -- confirm it is still needed.
    trainer.state.global_step += 1

    # Evaluate on the following day. The result gets its own name so it does
    # not overwrite the `train_metrics` / `eval_metrics` computed in earlier cells.
    trainer.eval_dataset = eval_paths
    daily_eval_metrics = trainer.evaluate(metric_key_prefix='eval')
    print('*'*20)
    print("Eval results for day %s are:\t" % time_index_eval)
    print('\n' + '*'*20 + '\n')
    for key in sorted(daily_eval_metrics.keys()):
        print(" %s = %s" % (key, str(daily_eval_metrics[key])))
    trainer.wipe_memory()

***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


********************
Launch training for day 1 are:
********************



Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


********************
Eval results for day 2 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.92973804473877
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.2179
 eval_samples_per_second = 440.569
 eval_steps_per_second = 4.589

********************
Launch training for day 2 are:
********************



Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


********************
Eval results for day 3 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.908461570739746
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.2072
 eval_samples_per_second = 463.364
 eval_steps_per_second = 4.827
********************
Launch training for day 3 are:
********************



Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


********************
Eval results for day 4 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.920406341552734
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.2059
 eval_samples_per_second = 466.183
 eval_steps_per_second = 4.856
********************
Launch training for day 4 are:
********************



Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


********************
Eval results for day 5 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.95324420928955
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.2007
 eval_samples_per_second = 478.245
 eval_steps_per_second = 4.982
********************
Launch training for day 5 are:
********************



Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 32768
  Gradient Accumulation steps = 32
  Total optimization steps = 3


********************
Eval results for day 6 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.00011185682524228469
 eval_avgprecisionat_20 = 0.00011185682524228469
 eval_loss = 10.917628288269043
 eval_ndcgat_10 = 0.00032333872513845563
 eval_ndcgat_20 = 0.00032333872513845563
 eval_recallat_10 = 0.0011185682378709316
 eval_recallat_20 = 0.0011185682378709316
 eval_runtime = 0.2146
 eval_samples_per_second = 447.411
 eval_steps_per_second = 4.661
********************
Launch training for day 6 are:
********************



Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




********************
Eval results for day 7 are:	

********************

 epoch = 3.0
 eval_avgprecisionat_10 = 0.0
 eval_avgprecisionat_20 = 0.0
 eval_loss = 10.940314292907715
 eval_ndcgat_10 = 0.0
 eval_ndcgat_20 = 0.0
 eval_recallat_10 = 0.0
 eval_recallat_20 = 0.0
 eval_runtime = 0.2051
 eval_samples_per_second = 468.02
 eval_steps_per_second = 4.875
