In [1]:
# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="https://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_transformers4rec_getting-started-session-based-02-session-based-xlnet-with-pyt/nvidia_logo.png" style="width: 90px; float: right;">

# Session-based Recommendation with XLNET

### Imports required libraries

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import cudf
import glob
import torch 

from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory
from merlin.io import Dataset

  from .autonotebook import tqdm as notebook_tqdm
                not been set for this class (NDCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                
                not been set for this class (DCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
   

Transformers4Rec library relies on a schema object to automatically build all necessary layers to represent, normalize and aggregate input features. As you can see below, `schema.pb` is a protobuf file that contains metadata including statistics about features such as cardinality, min and max values and also tags features based on their characteristics and dtypes (e.g., categorical, continuous, list, integer).

### Set the schema object

We create the schema object by reading the `schema.pbtxt` file generated by NVTabular pipeline in the previous, `01-ETL-with-NVTabular`, notebook.

In [3]:
from merlin_standard_lib import Schema
# import merlin.io
# from merlin.models.utils import schema_utils
# from merlin.schema import Schema, Tags
# from merlin.schema.io.tensorflow_metadata import TensorflowMetadata
# from merlin.schema import Schema
SCHEMA_PATH = os.environ.get("INPUT_SCHEMA_PATH", "/workspace/data/processed_nvt/schema.pbtxt")
schema = Schema().from_proto_text(SCHEMA_PATH)

In [4]:
schema

[{'name': 'session_id', 'type': 'INT', 'int_domain': {'name': 'session_id', 'max': '19877', 'is_categorical': True}, 'annotation': {'tag': ['categorical'], 'comment': ['{"num_buckets": null, "freq_threshold": 0.0, "max_size": 0.0, "start_index": 1.0, "cat_path": ".//categories/unique.session_id.parquet", "embedding_sizes": {"cardinality": 19878.0, "dimension": 409.0}, "dtype_item_size": 64.0, "is_list": false, "is_ragged": false}']}}, {'name': 'day-first', 'type': 'INT', 'annotation': {'comment': ['{"dtype_item_size": 64.0, "is_list": false, "is_ragged": false}']}}, {'name': 'item_id-count', 'type': 'INT', 'int_domain': {'name': 'item_id', 'max': '506', 'is_categorical': True}, 'annotation': {'tag': ['categorical'], 'comment': ['{"num_buckets": null, "freq_threshold": 0.0, "max_size": 0.0, "start_index": 1.0, "cat_path": ".//categories/unique.item_id.parquet", "embedding_sizes": {"cardinality": 507.0, "dimension": 52.0}, "dtype_item_size": 32.0, "is_list": false, "is_ragged": false}']}

In [5]:
# # You can select a subset of features for training

# You can select a subset of features for training
schema = schema.select_by_name(['item_id-list', 
                                'category-list',
                                'weekday_sin-list',
                                'age_days-list'
                               ])

In [6]:
schema

[{'name': 'item_id-list', 'value_count': {'min': '2', 'max': '15'}, 'type': 'INT', 'int_domain': {'name': 'item_id', 'max': '506', 'is_categorical': True}, 'annotation': {'tag': ['list', 'item_id', 'item', 'id', 'categorical'], 'comment': ['{"num_buckets": null, "freq_threshold": 0.0, "max_size": 0.0, "start_index": 1.0, "cat_path": ".//categories/unique.item_id.parquet", "embedding_sizes": {"cardinality": 507.0, "dimension": 52.0}, "dtype_item_size": 64.0, "is_list": true, "is_ragged": true}']}}, {'name': 'category-list', 'value_count': {'min': '2', 'max': '15'}, 'type': 'INT', 'int_domain': {'name': 'category', 'max': '137', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical'], 'comment': ['{"num_buckets": null, "freq_threshold": 0.0, "max_size": 0.0, "start_index": 1.0, "cat_path": ".//categories/unique.category.parquet", "embedding_sizes": {"cardinality": 138.0, "dimension": 25.0}, "dtype_item_size": 64.0, "is_list": true, "is_ragged": true}']}}, {'name': 'age_day

### Define the sequential input module

In [7]:
inputs = tr.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=15,
        continuous_projection=64,
        d_output=100,
        masking="causal",
)

In [8]:
# Define XLNetConfig class and set default parameters for HF XLNet config  
transformer_config = tr.XLNetConfig.build(
    d_model=64, n_head=4, n_layer=2, total_seq_length=20
)
# Define the model block including: inputs, masking, projection and transformer block.
body = tr.SequentialBlock(
    inputs, tr.MLPBlock([64]), tr.TransformerBlock(transformer_config, masking=inputs.masking)
)

# Defines the evaluation top-N metrics and the cut-offs
metrics = [NDCGAt(top_ks=[20, 40], labels_onehot=True),  
           RecallAt(top_ks=[20, 40], labels_onehot=True)]

# Define a head related to next item prediction task 
head = tr.Head(
    body,
    tr.NextItemPredictionTask(weight_tying=True, metrics=metrics),
    inputs=inputs,
)

# Get the end-to-end Model class 
model = tr.Model(head)

                not been set for this class (NDCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                
                not been set for this class (DCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import c

Note that we can easily define an RNN-based model inside the `SequentialBlock` instead of a Transformer-based model. You can explore this [tutorial](https://github.com/NVIDIA-Merlin/Transformers4Rec/tree/main/examples/tutorial) for a GRU-based model example.

### Train the model 

We use the NVTabular PyTorch Dataloader for optimized loading of multiple features from input parquet files. You can learn more about this data loader [here](https://nvidia-merlin.github.io/NVTabular/main/training/pytorch.html).

### **Set Training arguments**

In [9]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer
# Set hyperparameters for training 
train_args = T4RecTrainingArguments(data_loader_engine='nvtabular', 
                                    dataloader_drop_last = True,
                                    gradient_accumulation_steps = 1,
                                    per_device_train_batch_size = 128, 
                                    per_device_eval_batch_size = 32,
                                    output_dir = "./tmp", 
                                    learning_rate=0.0005,
                                    lr_scheduler_type='cosine', 
                                    learning_rate_num_cosine_cycles_by_epoch=1.5,
                                    num_train_epochs=5,
                                    max_sequence_length=20, 
                                    report_to = [],
                                    logging_steps=50,
                                    no_cuda=False)

Note that we add an argument `data_loader_engine='nvtabular'` to automatically load the features needed for training using the schema. The default value is nvtabular for optimized GPU-based data-loading. Optionally a PyarrowDataLoader (pyarrow) can also be used as a basic option, but it is slower and works only for small datasets, as the full data is loaded to CPU memory.

## Daily Fine-Tuning: Training over a time window

Here we do daily fine-tuning meaning that we use the first day to train and second day to evaluate, then we use the second day data to train the model by resuming from the first step, and evaluate on the third day, so on so forth.

We have extended the HuggingFace transformers `Trainer` class (PyTorch only) to support evaluation of RecSys metrics. In this example, the evaluation of the session-based recommendation model is performed using traditional Top-N ranking metrics such as Normalized Discounted Cumulative Gain (NDCG@20) and Hit Rate (HR@20). NDCG accounts for rank of the relevant item in the recommendation list and is a more fine-grained metric than HR, which only verifies whether the relevant item is among the top-n items. HR@n is equivalent to Recall@n when there is only one relevant item in the recommendation list.

In [10]:
# Instantiate the T4Rec Trainer, which manages training and evaluation for the PyTorch API
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
)

- Define the output folder of the processed parquet files

In [11]:
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", f"{INPUT_DATA_DIR}/sessions_by_day")

In [12]:
%%time
start_time_window_index = 1
final_time_window_index = 3
#Iterating over days of one week
for time_index in range(start_time_window_index, final_time_window_index):
    # Set data 
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet"))
    eval_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet"))
    print(train_paths)
    
    # Train on day related to time_index 
    print('*'*20)
    print("Launch training for day %s are:" %time_index)
    print('*'*20 + '\n')
    trainer.train_dataset_or_path = train_paths
    trainer.reset_lr_scheduler()
    trainer.train()
    trainer.state.global_step +=1
    print('finished')
    
    # Evaluate on the following day
    trainer.eval_dataset_or_path = eval_paths
    train_metrics = trainer.evaluate(metric_key_prefix='eval')
    print('*'*20)
    print("Eval results for day %s are:\t" %time_index_eval)
    print('\n' + '*'*20 + '\n')
    for key in sorted(train_metrics.keys()):
        print(" %s = %s" % (key, str(train_metrics[key]))) 
    wipe_memory()

***** Running training *****
  Num examples = 1664
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 65


['/workspace/data/sessions_by_day/1/train.parquet']
********************
Launch training for day 1 are:
********************



Step,Training Loss
50,5.731




Training completed. Do not forget to share your model on huggingface.co/models =)




finished


********************
Eval results for day 2 are:	

********************

 eval_/loss = 4.892789840698242
 eval_/next-item/ndcg_at_20 = 0.2019212245941162
 eval_/next-item/ndcg_at_40 = 0.24986284971237183
 eval_/next-item/recall_at_20 = 0.5104166865348816
 eval_/next-item/recall_at_40 = 0.7447916865348816
 eval_runtime = 0.1608
 eval_samples_per_second = 1193.821
 eval_steps_per_second = 37.307
['/workspace/data/sessions_by_day/2/train.parquet']
********************
Launch training for day 2 are:
********************



***** Running training *****
  Num examples = 1664
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 65


Step,Training Loss
50,4.7952




Training completed. Do not forget to share your model on huggingface.co/models =)




finished
********************
Eval results for day 3 are:	

********************

 eval_/loss = 4.611485481262207
 eval_/next-item/ndcg_at_20 = 0.1681433618068695
 eval_/next-item/ndcg_at_40 = 0.22220981121063232
 eval_/next-item/recall_at_20 = 0.484375
 eval_/next-item/recall_at_40 = 0.7447916865348816
 eval_runtime = 0.1687
 eval_samples_per_second = 1138.188
 eval_steps_per_second = 35.568
CPU times: user 15.9 s, sys: 247 ms, total: 16.1 s
Wall time: 5.28 s


### Re-compute eval metrics of validation data

In [13]:
eval_data_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet"))

# set new data from day 7
eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths, metric_key_prefix='eval')
for key in sorted(eval_metrics.keys()):
    print("  %s = %s" % (key, str(eval_metrics[key])))

That's it!  
You have just trained your session-based recommendation model using Transformers4Rec.

In [14]:
model = model.cuda()
model.eval()

Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): TabularSequenceFeatures(
          (to_merge): ModuleDict(
            (continuous_module): SequentialBlock(
              (0): ContinuousFeatures(
                (filter_features): FilterFeatures()
                (_aggregation): ConcatFeatures()
              )
              (1): SequentialBlock(
                (0): DenseBlock(
                  (0): Linear(in_features=2, out_features=64, bias=True)
                  (1): ReLU(inplace=True)
                )
              )
              (2): AsTabular()
            )
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (item_id-list): Embedding(507, 64, padding_idx=0)
                (category-list): Embedding(138, 64, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
        

In [15]:
model.hf_format = False

In [16]:
model.input_schema



Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.int_domain.min,properties.int_domain.max
0,age_days-list,"(Tags.CONTINUOUS, Tags.LIST)",float32,True,False,0,0
1,weekday_sin-list,"(Tags.CONTINUOUS, Tags.LIST)",float32,True,False,0,0
2,item_id-list,"(Tags.ID, Tags.ITEM, Tags.CATEGORICAL, Tags.IT...",int64,True,False,0,506
3,category-list,"(Tags.CATEGORICAL, Tags.LIST)",int64,True,False,0,137


Create a dict of tensors

In [17]:
dataset = Dataset(train_paths[0])
trainer.train_dataset_or_path = dataset
loader = trainer.get_train_dataloader()
train_dict = next(iter(loader))

In [19]:
traced_model = torch.jit.trace(model, train_dict, strict=True)

In [20]:
assert isinstance(traced_model, torch.jit.TopLevelTracedModule)

In [21]:
assert torch.allclose(
    model(train_dict),
    traced_model(train_dict),
)

In [22]:
input_schema = model.input_schema
output_schema = model.output_schema

In [23]:
input_schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.int_domain.min,properties.int_domain.max
0,age_days-list,"(Tags.CONTINUOUS, Tags.LIST)",float32,True,False,0,0
1,weekday_sin-list,"(Tags.CONTINUOUS, Tags.LIST)",float32,True,False,0,0
2,item_id-list,"(Tags.ID, Tags.ITEM, Tags.CATEGORICAL, Tags.IT...",int64,True,False,0,506
3,category-list,"(Tags.CATEGORICAL, Tags.LIST)",int64,True,False,0,137


In [24]:
from merlin.core.dispatch import make_df  # noqa
from merlin.systems.dag import Ensemble  # noqa
from merlin.systems.dag.ops.pytorch import PredictPyTorch  # noqa
from merlin.systems.triton.utils import run_ensemble_on_tritonserver  # noqa

torch_op = input_schema.column_names >> PredictPyTorch(
    traced_model, input_schema, output_schema
)

ensemble = Ensemble(torch_op, input_schema)
ens_config, node_configs = ensemble.export(str('./models'))



Create a dataframe to send as a request. We need a dataset where the list columns are padded to the max sequence lenght that was set in the ETL pipeline.

In [66]:
dataset = Dataset(eval_paths[0])
# trainer.test_dataset_or_path = dataset
loader = trainer.get_test_dataloader(dataset)
test_dict = next(iter(loader))

df_cols = {}
for name, tensor in train_dict.items():
    if name in input_schema.column_names:
        dtype = input_schema[name].dtype

        df_cols[name] = tensor.cpu().numpy().astype(dtype)
        if len(tensor.shape) > 1:
            df_cols[name] = list(df_cols[name])

df = make_df(df_cols)
print(df.shape)
df.head()

(128, 4)


Unnamed: 0,age_days-list,weekday_sin-list,item_id-list,category-list
0,"[0.37403864, 0.42758772, 0.93743354, 0.0, 0.0,...","[0.9351001, 0.91299504, 0.9785595, 0.0, 0.0, 0...","[30, 24, 200, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[7, 6, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[0.9327483, 0.36575532, 0.13967341, 0.45479113...","[0.5046626, 0.17492707, 0.12539314, 0.6640924,...","[190, 10, 7, 55, 27, 3, 6, 184, 0, 0, 0, 0, 0,...","[36, 3, 2, 11, 6, 4, 2, 36, 0, 0, 0, 0, 0, 0, ..."
2,"[0.57168996, 0.48532194, 0.89944935, 0.2171675...","[0.93685514, 0.5638695, 0.76670134, 0.6797855,...","[153, 8, 46, 58, 21, 19, 31, 15, 4, 104, 0, 0,...","[28, 2, 10, 11, 5, 5, 7, 3, 2, 18, 0, 0, 0, 0,..."
3,"[0.8520663, 0.6690395, 0.92268515, 0.99163777,...","[0.58499664, 0.45736608, 0.88926136, 0.9139287...","[19, 28, 23, 34, 18, 10, 0, 0, 0, 0, 0, 0, 0, ...","[5, 6, 6, 7, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0.67542243, 0.65952307, 0.7467189, 0.6136317,...","[0.09077961, 0.7920753, 0.35881928, 0.8545563,...","[17, 27, 70, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[5, 6, 14, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [68]:
# ===========================================
# Send request to Triton and check response
# ===========================================
response = run_ensemble_on_tritonserver(
    './models', input_schema, df[input_schema.column_names], output_schema.column_names, "ensemble_model"
)

I1123 22:15:22.928458 2118 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f428e000000' with size 268435456
I1123 22:15:22.928899 2118 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I1123 22:15:22.931602 2118 model_lifecycle.cc:459] loading: 0_predictpytorch:1
I1123 22:15:23.299444 2118 libtorch.cc:1983] TRITONBACKEND_Initialize: pytorch
I1123 22:15:23.299463 2118 libtorch.cc:1993] Triton TRITONBACKEND API version: 1.10
I1123 22:15:23.299469 2118 libtorch.cc:1999] 'pytorch' TRITONBACKEND API version: 1.10
I1123 22:15:23.299488 2118 libtorch.cc:2032] TRITONBACKEND_ModelInitialize: 0_predictpytorch (version 1)
W1123 22:15:23.300039 2118 libtorch.cc:284] skipping model configuration auto-complete for '0_predictpytorch': not supported for pytorch backend
I1123 22:15:23.300768 2118 libtorch.cc:313] Optimized execution is enabled for model instance '0_predictpytorch'
I1123 22:15:23.300780 2118 libtorch.cc:332] Cache Cleaning is disab

Signal (2) received.


I1123 22:15:26.364164 2118 server.cc:262] Waiting for in-flight requests to complete.
I1123 22:15:26.364179 2118 server.cc:278] Timeout 30: Found 0 model versions that have in-flight inferences
I1123 22:15:26.364255 2118 server.cc:293] All models are stopped, unloading models
I1123 22:15:26.364263 2118 server.cc:300] Timeout 30: Found 2 live models and 0 in-flight non-inference requests
I1123 22:15:26.364287 2118 model_lifecycle.cc:578] successfully unloaded 'ensemble_model' version 1
I1123 22:15:26.364592 2118 libtorch.cc:2110] TRITONBACKEND_ModelInstanceFinalize: delete instance state
I1123 22:15:26.372137 2118 libtorch.cc:2055] TRITONBACKEND_ModelFinalize: delete model state
I1123 22:15:26.372333 2118 model_lifecycle.cc:578] successfully unloaded '0_predictpytorch' version 1
I1123 22:15:27.364444 2118 server.cc:300] Timeout 29: Found 0 live models and 0 in-flight non-inference requests


In [70]:
response

{'next-item': array([[-7.9947233, -8.747803 , -3.4068425, ..., -7.914096 , -8.571636 ,
         -7.815837 ],
        [-7.977386 , -8.727583 , -3.3388414, ..., -7.9091067, -8.508778 ,
         -7.7708635],
        [-8.000487 , -8.737406 , -3.4030848, ..., -7.921053 , -8.557445 ,
         -7.798526 ],
        ...,
        [-7.998163 , -8.739789 , -3.3824148, ..., -7.9103565, -8.550226 ,
         -7.8002963],
        [-7.9968286, -8.753717 , -3.3801503, ..., -7.9066863, -8.55961  ,
         -7.794828 ],
        [-8.01243  , -8.753323 , -3.3656597, ..., -7.8982997, -8.546498 ,
         -7.7921886]], dtype=float32)}

We return a response for each request in the df. Each row in the `response['next-item']` array corresponds to the logit values per item in the catalog and for the OOV item.

In [69]:
response['next-item'].shape

(128, 507)