In [1]:
import os
import torch 
import numpy
import pandas as pd 
import cudf
import cupy
import nvtabular as nvt

### Create random input data similar to pre-processed Yoochoose dataset structure 

In [2]:
NUM_ROWS = 200000
session_length = 20
inputs = {
    'session_id': numpy.random.randint(70000, 80000, NUM_ROWS),
    'day': numpy.random.randint(1, 10, NUM_ROWS),
    'item_id': numpy.random.randint(1, 51996, NUM_ROWS),
    'category': numpy.random.randint(0, 332, NUM_ROWS),
    'timestamp/age_days': numpy.random.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin' : numpy.random.uniform(0, 1, NUM_ROWS),
    'purchase': numpy.random.randint(0, 2, NUM_ROWS)
    }
random_data = cudf.DataFrame(inputs)
random_data

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin,purchase
0,72551,9,19127,106,0.228705,0.718865,0
1,72085,4,41292,208,0.080211,0.052820,0
2,77017,9,49099,91,0.539491,0.717076,0
3,76445,1,41223,81,0.923112,0.421609,0
4,76360,5,25711,172,0.827265,0.188140,0
...,...,...,...,...,...,...,...
199995,77441,3,16487,63,0.130175,0.343054,1
199996,71514,6,35285,252,0.394108,0.773249,1
199997,75334,8,44853,90,0.908994,0.452033,0
199998,74428,4,10683,266,0.506767,0.612449,1


### NVTabular workflow

- #TODO : Change the workflow using tagging API once it is finalized  

In [3]:
# Define Groupby Workflow
groupby_features = list(inputs.keys()) >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    aggs={
        "item_id": ["list"],
        "category": ["list"],     
        "day": ["first"],
        "purchase": ["first"],
        "timestamp/age_days": ["list"],
        'timestamp/weekday/sin': ["list"],
        },
    name_sep="-")
# Trim sessions to first 20 items 
groupby_features_list = [x for x in groupby_features.output_columns.names if '-list' in x]

#groupby_features_nonlist:  need to fix a BUG related to adding two workflow nodes
#groupby_features_trim = groupby_features_list >> nvt.ops.ListSlice(0,20) >> nvt.ops.Rename(postfix = '_trim')
workflow = nvt.Workflow(groupby_features)
dataset = nvt.Dataset(random_data, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()

# Re-compute tri: to be removed when the BUG of two workflow nodes is fixed : 
groupby_features_trim =  groupby_features_list >> nvt.ops.ListSlice(0,20) >> nvt.ops.Rename(postfix = '_trim')
workflow = nvt.Workflow(list(sessions_gdf.columns)  + groupby_features_trim)
dataset = nvt.Dataset(sessions_gdf, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()

In [4]:
sessions_gdf.head(3)

Unnamed: 0,timestamp/age_days-list_trim,item_id-list_trim,category-list_trim,timestamp/weekday/sin-list_trim,timestamp/age_days-list,item_id-list,category-list,timestamp/weekday/sin-list,day-first,session_id,purchase-first
0,"[0.4196480297033909, 0.047959138751666686, 0.9...","[10056, 17344, 28903, 32735, 27977, 47981, 297...","[4, 133, 104, 241, 69, 183, 116, 102, 155, 31,...","[0.6764826947365993, 0.50910225618793, 0.72312...","[0.4196480297033909, 0.047959138751666686, 0.9...","[10056, 17344, 28903, 32735, 27977, 47981, 297...","[4, 133, 104, 241, 69, 183, 116, 102, 155, 31,...","[0.6764826947365993, 0.50910225618793, 0.72312...",4,70000,0
1,"[0.6019363898760877, 0.6859739107134702, 0.417...","[44601, 37626, 18035, 32663, 44133, 33738, 450...","[107, 152, 268, 237, 38, 186, 11, 214, 231, 21...","[0.3839594036335826, 0.8950368033691941, 0.866...","[0.6019363898760877, 0.6859739107134702, 0.417...","[44601, 37626, 18035, 32663, 44133, 33738, 450...","[107, 152, 268, 237, 38, 186, 11, 214, 231, 21...","[0.3839594036335826, 0.8950368033691941, 0.866...",4,70001,0
2,"[0.6770391070589014, 0.8220821243093939, 0.288...","[50401, 4531, 36090, 14544, 48050, 2897, 40726...","[123, 103, 268, 97, 78, 241, 283, 312, 42, 193...","[0.20149853173406074, 0.08520662535055312, 0.5...","[0.6770391070589014, 0.8220821243093939, 0.288...","[50401, 4531, 36090, 14544, 48050, 2897, 40726...","[123, 103, 268, 97, 78, 241, 283, 312, 42, 193...","[0.20149853173406074, 0.08520662535055312, 0.5...",4,70002,1


- We can save the workflow.

In [5]:
workflow.save('workflow_inference_test')

### Export pre-processed data by day 

In [6]:
# Convert to a Dataset and write out hive-partitioned data to disk
nvt_output_path_tmp ='./output_nvt_tmp/'
PARTITION_COL = 'day-first'
nvt.Dataset(sessions_gdf).to_parquet(nvt_output_path_tmp, partition_on=[PARTITION_COL])

In [7]:
OUTPUT_FOLDER = "./preproc_sessions_by_day_ts/"
!mkdir -p $OUTPUT_FOLDER
days_folders = [f for f in sorted(os.listdir(nvt_output_path_tmp)) if f.startswith(PARTITION_COL)]
for day_folder in days_folders:
    df = cudf.read_parquet(os.path.join(nvt_output_path_tmp, day_folder))
    out_folder = os.path.join(OUTPUT_FOLDER, day_folder.replace('day-first=', ''))
    os.makedirs(out_folder, exist_ok=True)
    df.to_parquet(os.path.join(out_folder, 'train.parquet'))
    
    random_values = cupy.random.rand(len(df))
    
    #Extracts 10% for valid and test set. Those sessions are also in the train set, but as evaluation
    #happens only for the subsequent day of training, that is not an issue, and we can keep the train set larger.
    valid_set = df[random_values <= 0.10]
    valid_set.to_parquet(os.path.join(out_folder, 'valid.parquet'))
    
    test_set = df[random_values >= 0.90]
    test_set.to_parquet(os.path.join(out_folder, 'test.parquet'))

# Transformers4rec model  

In [8]:
import torch 
import transformers4rec.torch as torch4rec
from transformers4rec.torch import TabularSequenceFeatures, MLPBlock, SequentialBlock, Head, TransformerBlock

from transformers4rec.utils.schema import DatasetSchema
from transformers4rec.torch.head import NextItemPredictionTask
from transformers4rec.config import transformer
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt

- Manually set the schema 

In [9]:
# Define schema object to pass it to the SequentialTabularFeatures
schema = DatasetSchema.from_schema("schema.pb")

### Define the sequential input module

Below we define our `input` bloc using [`SequentialTabularFeatures` class](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/features/sequential.py). The `from_schema` module directly parse schema and accepts categorical and continuous sequential inputs and supports data augmentation, data aggregation, `sequential-concat` and `elementwise-sum` aggregations, the projection of the interaction embeddings and the masking tasks.

`max_sequence_length` defines the maximum sequence length of our sequential input, and if `continuous_projection` argument is set,  all numerical features are concatenated and projected by a number of MLP layers.

In [10]:
inputs = TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=20,
        continuous_projection=64,
        d_output=100,
        masking="causal",
    )

### End-to-end session-based Transformer-based model for item prediction:

- LM task + HF Transformer architecture + Next item-prediction task. 
- We build a [T4RecConfig](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/config/transformer.py#L8) class to update the config class of the transformer architecture with the specified arguments, then load the related model. Here we use it to instantiate an XLNET model according to the  arguments (d_model, n_head, etc.), defining the model architecture.
- [TransformerBlock](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/block/transformer.py#L37) class is created to support HF Transformers for session-based and sequential-based recommendation models.
- [NextItemPredictionTask](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/head.py#L212) is the class to support next item prediction task.

In [11]:
#### Define XLNetConfig class and set default parameters 

# Set HF config of XLNet 
transformer_config = transformer.XLNetConfig.build(
    d_model=64, n_head=4, n_layer=2, total_seq_length=20
)
# Define the model block including: inputs, masking, projection and transformer block.
body = torch4rec.SequentialBlock(
    inputs, torch4rec.MLPBlock([64]), torch4rec.TransformerBlock(transformer=transformer_config, masking=inputs.masking)
)

# Define the head related to next item prediction task 
head = torch4rec.Head(
    body,
    torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True),
    inputs=inputs,
    
)

# Get the end-to-end Model class 
model = torch4rec.Model(head, device='cuda')

# Training and evaluation

- **Set Training arguments**

In [13]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer
#Set argumentd for training 
train_args = T4RecTrainingArguments(local_rank = -1, dataloader_drop_last = True, data_loader_engine='nvtabular',
                                  report_to = [], debug = ["r"], gradient_accumulation_steps = 32,
                                  per_device_train_batch_size = 512, per_device_eval_batch_size = 32,
                                  output_dir = ".", use_legacy_prediction_loop = False,
                                  max_sequence_length=20)

- **Define paths to train and eval data**

In [14]:
import glob
from transformers4rec.torch.utils.data_utils import PyarrowDataLoaderBuilder, NVTDataLoaderBuilder
train_data_paths = glob.glob("./preproc_sessions_by_day_ts/*/train.parquet")[:-1]
eval_data_paths = glob.glob("./preproc_sessions_by_day_ts/*/valid.parquet")[:-1]

- **Define Trainer**

In [15]:
# Instantiate the T4Rec Trainer, which manages training and evaluation
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
    train_dataset_or_path=train_data_paths,
    eval_dataset_or_path=eval_data_paths,
)

- **Train the model**  

In [16]:
trainer.reset_lr_scheduler()
trainer.train()

***** Running training *****
  Num examples = 8704
  Num Epochs = 3
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 16384
  Gradient Accumulation steps = 32
  Total optimization steps = 3


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3, training_loss=5.828135808308919, metrics={'train_runtime': 5.6771, 'train_samples_per_second': 4.228, 'train_steps_per_second': 0.528, 'total_flos': 0.0, 'train_loss': 5.828135808308919, 'epoch': 3.0})

- **Compute evaluation metrics**

In [17]:
eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths, metric_key_prefix='eval')
for key in sorted(eval_metrics.keys()):
    print("  %s = %s" % (key, str(eval_metrics[key])))

  epoch = 3.0
  eval_avgprecisionat_10 = 8.34434395073913e-05
  eval_avgprecisionat_20 = 0.00010814513370860368
  eval_loss = 10.938192367553711
  eval_ndcgat_10 = 9.643646626500413e-05
  eval_ndcgat_20 = 0.00018513661052566022
  eval_recallat_10 = 0.0001390723919030279
  eval_recallat_20 = 0.0004867533571086824
  eval_runtime = 2.1582
  eval_samples_per_second = 385.508
  eval_steps_per_second = 12.047


- **Compute Train metrics**

In [18]:
train_metrics = trainer.evaluate(eval_dataset=train_data_paths, metric_key_prefix='train')
for key in sorted(train_metrics.keys()):
    print("  %s = %s" % (key, str(train_metrics[key])))

  epoch = 3.0
  train_avgprecisionat_10 = 0.0001914666499942541
  train_avgprecisionat_20 = 0.0001975129562197253
  train_loss = 10.93864631652832
  train_ndcgat_10 = 0.00020869127183686942
  train_ndcgat_20 = 0.00023136497475206852
  train_recallat_10 = 0.0002720841730479151
  train_recallat_20 = 0.00036277889739722013
  train_runtime = 1.6463
  train_samples_per_second = 388.755
  train_steps_per_second = 12.149


* **Save the model**

In [19]:
trainer._save_model_and_checkpoint()

Saving model checkpoint to ./checkpoint-3
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


* **Reload model:**

In [20]:
trainer.load_model_trainer_states_from_checkpoint('./checkpoint-3')

- **Re-compute eval metrics of train data**

In [21]:
train_metrics = trainer.evaluate(eval_dataset=train_data_paths, metric_key_prefix='train')
for key in sorted(train_metrics.keys()):
    print("  %s = %s" % (key, str(train_metrics[key])))

  epoch = 3.0
  train_avgprecisionat_10 = 0.0001914666499942541
  train_avgprecisionat_20 = 0.0001975129562197253
  train_loss = 10.93864631652832
  train_ndcgat_10 = 0.00020869127183686942
  train_ndcgat_20 = 0.00023136497475206852
  train_recallat_10 = 0.0002720841730479151
  train_recallat_20 = 0.00036277889739722013
  train_runtime = 2.2422
  train_samples_per_second = 285.43
  train_steps_per_second = 8.92


* **Resume Training** 

In [22]:
# reset lr scheduler to train on new day data
trainer.reset_lr_scheduler()
# set new data from last day
trainer.train_dataset = train_data_paths[-1]
trainer.train(resume_from_checkpoint='./checkpoint-3')

Loading model from ./checkpoint-3).
***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 16384
  Gradient Accumulation steps = 32
  Total optimization steps = 3
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 3
  Continuing training from global step 3
  Will skip the first 3 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…



Training completed. Do not forget to share your model on huggingface.co/models =)




Step,Training Loss


TrainOutput(global_step=3, training_loss=0.0, metrics={'train_runtime': 0.2376, 'train_samples_per_second': 555.646, 'train_steps_per_second': 12.628, 'total_flos': 0.0, 'train_loss': 0.0, 'epoch': 3.0})

- **Evaluate on last day**

In [23]:
# set new data from last day
eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths[-1], metric_key_prefix='eval')
for key in sorted(eval_metrics.keys()):
    print("  %s = %s" % (key, str(eval_metrics[key])))

  epoch = 3.0
  eval_avgprecisionat_10 = 0.0
  eval_avgprecisionat_20 = 4.933886157232337e-05
  eval_loss = 10.936347961425781
  eval_ndcgat_10 = 0.0
  eval_ndcgat_20 = 0.00020906655117869377
  eval_recallat_10 = 0.0
  eval_recallat_20 = 0.0008880994864739478
  eval_runtime = 0.2439
  eval_samples_per_second = 262.424
  eval_steps_per_second = 8.201
