In [1]:
import os
import torch 
import numpy
import pandas as pd 
import cudf
import cupy
import nvtabular as nvt

### Create random input data similar to pre-processed Yoochoose dataset structure 

In [2]:
NUM_ROWS = 10000
session_length = 20
batch_size = 100

# Fixed seed so the synthetic dataset is identical on every "Restart & Run All".
SEED = 42
numpy.random.seed(SEED)

# Random columns laid out like the pre-processed Yoochoose dataset:
# integer ids / labels plus two continuous features drawn uniformly from [0, 1).
inputs = {
    'session_id': numpy.random.randint(70000, 80000, NUM_ROWS),
    'day': numpy.random.randint(1, 10, NUM_ROWS),
    'item_id': numpy.random.randint(1, 51996, NUM_ROWS),
    'category': numpy.random.randint(0, 332, NUM_ROWS),
    'timestamp/age_days': numpy.random.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin' : numpy.random.uniform(0, 1, NUM_ROWS),
    'purchase': numpy.random.randint(0, 2, NUM_ROWS)
    }
# One row per interaction; columns follow the dict's key order.
random_data = cudf.DataFrame(inputs)
random_data

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin,purchase
0,77758,5,27127,87,0.610196,0.372406,0
1,75614,7,18540,5,0.832218,0.552607,0
2,78956,6,37940,74,0.771559,0.504475,1
3,70217,8,30688,73,0.913372,0.053951,0
4,72516,6,48452,154,0.032286,0.261013,0
...,...,...,...,...,...,...,...
9995,73903,3,24334,163,0.682913,0.249499,0
9996,74271,5,11657,160,0.461387,0.412575,0
9997,74480,2,49481,94,0.716491,0.203513,1
9998,79750,9,22656,147,0.271502,0.581491,0


### NVTabular workflow

- **TODO**: Change the workflow to use the tagging API once it is finalized.

In [3]:
# Define Groupby Workflow: aggregate the interaction rows into one row per session.
# The input columns are read from `random_data` itself (they are the keys of the dict it
# was built from), so this cell does not depend on the `inputs` name, which a later cell
# rebinds to the model's input module.
groupby_features = list(random_data.columns) >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs={
        "item_id": ["list"],
        "category": ["list"],
        "day": ["first"],
        "purchase": ["first"],
        "timestamp/age_days": ["list"],
        'timestamp/weekday/sin': ["list"],
        },
    name_sep="-")

# Trim sessions to the first `session_length` items.
# Aggregated columns are named "<column>-<agg>" (see name_sep above), so every column
# without the '-list' suffix is excluded from trimming.
groupby_features_nonlist = [x for x in groupby_features.selector if '-list' not in x]
groupby_features_trim = (
    (groupby_features - groupby_features_nonlist)
    >> nvt.ops.ListSlice(0, session_length)
    >> nvt.ops.Rename(postfix='_trim')
)

# Output holds both the untrimmed aggregates and their '_trim' counterparts.
workflow = nvt.Workflow(groupby_features + groupby_features_trim)
dataset = nvt.Dataset(random_data, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()



In [4]:
# Preview the first three rows of the transformed, session-level frame.
sessions_gdf.head(3)

Unnamed: 0,timestamp/weekday/sin-list,session_id,timestamp/age_days-list,day-first,item_id-list,purchase-first,category-list,timestamp/weekday/sin-list_trim,timestamp/age_days-list_trim,item_id-list_trim,category-list_trim
0,[0.3533943546820013],70000,[0.03640205328638679],7,[14574],1,[135],[0.3533943546820013],[0.03640205328638679],[14574],[135]
1,"[0.04203522126107073, 0.3275783399892225]",70002,"[0.005719239287048983, 0.9500323724164531]",3,"[41003, 30571]",0,"[234, 119]","[0.04203522126107073, 0.3275783399892225]","[0.005719239287048983, 0.9500323724164531]","[41003, 30571]","[234, 119]"
2,[0.6206760679502472],70003,[0.8331312174661043],5,[48445],1,[166],[0.6206760679502472],[0.8331312174661043],[48445],[166]


- We can save the workflow.

In [5]:
# Persist the fitted workflow under the path 'workflow_inference_test'.
workflow.save('workflow_inference_test')

### Export pre-processed data by day 

In [6]:
# Wrap the session-level frame in an NVTabular Dataset and write it to disk as
# hive-partitioned parquet, partitioned on the column named by PARTITION_COL.
PARTITION_COL = 'day-first'
nvt_output_path_tmp = './output_nvt_tmp/'
sessions_dataset = nvt.Dataset(sessions_gdf)
sessions_dataset.to_parquet(nvt_output_path_tmp, partition_on=[PARTITION_COL])

In [7]:
OUTPUT_FOLDER = "./preproc_sessions_by_day_ts/"
# Pure-Python replacement for the former `!mkdir -p` shell call.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Fixed seed so the valid/test sampling below is reproducible across runs.
SPLIT_SEED = 42
cupy.random.seed(SPLIT_SEED)

# Partition folders are named "<column>=<value>"; derive the prefix from PARTITION_COL
# instead of repeating the column name as a literal.
partition_prefix = PARTITION_COL + "="
days_folders = [f for f in sorted(os.listdir(nvt_output_path_tmp)) if f.startswith(partition_prefix)]
for day_folder in days_folders:
    df = cudf.read_parquet(os.path.join(nvt_output_path_tmp, day_folder))
    # Output sub-folder is named after the partition value only (e.g. "5").
    out_folder = os.path.join(OUTPUT_FOLDER, day_folder[len(partition_prefix):])
    os.makedirs(out_folder, exist_ok=True)
    # The train file keeps every session of the day.
    df.to_parquet(os.path.join(out_folder, 'train.parquet'))

    # One uniform draw in [0, 1) per session decides valid/test membership.
    random_values = cupy.random.rand(len(df))

    # Roughly 10% of the sessions go to the valid set and a disjoint ~10% to the test set.
    # Those sessions are also in the train set, but as evaluation happens only for the
    # subsequent day of training, that is not an issue, and we can keep the train set larger.
    valid_set = df[random_values <= 0.10]
    valid_set.to_parquet(os.path.join(out_folder, 'valid.parquet'))

    test_set = df[random_values >= 0.90]
    test_set.to_parquet(os.path.join(out_folder, 'test.parquet'))

# Transformers4rec model  

In [9]:
import torch 
import transformers4rec.torch as torch4rec
from transformers4rec.torch import SequentialTabularFeatures, MLPBlock, SequentialBlock, Head, TransformerBlock

from transformers4rec.utils.schema import DatasetSchema

from transformers4rec.torch.head import NextItemPredictionTask

from transformers4rec.config import transformer
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt

- Manually set the schema 

In [10]:
# Define schema object to pass it to the SequentialTabularFeatures.
# It is loaded from the file "schema.pb".
# NOTE(review): no visible cell creates "schema.pb" - confirm it ships alongside the notebook.
schema = DatasetSchema.from_schema("schema.pb")

### Define the sequential input module

Below we define our `input` block using the [`SequentialTabularFeatures` class](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/features/sequential.py). The `from_schema` method directly parses the schema and accepts categorical and continuous sequential inputs. It supports data augmentation, data aggregation (`sequential-concat` and `elementwise-sum` aggregations), the projection of the interaction embeddings, and the masking tasks.

`max_sequence_length` defines the maximum sequence length of our sequential input, and if `continuous_projection` argument is set,  all numerical features are concatenated and projected by a number of MLP layers.

In [11]:
# Build the sequential input module from the schema, using causal masking.
# NOTE(review): this rebinds `inputs`, which earlier held the dict of random columns;
# cells that read the dict will fail if re-run after this one.
inputs = SequentialTabularFeatures.from_schema(
        schema,
        max_sequence_length=20,
        continuous_projection=64,
        d_output=100,
        masking="causal",
    )

# Point the masking component at the GPU.
inputs.masking.device = 'cuda'

### End-to-end session-based Transformer-based model for item prediction:

- LM task + HF Transformer architecture + Next item-prediction task. 
- We build a [T4RecConfig](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/config/transformer.py#L8) class to update the config class of the transformer architecture with the specified arguments, then load the related model. Here we use it to instantiate an XLNET model according to the  arguments (d_model, n_head, etc.), defining the model architecture.
- [TransformerBlock](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/block/transformer.py#L37) class is created to support HF Transformers for session-based and sequential-based recommendation models.
- [NextItemPredictionTask](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/head.py#L212) is the class to support next item prediction task.

In [12]:
# case-1: XLNet configuration built with explicit architecture hyper-parameters
transformer_config = transformer.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=20,
)

# Body: input module -> MLP block -> transformer block that reuses the input module's masking
mlp_block = torch4rec.MLPBlock([64])
transformer_block = torch4rec.TransformerBlock(
    transformer=transformer_config,
    masking=inputs.masking,
)
body = torch4rec.SequentialBlock(inputs, mlp_block, transformer_block)

# Head: next-item prediction task on top of the body
next_item_task = torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True)
head = torch4rec.Head(body, next_item_task, inputs=inputs)

model = torch4rec.Model(head)

# Train the model 

- Load arguments

In [13]:
from transformers4rec.recsys_args import DataArguments, ModelArguments, TrainingArguments

# The settings are written as class-level attributes because the classes themselves
# (not instances) are what a later cell hands to the trainer.
training_overrides = {
    "local_rank": -1,
    "world_size": 1,
    "dataloader_drop_last": True,
    "device": "cuda",
    "report_to": [],
    "debug": ["r"],
    "n_gpu": 1,
    "gradient_accumulation_steps": 32,
    "train_batch_size": 512,
    "per_device_train_batch_size": 512,
    "per_device_eval_batch_size": 512,
    "output_dir": "",
}
for option_name, option_value in training_overrides.items():
    setattr(TrainingArguments, option_name, option_value)

data_overrides = {
    "data_path": "./preproc_sessions_by_day_ts/",
    "data_loader_engine": "nvtabular",
}
for option_name, option_value in data_overrides.items():
    setattr(DataArguments, option_name, option_value)

- Load data using old NVTabular dataloader  

In [14]:
import glob
# NVTabular dependencies
from nvtabular import Dataset as NVTDataset
from nvtabular.loader.torch import DLDataLoader
from nvtabular.loader.torch import TorchAsyncItr as NVTDataLoader
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

SESSION_LENGTH_MAX = 20

# Categorical and continuous list columns fed to the model (the '_trim' outputs of the workflow).
x_cat_names, x_cont_names = ['item_id-list_trim', 'category-list_trim'], ['timestamp/weekday/sin-list_trim','timestamp/age_days-list_trim']

# Maximum length per list feature; handed to the data loader as `sparse_max`.
sparse_features_max = {
    fname: SESSION_LENGTH_MAX
    for fname in x_cat_names + x_cont_names
}

# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# files are always read in the same order.
train_data_paths = sorted(glob.glob("./preproc_sessions_by_day_ts/*/train.parquet"))
train_dataset = NVTDataset(
    train_data_paths,
    engine="parquet",
)

def dataloader_collate_dict(inputs):
    """Return only the features dict of a batch.

    The features dict is the first element of the first element of
    ``inputs``; everything else in ``inputs`` is discarded.
    """
    first_entry = inputs[0]
    features = first_entry[0]
    return features

class DLDataLoaderWrapper(DLDataLoader):
    """DLDataLoader that keeps ``batch_size`` away from the parent constructor."""

    def __init__(self, *args, **kwargs) -> None:
        # Setting the batch size directly to DLDataLoader makes it 3x slower.
        # So it is stored on an alternative attribute, to be used within
        # RecSysTrainer during evaluation.
        _absent = object()
        requested_batch_size = kwargs.pop("batch_size", _absent)
        if requested_batch_size is not _absent:
            self._batch_size = requested_batch_size
        super().__init__(*args, **kwargs)


In [15]:
# All list columns (categorical + continuous) are declared as sparse features.
sequence_feature_names = x_cat_names + x_cont_names
train_batch_size = TrainingArguments.train_batch_size

# NVTabular loader over the training parquet files.
loader = NVTDataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=False,
    cats=x_cat_names,
    conts=x_cont_names,
    device=0,
    labels=[],
    sparse_names=sequence_feature_names,
    sparse_max=sparse_features_max,
    sparse_as_dense=True,
    drop_last=False,
)

# Wrap it so each batch is reduced to its features dict by the collate function.
dl_loader = DLDataLoaderWrapper(
    loader,
    collate_fn=dataloader_collate_dict,
    batch_size=train_batch_size,
)

# Fetch a single batch for inspection.
batch_iterator = iter(dl_loader)
out = next(batch_iterator)

- Inspect one batch produced by the data loader (this is what the model receives as input)

In [None]:
# Shape of the item-id tensor in the fetched batch.
out['item_id-list_trim'].shape

In [17]:
# Display the whole fetched batch: a dict mapping each feature name to its tensor.
out

{'item_id-list_trim': tensor([[24401,     0,     0,  ...,     0,     0,     0],
         [ 1028,     0,     0,  ...,     0,     0,     0],
         [ 3865,     0,     0,  ...,     0,     0,     0],
         ...,
         [30488, 17763,     0,  ...,     0,     0,     0],
         [33885, 37891,     0,  ...,     0,     0,     0],
         [22565,  3352,     0,  ...,     0,     0,     0]], device='cuda:0'),
 'category-list_trim': tensor([[138,   0,   0,  ...,   0,   0,   0],
         [ 91,   0,   0,  ...,   0,   0,   0],
         [100,   0,   0,  ...,   0,   0,   0],
         ...,
         [295, 273,   0,  ...,   0,   0,   0],
         [135, 223,   0,  ...,   0,   0,   0],
         [107, 114,   0,  ...,   0,   0,   0]], device='cuda:0'),
 'timestamp/weekday/sin-list_trim': tensor([[0.0753, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.8018, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.5628, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0

In [18]:
import glob
# NVTabular dependencies
from nvtabular import Dataset as NVTDataset
from nvtabular.loader.torch import DLDataLoader
from nvtabular.loader.torch import TorchAsyncItr as NVTDataLoader
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader


# NOTE(review): this cell repeats the loader setup from the earlier "Load data" cell;
# consider deleting one of the two copies.
SESSION_LENGTH_MAX = 20

# Categorical and continuous list columns fed to the model (the '_trim' outputs of the workflow).
x_cat_names, x_cont_names = ['item_id-list_trim', 'category-list_trim'], ['timestamp/weekday/sin-list_trim','timestamp/age_days-list_trim']

# Maximum length per list feature; handed to the data loader as `sparse_max`.
sparse_features_max = {
    fname: SESSION_LENGTH_MAX
    for fname in x_cat_names + x_cont_names
}

# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# files are always read in the same order.
train_data_paths = sorted(glob.glob("./preproc_sessions_by_day_ts/*/train.parquet"))
train_dataset = NVTDataset(
    train_data_paths,
    engine="parquet",
)

def dataloader_collate_dict(inputs):
    """Pick the features dict out of a batch, i.e. ``inputs[0][0]``."""
    return inputs[0][0]

class DLDataLoaderWrapper(DLDataLoader):
    """DLDataLoader variant that records ``batch_size`` on ``_batch_size``."""

    def __init__(self, *args, **kwargs) -> None:
        # `batch_size` is removed from the keyword arguments before the parent
        # constructor runs, and kept on a private attribute instead.
        batch_size_given = "batch_size" in kwargs
        if batch_size_given:
            stored_batch_size = kwargs.pop("batch_size")
            self._batch_size = stored_batch_size
        super().__init__(*args, **kwargs)

We define a wrapper that prepares the model’s inputs in the format required by the [HuggingFace Trainer](https://github.com/huggingface/transformers).

In [19]:
class HFWrapper(torch.nn.Module):
    """Adapter that lets a model taking one dict of inputs be called with keyword arguments.

    Args:
        model: the module to wrap; it is called with a single dict argument.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        """Forward the keyword arguments, packed into one dict, to the wrapped model.

        Positional arguments are accepted but ignored.

        Returns:
            Whatever the wrapped model returns for the dict of keyword arguments.
        """
        inputs = kwargs
        # Call the wrapped instance, not whatever global happens to be named `model`:
        # the global lookup breaks when the wrapper is used outside this notebook's
        # namespace and silently uses the wrong network if `model` is rebound.
        return self.model(inputs)

In [20]:
# Wrap the Transformers4Rec model so it can be handed to the trainer.
model_wp = HFWrapper(model)

In [30]:
# Display the module tree of the wrapped model.
model_wp

HFWrapper(
  (model): Model(
    (heads): ModuleList(
      (0): Head(
        (body): SequentialBlock(
          (0): SequentialTabularFeatures(
            (to_merge): ModuleDict(
              (continuous_module): SequentialBlock(
                (0): ContinuousFeatures(
                  (filter_features): FilterFeatures()
                  (_aggregation): SequentialConcatFeatures()
                )
                (1): SequentialBlock(
                  (0): DenseBlock(
                    (0): Linear(in_features=1, out_features=64, bias=True)
                    (1): ReLU(inplace=True)
                  )
                )
                (2): AsTabular()
              )
              (categorical_module): SequentialEmbeddingFeatures(
                (filter_features): FilterFeatures()
                (embedding_tables): ModuleDict(
                  (category-list_trim): Embedding(332, 64, padding_idx=0)
                  (item_id-list_trim): Embedding(51996, 64, padding_idx=0)

## Training 

- Run a basic fit and evaluation to check that model training works:

In [22]:
# Instantiate the RecSysTrainer, which manages training and evaluation.
# The argument classes configured earlier are passed as-is (the classes themselves,
# not instances), together with the wrapped model.
from transformers4rec.recsys_trainer import RecSysTrainer, DatasetType

trainer = RecSysTrainer(
    model=model_wp,
    args=TrainingArguments,
    model_args=ModelArguments,
    data_args=DataArguments,
)

- Fit the model : 

In [23]:
# Attach the training data loader, reset the learning-rate scheduler, then train.
# The last expression's return value (the training summary) is shown as the cell output.
trainer.set_train_dataloader(dl_loader)
trainer.reset_lr_scheduler()
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=3, training_loss=16.449358622233074, metrics={'train_runtime': 1.4791, 'train_samples_per_second': 2.028, 'total_flos': 0.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 14147584, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 7831552, 'train_mem_gpu_alloc_delta': 41794048, 'train_mem_cpu_peaked_delta': 192512, 'train_mem_gpu_peaked_delta': 441261568})

- Evaluate the model: here we evaluate it on the training set.

In [28]:
# The training data loader doubles as the evaluation loader, so the metrics
# are reported under the train prefix.
trainer.set_eval_dataloader(dl_loader)
train_metrics = trainer.evaluate(metric_key_prefix=DatasetType.train.value)

# Print one "name = value" line per metric, in alphabetical order of the names.
for metric_name in sorted(train_metrics):
    print("  {!s} = {!s}".format(metric_name, train_metrics[metric_name]))

Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


  epoch = 3.0
  eval_mem_cpu_alloc_delta = 4096
  eval_mem_cpu_peaked_delta = 184320
  eval_mem_gpu_alloc_delta = 0
  eval_mem_gpu_peaked_delta = 289772544
  train_avg_precision@10 = 0.0
  train_avg_precision@1000 = 8.79675315865405e-05
  train_avg_precision@20 = 2.249212971387001e-05
  train_loss = 34.15816409771259
  train_ndcg@10 = 0.0
  train_ndcg@1000 = 0.002303272736473725
  train_ndcg@20 = 7.293877514222494e-05
  train_precision@10 = 0.0
  train_precision@1000 = 1.9401000827201642e-05
  train_precision@20 = 1.349527715669515e-05
  train_recall@10 = 0.0
  train_recall@1000 = 0.01940100109921052
  train_recall@20 = 0.0002699055386563906
  train_runtime = 3.9485
  train_samples_per_second = 2593.362
