In [1]:
import os
import torch 
import numpy
import pandas as pd 
import cudf
import cupy
import nvtabular as nvt

### Create random input similar to pre-processed Yoochoose dataset structure 

In [2]:
# Sizes for the synthetic interaction log.
NUM_ROWS = 10000
session_length = 20
batch_size = 100

# Seed NumPy's global generator so that Restart & Run All regenerates the same synthetic rows.
RANDOM_SEED = 42
numpy.random.seed(RANDOM_SEED)

# One random column per feature. numpy.random.randint excludes the upper bound,
# e.g. `purchase` only takes the values 0 and 1.
inputs = {
    'session_id': numpy.random.randint(70000, 80000, NUM_ROWS),
    'day': numpy.random.randint(1, 10, NUM_ROWS),
    'item_id': numpy.random.randint(1, 51996, NUM_ROWS),
    'category': numpy.random.randint(0, 332, NUM_ROWS),
    'timestamp/age_days': numpy.random.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin' : numpy.random.uniform(0, 1, NUM_ROWS),
    'purchase': numpy.random.randint(0, 2, NUM_ROWS)
    }

# Build the cuDF frame from the random columns and display it as the cell output.
random_data = cudf.DataFrame(inputs)
random_data

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin,purchase
0,76430,5,25517,260,0.676600,0.436413,1
1,75338,5,5870,295,0.601036,0.470999,1
2,77142,8,36169,311,0.560507,0.534578,1
3,76523,1,43690,266,0.103528,0.153887,0
4,76827,1,50699,313,0.371085,0.739300,1
...,...,...,...,...,...,...,...
9995,72601,2,39736,72,0.010964,0.740469,0
9996,76877,9,45391,103,0.033774,0.052996,0
9997,72260,8,23967,308,0.869237,0.094876,1
9998,75266,6,48404,240,0.463385,0.360476,0


### NVTabular workflow

- **TODO:** Change the workflow to use the tagging API once it is finalized.

In [3]:
# Define the groupby workflow: rows are grouped by `session_id`; item, category and the two
# timestamp features are aggregated with "list", while `day` and `purchase` use "first".
# Aggregated columns are named "<column>-<agg>" because of name_sep="-".
groupby_features = list(inputs.keys()) >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs={
        "item_id": ["list"],
        "category": ["list"],
        "day": ["first"],
        "purchase": ["first"],
        "timestamp/age_days": ["list"],
        'timestamp/weekday/sin': ["list"],
        },
    name_sep="-")

# Names taken from the node's selector that do not contain "-list".
groupby_features_nonlist = [x for x in groupby_features.selector if '-list' not in x]

# Trim the remaining (list) columns to their first `session_length` items and
# expose the trimmed copies under a "_trim" suffix.
groupby_features_trim = (
    (groupby_features - groupby_features_nonlist)
    >> nvt.ops.ListSlice(0, session_length)
    >> nvt.ops.Rename(postfix='_trim')
)

# Fit the workflow on the random data and materialise the transformed sessions.
workflow = nvt.Workflow(groupby_features + groupby_features_trim)
dataset = nvt.Dataset(random_data, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()



In [4]:
sessions_gdf.head(3)

Unnamed: 0,day-first,timestamp/age_days-list,timestamp/weekday/sin-list,purchase-first,category-list,item_id-list,session_id,timestamp/age_days-list_trim,timestamp/weekday/sin-list_trim,category-list_trim,item_id-list_trim
0,8,[0.5554684822734157],[0.1325290970584656],0,[331],[33190],70000,[0.5554684822734157],[0.1325290970584656],[331],[33190]
1,6,"[0.19969787990764942, 0.5538677006954508, 0.70...","[0.40387194895713885, 0.5651163370425829, 0.08...",1,"[157, 57, 298]","[14923, 42290, 43953]",70002,"[0.19969787990764942, 0.5538677006954508, 0.70...","[0.40387194895713885, 0.5651163370425829, 0.08...","[157, 57, 298]","[14923, 42290, 43953]"
2,6,[0.46310031101115046],[0.13407156620578908],1,[76],[47985],70004,[0.46310031101115046],[0.13407156620578908],[76],[47985]


In [5]:
#workflow.save('workflow_inference_test')

### Export pre-processed data by day 

In [6]:
# Wrap the sessions frame in an NVTabular Dataset and write it out to disk as
# hive-partitioned parquet, partitioned on PARTITION_COL.
PARTITION_COL = 'day-first'
nvt_output_path_tmp = './output_nvt_tmp/'
sessions_dataset = nvt.Dataset(sessions_gdf)
sessions_dataset.to_parquet(nvt_output_path_tmp, partition_on=[PARTITION_COL])

In [7]:
OUTPUT_FOLDER = "./preproc_sessions_by_day_ts/"
# Pure-Python equivalent of `mkdir -p`, so the cell does not depend on a shell.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# The partition folders written by the previous cell are named "<PARTITION_COL>=<value>".
partition_prefix = PARTITION_COL + '='
days_folders = [f for f in sorted(os.listdir(nvt_output_path_tmp)) if f.startswith(partition_prefix)]
for day_folder in days_folders:
    df = cudf.read_parquet(os.path.join(nvt_output_path_tmp, day_folder))
    print(len(df))

    # Output folder is named after the partition value only (prefix stripped).
    out_folder = os.path.join(OUTPUT_FOLDER, day_folder[len(partition_prefix):])
    os.makedirs(out_folder, exist_ok=True)
    df.to_parquet(os.path.join(out_folder, 'train.parquet'))

    # One uniform draw per row decides membership in the valid / test subsets.
    random_values = cupy.random.rand(len(df))

    # Extracts 10% for the valid set and 10% for the test set. Those sessions are also in the
    # train set, but as evaluation happens only for the subsequent day of training, that is
    # not an issue, and we can keep the train set larger.
    valid_set = df[random_values <= 0.10]
    valid_set.to_parquet(os.path.join(out_folder, 'valid.parquet'))

    test_set = df[random_values >= 0.90]
    test_set.to_parquet(os.path.join(out_folder, 'test.parquet'))

694
670
687
713
721
755
685
687
708


# Transformers4rec model  

- Manually set the schema 

In [8]:
# File name of the dataset schema; a later cell loads the file with this same name ("schema.pb").
schema_file = 'schema.pb'

### Define the sequential input module

In [9]:
import torch 
import transformers4rec.torch as torch4rec
from transformers4rec.torch import SequentialTabularFeatures, MLPBlock, SequentialBlock, Head, TransformerBlock

from transformers4rec.utils.schema import DatasetSchema

from transformers4rec.torch.head import NextItemPredictionTask

from transformers4rec.config import transformer
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt

In [10]:
# Disabled draft: the triple-quoted string below is never executed; the cell only echoes it back.
# The same statements appear as live code in a later cell of this notebook.
'''

import transformers4rec.torch as torch4rec

inputs.masking.device = 'cuda'
transformer_config = transformer.XLNetConfig.build(
    d_model=64, n_head=4, n_layer=2, total_seq_length=20
)
body = torch4rec.SequentialBlock(
    inputs, torch4rec.MLPBlock([64]), torch4rec.TransformerBlock(transformer=transformer_config, masking=inputs.masking)
)

head = torch4rec.Head(
    body,
    torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True),
    inputs=inputs,
)
model = torch4rec.Model(head)
'''

"\n\nimport transformers4rec.torch as torch4rec\n\ninputs.masking.device = 'cuda'\ntransformer_config = transformer.XLNetConfig.build(\n    d_model=64, n_head=4, n_layer=2, total_seq_length=20\n)\nbody = torch4rec.SequentialBlock(\n    inputs, torch4rec.MLPBlock([64]), torch4rec.TransformerBlock(transformer=transformer_config, masking=inputs.masking)\n)\n\nhead = torch4rec.Head(\n    body,\n    torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True),\n    inputs=inputs,\n)\nmodel = torch4rec.Model(head)\n"

In [11]:

# Load the dataset schema from the file named by `schema_file` (instead of repeating the literal).
# TODO: remove this step once the schema file can be read and passed to
# SequentialTabularFeatures directly.
schema = DatasetSchema.from_schema(schema_file)

In [12]:
# Build the tabular module that converts the input features and aggregates them into one
# single interaction tensor. Currently supported aggregations: sequential_concat and
# element-wise-sum.
input_module_options = dict(
    max_sequence_length=20,
    continuous_projection=64,
    d_output=100,
    masking="causal",
)
inputs = SequentialTabularFeatures.from_schema(schema, **input_module_options)

# Run the masking module on the GPU.
inputs.masking.device = 'cuda'

### End-to-end session-based Transformer model for item prediction

- LM task + HF Transformer architecture + item-prediction task

1. Define the Transformer block: LM task + HF Transformer architecture

In [13]:
# Case 1: build an XLNetConfig, overriding only the sizes listed here.
xlnet_sizes = dict(d_model=64, n_head=4, n_layer=2, total_seq_length=20)
transformer_config = transformer.XLNetConfig.build(**xlnet_sizes)

# Body: input module -> MLP block -> Transformer block (sharing the input module's masking).
projection_block = torch4rec.MLPBlock([64])
transformer_block = torch4rec.TransformerBlock(
    transformer=transformer_config,
    masking=inputs.masking,
)
body = torch4rec.SequentialBlock(inputs, projection_block, transformer_block)

# Head: next-item prediction task on top of the body; the model wraps the head.
prediction_task = torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True)
head = torch4rec.Head(body, prediction_task, inputs=inputs)
model = torch4rec.Model(head)

#model

## [To review]:  Define Item prediction task

- The sequential item prediction task includes two formats of output:        
     * **hf_format=True:** it returns the dictionary expected by the HF `recsys_trainer`.
     
     {
        "loss": loss,
        "labels": labels_all,
        "predictions": x,
        "pred_metadata": {},
        "model_outputs": []
    }
 
 *N.B : pred_metadata and model_outputs still need to be implemented*
     
     * **hf_format=False:** it returns the tensor of predictions.


- The option of **weight tying** is also included 

# Train the model 

### Load data using old NVTabular dataloader  

In [14]:
from transformers4rec.recsys_args import DataArguments, ModelArguments, TrainingArguments

# NOTE: these settings are assigned directly on the imported TrainingArguments / DataArguments
# objects (no instance is created in this notebook); the same objects are later handed to
# RecSysTrainer.

# Single-process, single-GPU setup.
TrainingArguments.local_rank = -1
TrainingArguments.world_size = 1
TrainingArguments.n_gpu = 1
TrainingArguments.device = "cuda"

# Batching.
TrainingArguments.dataloader_drop_last = True
TrainingArguments.gradient_accumulation_steps = 32
TrainingArguments.train_batch_size = 512
TrainingArguments.per_device_train_batch_size = 512
TrainingArguments.per_device_eval_batch_size = 512

# Reporting / output.
TrainingArguments.report_to = []
TrainingArguments.debug = ["r"]
TrainingArguments.output_dir = ""

# Read the per-day parquet files from the folder the export cell wrote to.
DataArguments.data_path = OUTPUT_FOLDER
DataArguments.data_loader_engine = "nvtabular"

In [15]:
import glob
# NVTabular dependencies
from nvtabular import Dataset as NVTDataset
from nvtabular.loader.torch import DLDataLoader
from nvtabular.loader.torch import TorchAsyncItr as NVTDataLoader
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

# BATCH_SIZE = 16
# Length every list feature is capped at (see `sparse_features_max` below).
SESSION_LENGTH_MAX = 20

# Trimmed list columns read by the dataloader, split into categorical and continuous features.
x_cat_names = ['item_id-list_trim', 'category-list_trim']
x_cont_names = ['timestamp/weekday/sin-list_trim', 'timestamp/age_days-list_trim']

# Maximum length per list feature; later passed to the dataloader as `sparse_max`.
sparse_features_max = {
    fname: SESSION_LENGTH_MAX
    for fname in x_cat_names + x_cont_names
}

# glob.glob returns matches in arbitrary order; sort them so the per-day train files are
# always listed in the same order. The root folder comes from DataArguments.data_path.
train_data_paths = sorted(
    glob.glob(os.path.join(DataArguments.data_path, "*", "train.parquet"))
)
train_dataset = NVTDataset(
    train_data_paths,
    engine="parquet",
)

def dataloader_collate_dict(inputs):
    """Collate function that keeps only the features dict, i.e. ``inputs[0][0]``."""
    first_entry = inputs[0]
    return first_entry[0]

class DLDataLoaderWrapper(DLDataLoader):
    """DLDataLoader subclass that keeps a ``batch_size`` keyword away from the parent constructor.

    Setting the batch size directly on DLDataLoader makes it 3x slower, so when the keyword
    is supplied it is stored on ``self._batch_size`` instead, to be used within RecSysTrainer
    during evaluation. When the keyword is absent, ``_batch_size`` is not set at all.
    """

    def __init__(self, *args, **kwargs) -> None:
        batch_size_given = "batch_size" in kwargs
        batch_size = kwargs.pop("batch_size", None)
        if batch_size_given:
            self._batch_size = batch_size
        # Remaining positional and keyword arguments go to DLDataLoader unchanged.
        super().__init__(*args, **kwargs)


In [17]:
# All list features (categorical first, then continuous) are declared as sparse columns.
list_feature_names = x_cat_names + x_cont_names

# NVTabular loader over the training parquet files, on GPU 0, without shuffling.
loader = NVTDataLoader(
    dataset=train_dataset,
    batch_size=TrainingArguments.train_batch_size,
    cats=x_cat_names,
    conts=x_cont_names,
    labels=[],
    sparse_names=list_feature_names,
    sparse_max=sparse_features_max,
    sparse_as_dense=True,
    shuffle=False,
    drop_last=False,
    device=0,
)

# Wrap the loader so each batch is collated down to its features dict.
dl_loader = DLDataLoaderWrapper(
    loader,
    batch_size=TrainingArguments.train_batch_size,
    collate_fn=dataloader_collate_dict,
)

# Pull a single batch for inspection in the following cells.
batch_iterator = iter(dl_loader)
out = next(batch_iterator)

In [18]:
out['item_id-list_trim'].shape

torch.Size([512, 20])

In [19]:
out

{'item_id-list_trim': tensor([[31296,     0,     0,  ...,     0,     0,     0],
         [29195,     0,     0,  ...,     0,     0,     0],
         [48996, 30165,     0,  ...,     0,     0,     0],
         ...,
         [19036, 47548,     0,  ...,     0,     0,     0],
         [ 6137, 40418,     0,  ...,     0,     0,     0],
         [19656, 16276,     0,  ...,     0,     0,     0]], device='cuda:0'),
 'category-list_trim': tensor([[214,   0,   0,  ...,   0,   0,   0],
         [230,   0,   0,  ...,   0,   0,   0],
         [ 58, 319,   0,  ...,   0,   0,   0],
         ...,
         [207, 120,   0,  ...,   0,   0,   0],
         [ 55,  73,   0,  ...,   0,   0,   0],
         [258,  39,   0,  ...,   0,   0,   0]], device='cuda:0'),
 'timestamp/weekday/sin-list_trim': tensor([[0.3474, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.5581, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.5541, 0.2348, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0

- Test the output of the model : 


In [20]:
import glob
# NVTabular dependencies
from nvtabular import Dataset as NVTDataset
from nvtabular.loader.torch import DLDataLoader
from nvtabular.loader.torch import TorchAsyncItr as NVTDataLoader
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader


# NOTE(review): this cell re-defines the same names as the earlier data-loading cell of this
# notebook; consider dropping one of the two copies.

# Length every list feature is capped at (see `sparse_features_max` below).
SESSION_LENGTH_MAX = 20

# Trimmed list columns read by the dataloader, split into categorical and continuous features.
x_cat_names = ['item_id-list_trim', 'category-list_trim']
x_cont_names = ['timestamp/weekday/sin-list_trim', 'timestamp/age_days-list_trim']

# Maximum length per list feature.
sparse_features_max = {
    fname: SESSION_LENGTH_MAX
    for fname in x_cat_names + x_cont_names
}

# glob.glob returns matches in arbitrary order; sort them so the per-day train files are
# always listed in the same order. The root folder comes from DataArguments.data_path.
train_data_paths = sorted(
    glob.glob(os.path.join(DataArguments.data_path, "*", "train.parquet"))
)
train_dataset = NVTDataset(
    train_data_paths,
    engine="parquet",
)

def dataloader_collate_dict(inputs):
    """Collate function that keeps only the features dict, i.e. ``inputs[0][0]``."""
    first_entry = inputs[0]
    return first_entry[0]

class DLDataLoaderWrapper(DLDataLoader):
    """DLDataLoader subclass that keeps a ``batch_size`` keyword away from the parent constructor.

    Setting the batch size directly on DLDataLoader makes it 3x slower, so when the keyword
    is supplied it is stored on ``self._batch_size`` instead, to be used within RecSysTrainer
    during evaluation. When the keyword is absent, ``_batch_size`` is not set at all.
    """

    def __init__(self, *args, **kwargs) -> None:
        batch_size_given = "batch_size" in kwargs
        batch_size = kwargs.pop("batch_size", None)
        if batch_size_given:
            self._batch_size = batch_size
        # Remaining positional and keyword arguments go to DLDataLoader unchanged.
        super().__init__(*args, **kwargs)


In [21]:
class HFWrapper(torch.nn.Module):
    """Adapter that lets a model taking one dict of inputs be called with keyword arguments.

    Args:
        model: the module to wrap; it is called with a single dict argument.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        """Call the wrapped model with the keyword arguments gathered into one dict.

        Positional arguments are accepted but ignored. Returns whatever the wrapped
        model returns.
        """
        inputs = kwargs
        # Use the wrapped instance, not whatever global happens to be named `model`.
        return self.model(inputs)

In [22]:
model_wp = HFWrapper(model)

In [26]:
#model_wp

## Training 

- Basic fit  and evaluate to test the model training:

- Load arguments:  
    N.B.: These classes will be updated to keep only what is required by recsys_trainer.

In [23]:
# Instantiate the RecSysTrainer, which manages training and evaluation
from transformers4rec.recsys_trainer import RecSysTrainer, DatasetType

# The wrapped model is passed together with the three argument objects exactly as imported
# and configured above (no instances of them are created in this notebook).
trainer = RecSysTrainer(
    model=model_wp,
    args=TrainingArguments,
    model_args=ModelArguments,
    data_args=DataArguments,
)

- Fit the model : 

In [24]:
# Hand the training dataloader built earlier to the trainer, reset its LR scheduler,
# then run training; the value returned by train() is displayed as the cell output.
trainer.set_train_dataloader(dl_loader)
trainer.reset_lr_scheduler()
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=3, training_loss=16.240479787190754, metrics={'train_runtime': 2.0837, 'train_samples_per_second': 1.44, 'train_steps_per_second': 1.44, 'total_flos': 0.0, 'train_loss': 16.240479787190754, 'epoch': 3.0})

- Evaluate the model : 
    - N.B.: model evaluation will be updated to take into account the metrics defined in the prediction_head.

In [None]:
# trainer.set_eval_dataloader(dl_loader)
# train_metrics = trainer.evaluate(metric_key_prefix=DatasetType.train.value)
# for key in sorted(train_metrics.keys()):
#     print("  %s = %s" % (key, str(train_metrics[key])))