In [1]:
import os
import torch 
import numpy
import pandas as pd 
import cudf
import cupy
import nvtabular as nvt

### Create random input similar to pre-processed Yoochoose dataset structure 

In [2]:
NUM_ROWS = 10000
session_length = 20
batch_size = 100

# Seed NumPy's global generator so the synthetic data (and every artefact derived
# from it further down) is the same after each "Restart Kernel -> Run All".
SEED = 42
numpy.random.seed(SEED)

# One random value per row for every column. The upper bound passed to
# numpy.random.randint is exclusive; numpy.random.uniform(0, 1) draws from [0, 1).
inputs = {
    'session_id': numpy.random.randint(70000, 80000, NUM_ROWS),
    'day': numpy.random.randint(1, 10, NUM_ROWS),
    'item_id': numpy.random.randint(1, 51996, NUM_ROWS),
    'category': numpy.random.randint(0, 332, NUM_ROWS),
    'timestamp/age_days': numpy.random.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin': numpy.random.uniform(0, 1, NUM_ROWS),
    'purchase': numpy.random.randint(0, 2, NUM_ROWS),
}

# Move the columns into a GPU dataframe and display it as the cell output.
random_data = cudf.DataFrame(inputs)
random_data

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin,purchase
0,75584,1,13959,228,0.188646,0.251946,0
1,70373,5,22671,112,0.958216,0.398291,0
2,75911,8,7628,100,0.620943,0.566578,0
3,77271,2,11962,181,0.835652,0.360247,1
4,77767,1,32419,130,0.583954,0.020455,0
...,...,...,...,...,...,...,...
9995,75128,3,46371,217,0.608423,0.059013,1
9996,71278,4,34712,135,0.811443,0.112605,1
9997,74048,7,1811,118,0.202478,0.415030,0
9998,75250,7,1093,161,0.884268,0.878341,1


### NVTabular workflow

- **TODO**: Change the workflow to use the tagging API once it is finalized.

In [3]:
# Define the Groupby workflow: rows are grouped on "session_id" and every other
# column is aggregated with the function(s) listed for it ("list" or "first").
# Output columns are named "<column><name_sep><aggregation>", e.g. "item_id-list".
groupby_features = list(inputs.keys()) >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs={
        "item_id": ["list"],
        "category": ["list"],
        "day": ["first"],
        "purchase": ["first"],
        "timestamp/age_days": ["list"],
        "timestamp/weekday/sin": ["list"],
    },
    name_sep="-",
)

# Trim sessions to their first `session_length` items (the constant defined with
# the random input above, instead of repeating the literal 20 here).
# Columns whose name does not contain "-list" are removed from the selection
# before slicing; the sliced columns get a "_trim" postfix.
groupby_features_nonlist = [x for x in groupby_features.selector if "-list" not in x]
groupby_features_trim = (
    (groupby_features - groupby_features_nonlist)
    >> nvt.ops.ListSlice(0, session_length)
    >> nvt.ops.Rename(postfix="_trim")
)

# Fit and apply the workflow on the GPU, then materialize the result as one dataframe.
workflow = nvt.Workflow(groupby_features + groupby_features_trim)
dataset = nvt.Dataset(random_data, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()



In [4]:
sessions_gdf.head(3)

Unnamed: 0,session_id,day-first,category-list,purchase-first,timestamp/weekday/sin-list,timestamp/age_days-list,item_id-list,category-list_trim,timestamp/weekday/sin-list_trim,timestamp/age_days-list_trim,item_id-list_trim
0,70000,1,[9],0,[0.16624953636898254],[0.7724655470980988],[18748],[9],[0.16624953636898254],[0.7724655470980988],[18748]
1,70001,8,[135],0,[0.015624112985417216],[0.032640360108273314],[3652],[135],[0.015624112985417216],[0.032640360108273314],[3652]
2,70002,2,"[30, 228]",0,"[0.8508864494735382, 0.32605929617141827]","[0.11044592975969747, 0.7642976814488108]","[7308, 766]","[30, 228]","[0.8508864494735382, 0.32605929617141827]","[0.11044592975969747, 0.7642976814488108]","[7308, 766]"


In [5]:
#workflow.save('workflow_inference_test')

### Export pre-processed data by day 

In [6]:
# Wrap the session-level dataframe in an NVTabular Dataset and write it to disk as
# hive-partitioned parquet, with one partition per value of PARTITION_COL.
nvt_output_path_tmp = './output_nvt_tmp/'
PARTITION_COL = 'day-first'
sessions_dataset = nvt.Dataset(sessions_gdf)
sessions_dataset.to_parquet(nvt_output_path_tmp, partition_on=[PARTITION_COL])

In [7]:
OUTPUT_FOLDER = "./preproc_sessions_by_day_ts/"
# Pure-Python replacement for the `!mkdir -p` shell escape: creates the folder if
# needed and does nothing when it already exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Partition folders are named "<PARTITION_COL>=<value>". The prefix is derived from
# PARTITION_COL so that changing the partition column does not silently break the
# folder renaming below.
partition_prefix = PARTITION_COL + "="
days_folders = [
    f for f in sorted(os.listdir(nvt_output_path_tmp)) if f.startswith(partition_prefix)
]
for day_folder in days_folders:
    df = cudf.read_parquet(os.path.join(nvt_output_path_tmp, day_folder))

    # The output folder is named after the partition value only (prefix stripped).
    out_folder = os.path.join(OUTPUT_FOLDER, day_folder[len(partition_prefix):])
    os.makedirs(out_folder, exist_ok=True)
    df.to_parquet(os.path.join(out_folder, 'train.parquet'))

    # One uniform draw in [0, 1) per session, used to pick the valid/test subsets.
    random_values = cupy.random.rand(len(df))

    # Extracts 10% for valid and test set. Those sessions are also in the train set, but as evaluation
    # happens only for the subsequent day of training, that is not an issue, and we can keep the train set larger.
    valid_set = df[random_values <= 0.10]
    valid_set.to_parquet(os.path.join(out_folder, 'valid.parquet'))

    test_set = df[random_values >= 0.90]
    test_set.to_parquet(os.path.join(out_folder, 'test.parquet'))

# Transformers4rec model  

- Manually set the schema 

In [8]:
import torch 
import transformers4rec.torch as torch4rec
from transformers4rec.torch import TabularSequenceFeatures, MLPBlock, SequentialBlock, Head, TransformerBlock

from transformers4rec.utils.schema import DatasetSchema

from transformers4rec.torch.head import NextItemPredictionTask

from transformers4rec.config import transformer
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt

In [9]:
# Load the dataset schema from the "schema.pb" file.
# NOTE(review): no cell in this notebook writes "schema.pb"; it presumably has to be
# present in the working directory beforehand — confirm.
schema = DatasetSchema.from_schema("schema.pb")

# Author's note: training works when the schema is first restricted to the item-id
# sequence column only, i.e. with the following line enabled:
# schema = schema.select_by_name(['item_id-list_trim'])
# NOTE(review): the data loader configured further down only loads
# 'item_id-list_trim'; this may be related to the failure seen with the full
# schema — confirm.

In [10]:
# Build the tabular input module from the schema: it converts the input features and
# aggregates them into one single interaction tensor.
# The currently supported aggregations are: sequential_concat and element-wise-sum.
# NOTE(review): this rebinds `inputs`, which earlier held the dict of random columns;
# the NVTabular workflow cell reads that dict and therefore has to run before this one.
# NOTE(review): max_sequence_length=20 presumably has to agree with the list trimming
# done in the NVTabular workflow and with SESSION_LENGTH_MAX in the loader cell — confirm.

inputs = TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=20,
        d_output=100,
        masking="causal",
    )

# Set the `device` attribute of the masking module to the GPU.
inputs.masking.device = 'cuda'

   * 1. Define the Transformer block: LM task + HF Transformer architecture

In [11]:
# Case 1: build an XLNet configuration from the XLNetConfig class; only the
# parameters listed here are set explicitly.
transformer_config = transformer.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=20,
)

# Body: input features -> MLP with a single 64-unit layer -> transformer block.
# The transformer block shares the masking module of the input features.
projection_block = torch4rec.MLPBlock([64])
transformer_block = torch4rec.TransformerBlock(
    transformer=transformer_config,
    masking=inputs.masking,
)
body = torch4rec.SequentialBlock(inputs, projection_block, transformer_block)

# Head: next-item prediction task on top of the body.
next_item_task = torch4rec.NextItemPredictionTask(weight_tying=True, hf_format=True)
head = torch4rec.Head(body, next_item_task, inputs=inputs)

model = torch4rec.Model(head)

# Train the model 

### Load data using old NVTabular dataloader  

In [12]:
from transformers4rec.recsys_args import DataArguments, ModelArguments, TrainingArguments

# Training options. They are written onto the TrainingArguments class itself (no
# instance is created), in the order listed here.
training_overrides = {
    "local_rank": -1,
    "world_size": 1,
    "dataloader_drop_last": True,
    "device": "cuda",
    "report_to": [],
    "debug": ["r"],
    "n_gpu": 1,
    "gradient_accumulation_steps": 32,
    "train_batch_size": 512,
    "per_device_train_batch_size": 512,
    "per_device_eval_batch_size": 512,
    "output_dir": "",
}
for option_name, option_value in training_overrides.items():
    setattr(TrainingArguments, option_name, option_value)

# Data options, set on the DataArguments class in the same way.
data_overrides = {
    "data_path": "./preproc_sessions_by_day_ts/",
    "data_loader_engine": "nvtabular",
}
for option_name, option_value in data_overrides.items():
    setattr(DataArguments, option_name, option_value)

In [13]:
import glob
# NVTabular dependencies
from nvtabular import Dataset as NVTDataset
from nvtabular.loader.torch import DLDataLoader
from nvtabular.loader.torch import TorchAsyncItr as NVTDataLoader
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

# BATCH_SIZE = 16
SESSION_LENGTH_MAX = 20

# Full feature set (currently disabled):
# x_cat_names, x_cont_names = ['item_id-list_trim', 'category-list_trim'], ['timestamp/weekday/sin-list_trim','timestamp/age_days-list_trim']

# Only the item-id sequence is loaded for now.
x_cat_names, x_cont_names = ['item_id-list_trim'], []

# Every loaded list feature is given the same maximum length.
sparse_features_max = {
    fname: SESSION_LENGTH_MAX
    for fname in x_cat_names + x_cont_names
}

# Collect the per-day training files below the configured data path (one
# "<day>/train.parquet" per day folder). glob returns matches in arbitrary order,
# so the paths are sorted (lexicographically) to make the dataset order reproducible.
train_data_paths = sorted(
    glob.glob(os.path.join(DataArguments.data_path, "*", "train.parquet"))
)
if not train_data_paths:
    # Fail early with a clear message instead of an obscure error inside NVTabular.
    raise FileNotFoundError(
        "No train.parquet files found under " + repr(DataArguments.data_path)
    )
train_dataset = NVTDataset(
    train_data_paths,
    engine="parquet",
)

def dataloader_collate_dict(inputs):
    """Collate function that keeps only the features dict.

    Args:
        inputs: Indexable collection handed over by the data loader. Only
            ``inputs[0][0]`` is read — presumably the features dict of the first
            (features, labels) item; confirm against the loader.

    Returns:
        The object stored at ``inputs[0][0]``, unchanged.
    """
    first_item = inputs[0]
    features = first_item[0]
    return features

class DLDataLoaderWrapper(DLDataLoader):
    """DLDataLoader that records the requested batch size without forwarding it.

    The ``batch_size`` keyword is removed from the arguments before the parent
    constructor runs and stored on ``self._batch_size``. When the keyword is not
    given, ``self._batch_size`` is ``None``, so the attribute always exists.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Setting the batch size directly to DLDataLoader makes it 3x slower.
        # So we set as an alternative attribute and use it within RecSysTrainer during evaluation.
        # Using a default keeps `_batch_size` defined even when no batch size is passed,
        # so later reads cannot fail with AttributeError.
        self._batch_size = kwargs.pop("batch_size", None)
        super().__init__(*args, **kwargs)


In [14]:
# Options for the NVTabular loader: categorical/continuous column names, the list
# columns and their maximum length, and the batch size taken from TrainingArguments.
list_feature_names = x_cat_names + x_cont_names
loader_options = dict(
    dataset=train_dataset,
    batch_size=TrainingArguments.train_batch_size,
    shuffle=False,
    cats=x_cat_names,
    conts=x_cont_names,
    device=0,
    labels=[],
    sparse_names=list_feature_names,
    sparse_max=sparse_features_max,
    sparse_as_dense=True,
    drop_last=False,
)
loader = NVTDataLoader(**loader_options)

# Wrap the loader; the wrapper keeps `batch_size` on the side instead of forwarding it.
dl_loader = DLDataLoaderWrapper(
    loader,
    collate_fn=dataloader_collate_dict,
    batch_size=TrainingArguments.train_batch_size,
)

# Pull the first batch for inspection.
out = next(iter(dl_loader))

In [15]:
# `loader` and `dl_loader` are already built by the previous cell with exactly these
# settings, so they are reused here instead of constructing identical copies; only
# the first batch is fetched again for inspection.
out = next(iter(dl_loader))

In [16]:
# Shape of the item-id batch; the recorded output below is (512, 20), i.e. the
# configured train batch size by SESSION_LENGTH_MAX.
out['item_id-list_trim'].shape

torch.Size([512, 20])

In [17]:
# Display the whole batch dict (feature name -> tensor).
out

{'item_id-list_trim': tensor([[18748,     0,     0,  ...,     0,     0,     0],
         [12856,  5574,     0,  ...,     0,     0,     0],
         [43739, 37018,     0,  ...,     0,     0,     0],
         ...,
         [34889, 44216,     0,  ...,     0,     0,     0],
         [45364,     0,     0,  ...,     0,     0,     0],
         [  215, 21723, 16622,  ...,     0,     0,     0]], device='cuda:0')}

- Test the output of the model : 


In [18]:
class HFWrapper(torch.nn.Module):
    """Adapter that lets a dict-consuming model be called with keyword arguments.

    ``wrapper(**features)`` packs the keyword arguments into one dict and passes
    that dict as the single positional argument to the wrapped model.
    """

    def __init__(self, model):
        """Store ``model`` as the wrapped module.

        Args:
            model: Callable module that accepts one dict of inputs.
        """
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        """Call the wrapped model with the keyword arguments gathered into a dict.

        Args:
            *args: Accepted for call compatibility; ignored.
            **kwargs: Named inputs forwarded to the wrapped model as one dict.

        Returns:
            Whatever the wrapped model returns for that dict.
        """
        inputs = kwargs
        # Use the module stored on this instance. The previous code called the
        # notebook-global `model`, which ignores the constructor argument and
        # raises NameError wherever that global does not exist.
        return self.model(inputs)

In [19]:
# Wrap the model so that it can be called with keyword arguments (see HFWrapper.forward).
model_wp = HFWrapper(model)

In [20]:
#model_wp

In [21]:
# List every entry of the model's state_dict together with the size of its tensor.
print("Model's state_dict:")
state_entries = model.state_dict()
for entry_name, entry_tensor in state_entries.items():
    print(entry_name, "\t", entry_tensor.size())

Model's state_dict:
heads.0.body.0.to_merge.categorical_module.embedding_tables.category-list_trim.weight 	 torch.Size([332, 64])
heads.0.body.0.to_merge.categorical_module.embedding_tables.item_id-list_trim.weight 	 torch.Size([51996, 64])
heads.0.body.0.projection_module.0.0.weight 	 torch.Size([100, 129])
heads.0.body.0.projection_module.0.0.bias 	 torch.Size([100])
heads.0.body.0.masking.masked_item_embedding 	 torch.Size([100])
heads.0.body.1.0.0.weight 	 torch.Size([64, 100])
heads.0.body.1.0.0.bias 	 torch.Size([64])
heads.0.body.2.transformer.mask_emb 	 torch.Size([1, 1, 64])
heads.0.body.2.transformer.word_embedding.weight 	 torch.Size([1, 64])
heads.0.body.2.transformer.layer.0.rel_attn.q 	 torch.Size([64, 4, 16])
heads.0.body.2.transformer.layer.0.rel_attn.k 	 torch.Size([64, 4, 16])
heads.0.body.2.transformer.layer.0.rel_attn.v 	 torch.Size([64, 4, 16])
heads.0.body.2.transformer.layer.0.rel_attn.o 	 torch.Size([64, 4, 16])
heads.0.body.2.transformer.layer.0.rel_attn.r 	 to

## Training 

- Basic fit and evaluate to test the model training:

- Load arguments:  
    N.B.: These classes will be updated to keep only what is required by recsys_trainer

In [22]:
# Instantiate the RecSysTrainer, which manages training and evaluation
from transformers4rec.recsys_trainer import RecSysTrainer, DatasetType

# The trainer receives the keyword-argument wrapper around the model and the
# argument classes themselves: their options were set as class attributes earlier
# in the notebook, no instances are created.
trainer = RecSysTrainer(
    model=model_wp,
    args=TrainingArguments,
    model_args=ModelArguments,
    data_args=DataArguments,
)

- Fit the model : 

In [23]:
# Hand the wrapped data loader to the trainer, reset the learning-rate scheduler
# (presumably to its initial state — confirm) and start training.
# NOTE(review): the output recorded for this cell is a CUDA/cuBLAS RuntimeError
# (CUBLAS_STATUS_INVALID_VALUE in cublasSgemm), i.e. training did not complete in
# the saved run.
trainer.set_train_dataloader(dl_loader)
trainer.reset_lr_scheduler()
trainer.train()

 ** On entry to SGEMM  parameter number 10 had an illegal value


RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`