In [1]:
from dataclasses import dataclass
from typing import Any

import yaml
from torch import nn

from recsys_data import get_nvtabular_dataloader
from feature_process import get_feature_process
from mask_sequence import MLM, CLM, PLM, RTD, get_masking_task
from tower_model import TowerModel
from prediction_head import ItemPrediction
from meta_model import MetaModel

- The meta-architecture is split into four main submodules:


   - **FeatureProcess:** 
        * Process multiple FeatureGroup to create the list of interactions embeddings. 
        * A FeatureGroup is defined as the combination of categoricals and continuous features (sequential or not) with the same shape. 
        * Each FeatureGroup is assigned a config file that specifies the representation type of each input and the aggregation mode.
        * FeatureProcessOutput also contains the list of LabelFeature objects, supporting multi-task prediction: classification and/or regression and/or item prediction.
        * Each LabelFeature is an inventory dataclass containing three variables : type, label_column and dimension 
         
         
   - **MaskSequence:** 
        * Create the masking schema and prepare the masked inputs and labels for the selected LM task. 
        * A base MaskSequence class is created to init common parameters and four PyTorch modules are defined : CLM, MLM, PLM and RTD
        * The MaskSequenceOutput contains five tensors: masked_input, masked_label, mask_schema, plm_target_mapping and plm_perm_mask.
       
       
   -  **TowerModel:** 
       * Define the model block related to a given group of features.
       * The input is either FeatureGroup or MaskSequenceOutput.
       * The supported models are: HF Transformers, AvgSeq, LSTM, GRU and Gru4Rec.
       * The module returns a TowerOutput containing two pieces of information: the sequence hidden representation and the tuple (attention_weights, hidden_states).
     
     
   - **PredictionHead** 
       * Extend Merlin Model "Task" class defined by Marc to define ItemPrediction Task 
       * Define the prediction task related to a given group of features. 
       * The supported tasks are: item prediction, classification and regression. 
       * The inputs are:  TowerOutput
       * The module returns predictions tensor
       
- The general **MetaModel** runs the end-to-end workflow and currently supports the item-prediction task.
       

- To test the outputs of the Meta-model submodules, we consider two feature maps for two FeatureGroups from the e-commerce REES46 (`ecommerce_rees46`) dataset:

        - The first FeatureGroup uses all features present in the e-commerce REES46 dataset.
        
        - The second FeatureGroup contains only the item-id column.

In [2]:
# One entry per FeatureGroup: a display name and the path of the YAML feature map
# describing its input columns.
# NOTE(review): the paths are absolute and machine-specific.
feature_group_configs = [
    dict(
        name='session_based_features_all',
        feature_map="/workspace/transformerlib/Transformers4Rec/datasets/ecommerce_rees46/config/features/session_based_features_all.yaml",
    ),
    dict(
        name='session_based_features_itemid',
        feature_map="/workspace/transformerlib/Transformers4Rec/datasets/ecommerce_rees46/config/features/session_based_features_itemid.yaml",
    ),
]

### Load a batch of ecomrees data for testing

In [3]:
@dataclass
class training_args:
    """Minimal stand-in for the training arguments handed to `get_nvtabular_dataloader`.

    The attributes are annotated so that `@dataclass` actually registers them as
    fields (un-annotated class attributes are ignored by the decorator). The
    defaults remain readable on the class itself, which is how this notebook
    passes the settings around.
    """
    # presumably -1 means "not running distributed" — confirm against the dataloader
    local_rank: int = -1
    dataloader_drop_last: bool = True
    
@dataclass
class data_args:
    """Minimal stand-in for the data arguments handed to `get_nvtabular_dataloader`.

    The attributes are annotated so that `@dataclass` actually registers them as
    fields (un-annotated class attributes are ignored by the decorator). The
    defaults remain readable on the class itself, which is how this notebook
    passes the settings around.
    """
    # presumably the maximum number of interactions kept per session — confirm
    session_seq_length_max: int = 20
    # NOTE(review): the two nvt_* settings are consumed by the NVTabular dataloader;
    # their exact semantics and the accepted type of nvt_part_size are not visible here.
    nvt_part_mem_fraction: float = 0.7
    nvt_part_size: Any = None
    
# NOTE(review): absolute, machine-specific parquet locations.
data_paths = ['/data/0001/train.parquet', '/data/0002/train.parquet']

batch_size = 8

# Load the YAML feature map of every configured FeatureGroup.
# NOTE(review): yaml.FullLoader can build more than plain data types. These are local
# config files, but consider yaml.safe_load if they only contain plain mappings.
feature_maps = []
for config in feature_group_configs:
    with open(config['feature_map']) as yaml_file:
        feature_maps.append(yaml.load(yaml_file, Loader=yaml.FullLoader))

# Merge every feature map into a *new* dict so that none of the loaded maps is
# mutated; on duplicate keys the later map wins, exactly as dict.update does.
general_feature_map = {
    column: spec
    for loaded_map in feature_maps
    for column, spec in loaded_map.items()
}

loader = get_nvtabular_dataloader(data_args, training_args, general_feature_map, data_paths, batch_size)

# Keep a single batch around; the rest of the notebook only inspects `first`.
it = iter(loader)
first = next(it)



### End-to-End Meta-Model 

In [4]:
# End-to-end model for next-item prediction, built on the first FeatureGroup
# only (the one that uses all features).
meta_model = MetaModel(
    feature_group_config=[feature_group_configs[0]],
    model_type='xlnet',
    masking_task='mlm',
    max_seq_length=20,
    n_head=4,
    n_layer=2,
)

In [5]:
meta_model

MetaModel(
  (feature_group): FeatureGroupProcess(
    (aggregate): Aggregation()
  )
  (mask_task): MLM()
  (tower_model): TowerModel(
    (model): XLNetModel(
      (word_embedding): Embedding(1, 1408)
      (layer): ModuleList(
        (0): XLNetLayer(
          (rel_attn): XLNetRelativeAttention(
            (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (ff): XLNetFeedForward(
            (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=True)
            (layer_1): Linear(in_features=1408, out_features=5632, bias=True)
            (layer_2): Linear(in_features=5632, out_features=1408, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (1): XLNetLayer(
          (rel_attn): XLNetRelativeAttention(
            (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=True)
      

In [6]:
# Forward pass of the sample batch through the whole meta-model in training mode.
output = meta_model(first, training=True)

In [7]:
output.keys()

dict_keys(['loss', 'labels', 'predictions', 'model_outputs'])

In [8]:
output['loss']

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)

In [9]:
output['predictions']

tensor([[-0.0488,  0.0187,  0.0107,  ...,  0.1496,  0.0336,  0.0942],
        [-0.0674,  0.0841,  0.0347,  ...,  0.0957,  0.0773,  0.0918],
        [-0.0330,  0.0829, -0.0055,  ...,  0.0299,  0.0080,  0.1231],
        ...,
        [-0.0165, -0.0401, -0.0442,  ..., -0.0633, -0.0176,  0.1336],
        [-0.0735,  0.0090, -0.0296,  ...,  0.0582, -0.0360,  0.0660],
        [ 0.0124, -0.0245,  0.0098,  ..., -0.0351, -0.0303, -0.0415]],
       device='cuda:0', grad_fn=<AddmmBackward>)

## Step by step testing 

### Define FeatureProcess class 

- Get FeatureProcess module 

In [10]:
# Build the FeatureProcess module from both FeatureGroup configs (all features + item-id only).
feature_process = get_feature_process(feature_group_configs)

- Check FeatureProcess output

In [11]:
# Run the sample batch through FeatureProcess; the cells below inspect the
# `feature_groups`, `label_groups` and `metadata_features` of the result.
out = feature_process(first)

    - Aggregated output of the first sequence: 

In [12]:
out.feature_groups[0].values.shape

torch.Size([8, 20, 1408])

    - Aggregated output of the second sequence:  

In [13]:
out.feature_groups[1].values.shape

torch.Size([8, 20, 128])

    - Label columns 

In [14]:
out.label_groups

[LabelFeature(type='item_prediction', label_column='sess_pid_seq', dimension=390000),
 LabelFeature(type='classification', label_column='sess_ccid_seq', dimension=150),
 LabelFeature(type='item_prediction', label_column='sess_pid_seq', dimension=390000)]

- columns to log as metadata 

In [15]:
out.metadata_features

['sess_price_log_norm_seq',
 'sess_relative_price_to_avg_category_seq',
 'sess_prod_recency_days_log_norm_seq',
 'sess_et_hour_sin_seq',
 'sess_et_hour_cos_seq',
 'sess_et_dayofweek_sin_seq',
 'sess_et_dayofweek_cos_seq',
 'sess_pid_seq',
 'sess_ccid_seq',
 'sess_csid_seq',
 'sess_bid_seq',
 'sess_pid_seq']

### Define Masking class 

- Each sequence has its own masking scheme

In [16]:
# Each masking module is applied to the aggregated embeddings of one FeatureGroup,
# so its hidden size is read from the FeatureProcess output instead of being
# hard-coded (1408 and 128 respectively with the current feature maps).

# masking the first sequence with all features using plm
mask_module_1 = PLM(hidden_size=out.feature_groups[0].values.shape[-1])

# masking the second sequence with item-id using mlm
mask_module_2 = MLM(hidden_size=out.feature_groups[1].values.shape[-1])

- Masking first sequence 

In [17]:
# Inputs of the PLM masking task for the first FeatureGroup: the raw item-id
# column of the batch and the aggregated embeddings of the group.
itemid_seq = first[feature_process.feature_groups[0].itemid_name]
input_sequence = out.feature_groups[0].values

plm_out = mask_module_1(input_sequence, itemid_seq, training=True)

In [18]:
plm_out.masked_label

tensor([[     0,      0,      0,    251,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [  8218,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,      0,      0,      0,      0,      0,   3641,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,   1822,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,      0,      0, 107833,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [   830,   2520,   1389,      0,      0,      0,    

- Masking second sequence 

In [19]:
# Inputs of the MLM masking task for the second FeatureGroup: the raw item-id
# column of the batch and the aggregated embeddings of the group.
itemid_seq = first[feature_process.feature_groups[1].itemid_name]
input_sequence = out.feature_groups[1].values

# NOTE(review): the third positional argument is presumably `training`, as in the
# PLM call — confirm against MLM's forward signature before switching to a keyword.
mlm_out = mask_module_2(input_sequence, itemid_seq, True)

In [20]:
mlm_out.masked_label

tensor([[     0,      0,   1406,    251,   1661,    319,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [  8218,   9600,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,  10804,      0,   6258,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,   1822,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,      0,      0,      0,  46551,      0, 107833,      0, 107833,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [     0,      0,   1389,      0,      0,      0,    

### Define Tower models 

- Define the model block for each feature group 

In [21]:
# XLNet tower for the all-features group; hidden_size equals the width of that
# group's aggregated embeddings (1408).
model_1 = TowerModel(
    max_seq_length=20,
    model_type='xlnet',
    hidden_size=1408,
    n_head=4,
    n_layer=2,
)

In [22]:
model_1

TowerModel(
  (model): XLNetModel(
    (word_embedding): Embedding(1, 1408)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=True)
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=True)
          (layer_1): Linear(in_features=1408, out_features=5632, bias=True)
          (layer_2): Linear(in_features=5632, out_features=1408, bias=True)
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=True)
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1408,), eps=0.03, elementwise_affine=Tru

- Get tower outputs 

In [23]:
model_1(plm_out).hidden_rep.shape

torch.Size([8, 20, 1408])

### Define Prediction Head 

In [24]:
# Linear projection from the 1408-wide tower output down to 128 dimensions,
# placed on the GPU like the rest of the tensors in this notebook.
body = nn.Linear(in_features=1408, out_features=128)
body = body.to('cuda')

In [27]:
import torch


def remove_pad_3d(inp_tensor: torch.Tensor, non_pad_mask: torch.Tensor) -> torch.Tensor:
    """Flatten a batch of sequences and keep only the positions selected by a mask.

    Args:
        inp_tensor: tensor of shape (n_batch, seqlen, emb_dim).
        non_pad_mask: boolean tensor with n_batch * seqlen elements, either
            already flattened or shaped (n_batch, seqlen). True marks the
            positions to keep.

    Returns:
        Tensor of shape (n_kept, emb_dim) with the selected vectors, ordered by
        batch row first and sequence position second.
    """
    # Collapse the batch and sequence axes: (n_batch * seqlen, emb_dim).
    flat_inputs = inp_tensor.flatten(end_dim=1)
    # Flatten the mask as well so that a (n_batch, seqlen) mask is accepted too.
    flat_mask = non_pad_mask.reshape(-1)
    # Boolean row indexing keeps whole embedding vectors of the selected positions.
    return flat_inputs[flat_mask]

In [28]:
# One label per (session, position); entries equal to 0 are treated as padding
# and dropped from both the labels and the tower outputs.
trg_flat = plm_out.masked_label.flatten()
non_pad_mask = trg_flat.ne(0)
labels_all = trg_flat[non_pad_mask]
pred_all = remove_pad_3d(model_1(plm_out).hidden_rep, non_pad_mask)

In [32]:
# Item-prediction head for the first label group of the all-features FeatureGroup.
# The NLL loss skips targets equal to 0.
t = ItemPrediction(
    loss=nn.NLLLoss(ignore_index=0),
    task=out.label_groups[0],
    body=body,
    feature_process=feature_process.feature_groups[0],
)

In [33]:
t.compute_loss(inputs=pred_all, targets=labels_all)

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)

In [34]:
t(pred_all).shape

torch.Size([10, 390000])