In [1]:
from feature_process import get_feature_process
from recsys_data import get_nvtabular_dataloader
from dataclasses import dataclass

- We define two feature maps for two FeatureGroups : 

        - The first FeatureGroup uses all features present in ecomrees dataset. 
        
        - The second FeatureGroup contains only the item-id column.

In [2]:
feature_group_configs = [{ 'name' : 'session_based_features_all',
                          'feature_map' : "/workspace/transformerlib/Transformers4Rec/datasets/ecommerce_rees46/config/features/session_based_features_all.yaml"},
                         
                         { 'name' : 'session_based_features_itemid',
                          'feature_map' : "/workspace/transformerlib/Transformers4Rec/datasets/ecommerce_rees46/config/features/session_based_features_itemid.yaml"}
                         ]

## Load data 

In [3]:
@dataclass
class training_args: 
    local_rank = -1
    dataloader_drop_last = True
    
@dataclass 
class data_args: 
    session_seq_length_max = 20
    nvt_part_mem_fraction = 0.7
    nvt_part_size = None
    
data_paths = ['/data/0001/train.parquet', '/data/0002/train.parquet']

batch_size = 128

In [4]:
import yaml 
feature_maps = []
for config in feature_group_configs: 
    with open(config['feature_map']) as yaml_file:
        feature_maps.append(yaml.load(yaml_file, Loader=yaml.FullLoader))
general_feature_map = feature_maps[0]
general_feature_map.update(feature_maps[1])
    
loader = get_nvtabular_dataloader(data_args, training_args, general_feature_map, data_paths, batch_size)
it = iter(loader)
first = next(it)
second = next(it)



## Define FeatureProcess class 

- Get FeatureProcess module 

In [5]:
feature_process = get_feature_process(feature_group_configs)

- Check FeatureProcess output

In [6]:
out = feature_process(first)

    - Aggregated output of first sequence: 

In [7]:
out.feature_groups[0].values.shape

torch.Size([128, 20, 1408])

    - Aggregated output of second sequence:  

In [8]:
out.feature_groups[1].values.shape

torch.Size([128, 20, 128])

    - Label columns 

In [9]:
out.label_groups

[LabelFeature(type='item_prediction', label_column='sess_pid_seq', dimension=390000),
 LabelFeature(type='classification', label_column='sess_ccid_seq', dimension=150),
 LabelFeature(type='item_prediction', label_column='sess_pid_seq', dimension=390000)]

- columns to log as metadata 

In [10]:
out.metadata_features

['sess_price_log_norm_seq',
 'sess_relative_price_to_avg_category_seq',
 'sess_prod_recency_days_log_norm_seq',
 'sess_et_hour_sin_seq',
 'sess_et_hour_cos_seq',
 'sess_et_dayofweek_sin_seq',
 'sess_et_dayofweek_cos_seq',
 'sess_pid_seq',
 'sess_ccid_seq',
 'sess_csid_seq',
 'sess_bid_seq',
 'sess_pid_seq']

## Define Masking class 

In [12]:
from mask_sequence import MaskSequence

- Each sequence is related to its own masking scheme 

In [14]:
# masking the first sequence with all features using plm 
mask_module_1 = MaskSequence(task='plm', hidden_size=1408)

# masking the second sequence with item-id using mlm 
mask_module_2 = MaskSequence(task='mlm', hidden_size=128)

- Masking first sequence 

In [28]:
input_sequence = out.feature_groups[0].values
itemid_seq =  first[feature_process.feature_groups[0].itemid_name]
pos_emb_inp_1, labels_1, mask_labels_1, plm_target_mapping_1, plm_perm_mask_1 = mask_module_1(input_sequence,
                                                                                    itemid_seq,
                                                                                    training = True)

In [29]:
pos_emb_inp_1.shape, plm_target_mapping_1.shape, plm_perm_mask_1.shape

(torch.Size([128, 20, 1408]),
 torch.Size([128, 20, 20]),
 torch.Size([128, 20, 20]))

In [33]:
labels_2

tensor([[     0,      0,   1406,  ...,      0,      0,      0],
        [  8218,      0, 118441,  ...,      0,      0,      0],
        [     0,  10804,      0,  ...,      0,      0,      0],
        ...,
        [    61,      0,      0,  ...,      0,      0,      0],
        [   260,      0,   1567,  ...,      0,      0,      0],
        [     0,     13,     31,  ...,      0,      0,      0]],
       device='cuda:0')

- Masking second sequence 

In [31]:
input_sequence = out.feature_groups[1].values
itemid_seq =  first[feature_process.feature_groups[1].itemid_name]
pos_emb_inp_2, labels_2, mask_labels_2, plm_target_mapping_2, plm_perm_mask_2 = mask_module_2(input_sequence,
                                                                                    itemid_seq,
                                                                                    True)

In [32]:
pos_emb_inp_2.shape, plm_target_mapping_2,  plm_perm_mask_2

(torch.Size([128, 20, 128]), None, None)

In [34]:
labels_2

tensor([[     0,      0,   1406,  ...,      0,      0,      0],
        [  8218,      0, 118441,  ...,      0,      0,      0],
        [     0,  10804,      0,  ...,      0,      0,      0],
        ...,
        [    61,      0,      0,  ...,      0,      0,      0],
        [   260,      0,   1567,  ...,      0,      0,      0],
        [     0,     13,     31,  ...,      0,      0,      0]],
       device='cuda:0')

## Define Tower models 

In [37]:
from tower_model import *

- Define the model block for each feature group 

In [38]:
#model_tye, hidden_size, n_head, n_layer, total_seq_length
models = [['xlnet', 1408, 4, 2, 20], ['lstm', 128, 0, 4, 21]]

In [39]:
model = TowerModel(models)

- Get tower outputs 

In [40]:
out_1, out_2 = model([
    [pos_emb_inp_1, {'task': 'plm', 'target_mapping':plm_target_mapping, 'perm_mask':plm_perm_mask}], 
    [pos_emb_inp_2,  {'task': 'mlm'}],
])

In [41]:
out_1[0].shape, len(out_1[1])

(torch.Size([128, 20, 1408]), 2)

In [42]:
out_2[0].shape, len(out_2[1])

(torch.Size([128, 20, 128]), 1)

## Define Prediction Head 

#TBD 