# Transformer (Huggingface) for RecSys
eCommerce dataset: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category

## Data Load on PyTorch (Example)

Reference: PetaStorm (allow us to load parquet data in PyTorch or Tensorflow
- https://docs.databricks.com/applications/deep-learning/data-prep/petastorm.html#
- https://petastorm.readthedocs.io/en/latest/api.html?highlight=array#petastorm.spark.spark_dataset_converter.vector_to_array


In [None]:
# !pip install petastorm
# !pip install pyarrow==0.17.0

In [1]:
from petastorm.spark import SparkDatasetConverter, make_spark_converter
import numpy as np

### Load existing dataset

In [2]:
train_urls = [
'file:///root/dataset/20200617044630-appid-local-1592357472062-5618370f-419b-4421-be76-1ef1bc3df293/part-00000-e9c67d19-de9c-4ba0-934d-7becf19a483c-c000.parquet',
'file:///root/dataset/20200617044630-appid-local-1592357472062-5618370f-419b-4421-be76-1ef1bc3df293/part-00001-e9c67d19-de9c-4ba0-934d-7becf19a483c-c000.parquet'
]

In [3]:
eval_urls = [
'file:///root/dataset/20200617045047-appid-local-1592357472062-4796c1ec-0f9a-4ddb-b736-66a0ca1acb99/part-00000-e40aad43-ebb1-47d1-ad55-5f2ee63d9034-c000.parquet',
'file:///root/dataset/20200617045047-appid-local-1592357472062-4796c1ec-0f9a-4ddb-b736-66a0ca1acb99/part-00001-e40aad43-ebb1-47d1-ad55-5f2ee63d9034-c000.parquet'
]

In [4]:
converter_train = SparkDatasetConverter(cache_dir_url='file:///root/dataset', file_urls=train_urls, dataset_size=4890191)
converter_val = SparkDatasetConverter(cache_dir_url='file:///root/dataset', file_urls=eval_urls, dataset_size=543143)





In [5]:
print(f"train: {len(converter_train)}, val: {len(converter_val)}")


train: 4890191, val: 543143


#### Create Petastorm Data Schema
Currently we're not using this choice. This could be used to create custom Petastorm data schema to load variable-length array into dataloder

API: https://petastorm.readthedocs.io/en/latest/api.html?module-petastorm.codecs
Example: https://github.com/uber/petastorm/blob/9a903d9885759674f81b412dc133aeb9577e00dd/examples/spark_dataset_converter/pytorch_converter_example.py

In [6]:
from petastorm.unischema import UnischemaField
from petastorm.unischema import Unischema

from petastorm.codecs import NdarrayCodec


In [7]:
recsys_schema = [
#     UnischemaField('user_session', np.string_, (None), NdarrayCodec(), True),
    UnischemaField('pid_seq_zpd', np.longlong, (None), NdarrayCodec(), True),
    UnischemaField('cid_seq_zpd', np.longlong, (None), NdarrayCodec(), True),
    UnischemaField('dtime_seq_zpd', np.int, (None), NdarrayCodec(), True),
    UnischemaField('et_hour_seq_zpd', np.int, (None), NdarrayCodec(), True),
    UnischemaField('et_month_seq_zpd', np.int, (None), NdarrayCodec(), True), 
    UnischemaField('et_dayofweek_seq_zpd', np.int, (None), NdarrayCodec(), True),
    UnischemaField('et_dayofmonth_seq_zpd', np.int, (None), NdarrayCodec(), True),
]

recsys_schema_pid = [
    UnischemaField('pid_seq_zpd', np.longlong, (None), NdarrayCodec(), True),
]

### Sample PyTorch DataLoader

In [31]:
BATCH_SIZE = 256
NUM_EPOCHS = 10

#### DataCollatorForLanguageModeling


In [None]:
del DataCollatorForLanguageModelingRecSys

In [47]:
# from transformers_recsys2.data import DataCollatorForLanguageModelingRecSys
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Tuple

import torch
from torch.nn.utils.rnn import pad_sequence


@dataclass
class DataCollatorForLanguageModelingRecSys:
    """
    Data collator used for language modeling.
    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling
    """
    target_col: str = None
    _pad_token: int = 0

    def __call__(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
#         print(examples)
        _examples = [torch.LongTensor(example[self.target_col]) for example in examples]
#         print('after transform:')
#         print(_examples)
        batch = self._tensorize_batch(_examples)
        return {"input_ids": batch, "labels": batch}

    def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
        length_of_first = examples[0].size(0)
        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length:
            return torch.stack(examples, dim=0)
        else:
            return pad_sequence(examples, batch_first=True, padding_value=self._pad_token)



print(examples): 
    [{"col_name":array(...)}, {"col_name":array(...)}, {"col_name":array(...)}, {"col_name":array(...)}...]

In [48]:
data_collator = DataCollatorForLanguageModelingRecSys(
    target_col="pid_seq_zpd", 
    _pad_token=0
)


### Create custom data loader 
Background: `make_torch_dataloader` returns context manager. To work with huggingface's trainer, we should obtain non-context-manager based data loader

In [55]:
from petastorm.reader import make_batch_reader
from petastorm.pytorch import BatchedDataLoader, DataLoader
from functools import partial
import torch 

train_reader = make_batch_reader(converter_train.file_urls, 
                                 schema_fields=recsys_schema_pid,
                                num_epochs=NUM_EPOCHS)
valid_reader = make_batch_reader(converter_val.file_urls, 
                                 schema_fields=recsys_schema_pid,
                                num_epochs=NUM_EPOCHS)

In [36]:
# train_loader = BatchedDataLoader(train_reader, shuffling_queue_capacity=BATCH_SIZE * 10, batch_size=BATCH_SIZE,
#                            transform_fn=partial(torch.as_tensor, device=device))
# valid_loader = BatchedDataLoader(valid_reader, shuffling_queue_capacity=BATCH_SIZE * 10, batch_size=BATCH_SIZE,
#                            transform_fn=partial(torch.as_tensor, device=device))

train_loader = DataLoader(train_reader, 
                          batch_size=BATCH_SIZE,
                          collate_fn=data_collator)
valid_loader = DataLoader(valid_reader, 
                          batch_size=BATCH_SIZE,
                          collate_fn=data_collator)

In [35]:
del DLWrapper
del get_train_dataloader

In [37]:
class EmptyDataSet(object):
    def __init__(self, data_size):
        self.len = data_size
        
    def __len__(self):
        return self.len

class DLWrapper(object):
    def __init__(self, init_loader, data_size=100):
        self.loader = init_loader
        self.data_size = data_size
        self.dataset = EmptyDataSet(data_size)
        
    def __len__(self):
        return self.data_size
    
    def __iter__(self):
        yield iter(self.loader)
        
    def _yield_batches(self,keys):
        return self.loader._yield_batches(keys)
    
    def __enter__(self):
        return self.loader
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.loader.reader.stop()
        self.loader.reader.join()

In [40]:
train_dl_wrapper = DLWrapper(train_loader, converter_train.dataset_size) ## train_loader
valid_dl_wrapper = DLWrapper(valid_loader, converter_val.dataset_size)

# def get_train_dataloader():
#     return train_dl_wrapper
# def get_valid_dataloader():
#     return valid_dl_wrapper

## Transformer

In [None]:
# ! pip install filelock
# ! pip install tokenizers
# ! pip install sentencepiece
# ! pip install sacremoses
# ! pip install 

In [21]:
import sys
sys.path.append("./")
# from transformers.test import hello
# print(hello)


In [41]:

from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=150_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [42]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)



#### Column name re mapping

In [16]:
# col_name_mapping = {"pid_seq_zpd":"input_ids"}
col_name_mapping=None

#### Tip: remove cached class to get fresher file

In [43]:
del Trainer
del TrainingArguments

In [44]:
try:
    del sys.modules['transformers']
except AttributeError:
    pass
import transformers.trainer



In [45]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./trained/",
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_gpu_train_batch_size=BATCH_SIZE,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    prediction_loss_only=True,
    ps_train_dataloader=train_dl_wrapper,
    ps_eval_dataloader=valid_dl_wrapper,    
)

hi2


In [46]:
trainer.train()



Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4890191.0, style=ProgressStyle(descriptio…

[{'pid_seq_zpd': array([773, 773,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])}, {'pid_seq_zpd': array([91, 91,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])}, {'pid_seq_zpd': array([ 3068,  3018,  1293,  8394,  1081, 20071, 65141, 26629, 16420,
       11952,  5824, 10136, 60784, 22861, 28707, 26581,     0,     0,
           0,     0])}, {'pid_seq_zpd': array([1021, 1006,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])}, {'pid_seq_zpd': array([159, 159, 159,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])}, {'pid_seq_zpd': array([11090,  9208,     2,   230,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0])}, {'pid_seq_zpd': array([  30, 1526,   69,   69,  362, 6333,  362,  362,  362,   74,   74,
         74,  249,

after transform:
[tensor([773, 773,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]), tensor([91, 91,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]), tensor([ 3068,  3018,  1293,  8394,  1081, 20071, 65141, 26629, 16420, 11952,
         5824, 10136, 60784, 22861, 28707, 26581,     0,     0,     0,     0]), tensor([1021, 1006,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), tensor([159, 159, 159,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]), tensor([11090,  9208,     2,   230,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), tensor([  30, 1526,   69,   69,  362, 6333,  362,  362,  362,   74,   74,   74,
         249,  359,  359,  249,    0,    0,    0,    0]), tensor([  435,   317,   435, 64374,   408,   435,   317,




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4890191.0, style=ProgressStyle(descriptio…





NotImplementedError: Currently do not support resetting a reader while in the middle of iteration. You can call reset only after all samples were consumed.