In [1]:
from os import makedirs
from os.path import join
import logging
import numpy as np
import torch
import random
import sys

In [2]:
# Add the parent directory to the module search path so that imports in later
# cells can be resolved from one level above this notebook.
# NOTE(review): presumably the project packages (args, data, models, scripts)
# live there — confirm against the repository layout.
sys.path.append('../')
from torchsummary import summary
from torch.utils.data import DataLoader

In [5]:
from args import define_main_parser

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

#from dataset.prsa import PRSADataset
from data.card import TransactionDataset
from models.modules import TabFormerBertLM
from scripts.utils import random_split_dataset
from data.datacollator import TransDataCollatorForLanguageModeling


In [9]:
from argparse import Namespace

# Run configuration as a plain dict (vars() exposes the Namespace attributes).
# NOTE(review): the values appear to mirror the project's command-line
# defaults — confirm against the argument parser.
config = vars(
    Namespace(
        cached=False,
        checkpoint=0,
        data_extension='',
        data_fname='card_transaction.v1',
        data_root='./data/credit_card/',
        data_type='card',
        do_eval=False,
        do_train=True,
        field_ce=True,
        field_hs=64,
        flatten=False,
        jid=1,
        lm_type='bert',
        log_dir='sam/logs',
        mlm=True,
        mlm_prob=0.15,
        nrows=None,
        num_train_epochs=3,
        output_dir='sam',
        save_steps=500,
        seed=9,
        skip_user=False,
        stride=5,
        user_ids=None,
        vocab_file='vocab.nb',
    )
)

# Notebook-specific paths replace the three path entries set above.
config.update(
    data_root="../dataset/credit_card/",
    output_dir="sample",
    log_dir="sample/logs",
)

# Create the output and log directories if they do not exist yet.
makedirs(config['output_dir'], exist_ok=True)
makedirs(config['log_dir'], exist_ok=True)


In [10]:
# Seed every random number generator used here with the configured seed.
seed = config['seed']
torch.manual_seed(seed)   # torch
np.random.seed(seed)      # numpy
random.seed(seed)         # python
if torch.cuda.is_available():
    # Only touch the CUDA generators when a GPU is actually present.
    torch.cuda.manual_seed_all(seed)

In [11]:
# Build the transaction dataset.
# Paths, file name and the mlm / cached / flatten switches come from `config`;
# nrows, user_ids, seq_len, stride, return_labels and skip_user are fixed in
# this cell and do not read the corresponding `config` entries.
dataset = TransactionDataset(
    root=config['data_root'],
    fname=config['data_fname'],
    fextension="",
    vocab_dir=config['output_dir'],
    nrows=None,
    user_ids=None,
    seq_len=20,
    mlm=config['mlm'],
    cached=config['cached'],
    stride=10,
    flatten=config['flatten'],
    return_labels=False,
    skip_user=True,
)

In [7]:
# Display the sequence length stored on the dataset object.
dataset.seq_len

20

In [8]:
# Vocabulary built by the dataset, and the special tokens it defines.
vocab = dataset.vocab
custom_special_tokens = vocab.get_special_tokens()

# Split sizes: the training set gets 60% of the samples (rounded down).
totalN = len(dataset)
trainN = int(0.6 * totalN)

# The remainder is halved between validation and test; when it is odd the
# extra sample goes to the test set, so the three sizes always sum to totalN.
valtestN = totalN - trainN
valN = int(valtestN * 0.5)
testN = valtestN - valN

In [9]:
# Split sizes in train / validation / test order.
lengths = [trainN, valN, testN]

In [10]:
# Report the split sizes: absolute counts first, then fractions of the total.
print(f"# lengths: train [{trainN}]  valid [{valN}]  test [{testN}]")
print(
    "# lengths: train [{:.2f}]  valid [{:.2f}]  test [{:.2f}]".format(
        trainN / totalN,
        valN / totalN,
        testN / totalN,
    )
)

# lengths: train [599]  valid [200]  test [200]
# lengths: train [0.60]  valid [0.20]  test [0.20]


In [11]:
# Split the dataset into train / eval / test parts using the sizes in `lengths`.
# NOTE(review): presumably a random split driven by the seeds set earlier —
# confirm in scripts.utils.random_split_dataset.
train_dataset, eval_dataset, test_dataset = random_split_dataset(dataset, lengths)

In [12]:
#train_dataset.dataset.__getitem__(1899)

In [13]:
# Build the TabFormer BERT language model wrapper from the dataset vocabulary
# and the field / flatten settings held in `config`.
tab_net = TabFormerBertLM(
    custom_special_tokens,
    vocab=vocab,
    field_ce=config['field_ce'],
    flatten=config['flatten'],
    ncols=dataset.ncols,
    field_hidden_size=config['field_hs'],
)

In [14]:
# Name of the collator class to use for masked-language-model batches.
collactor_cls = "TransDataCollatorForLanguageModeling"

# Resolve the class through an explicit lookup table rather than eval(), so
# only the two collators imported by this notebook can be instantiated and an
# unknown name fails with a clear KeyError.
data_collator = {
    "TransDataCollatorForLanguageModeling": TransDataCollatorForLanguageModeling,
    "DataCollatorForLanguageModeling": DataCollatorForLanguageModeling,
}[collactor_cls](
        tokenizer=tab_net.tokenizer, mlm=config['mlm'], mlm_probability=config['mlm_prob']
    )

In [15]:
# Hugging Face training configuration driven by `config`.
# Evaluation options (do_eval, evaluation_strategy, eval_steps) are not set
# in this cell.
training_args = TrainingArguments(
    output_dir=config['output_dir'],              # output directory
    num_train_epochs=config['num_train_epochs'],  # total number of training epochs
    logging_dir=config['log_dir'],                # directory for storing logs
    save_steps=config['save_steps'],
    do_train=config['do_train'],
    prediction_loss_only=True,
    overwrite_output_dir=True,
)

In [16]:
# Wire the underlying model, training arguments, collator and the
# train / eval splits into a Hugging Face Trainer.
trainer = Trainer(
    model=tab_net.model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [17]:
# Stand-alone loader over the training split, used for manual inspection:
# batches of 10 samples assembled by the same collator the Trainer uses.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=10,
    collate_fn=data_collator,
)

In [18]:
#trainer.train()


In [19]:
# Shorthand for the model held by the TabFormer wrapper.
model = tab_net.model

In [25]:
# Debug cell: push a single batch through the model and print what comes
# back. The loop exits after the first batch.
for inps in train_dataloader:
    # The collated batch is a mapping; show its keys and tensor shapes.
    print(inps.keys())
    print(inps['input_ids'].shape)
    print(inps['masked_lm_labels'].shape)
    #print(inps['masked_lm_labels'], )
    # The model is called with the batch entries as keyword arguments and its
    # return value is unpacked into two objects.
    # NOTE(review): what `aa` and `out` contain is not visible in this
    # notebook — confirm against the model's forward().
    aa, out =model(**inps)
    print(len(aa))
    print("Length of out - ", len(out))
    print('Ouput -', out[0].shape)
    #print('Ouput -', out[1].shape)
    print(aa[0])
    print(aa[1].shape)
    #aa.last_hidden_state
    break

dict_keys(['input_ids', 'masked_lm_labels'])
torch.Size([10, 20, 11])
torch.Size([10, 20, 11])
Last Hidden State shape torch.Size([10, 20, 704])
Sequence Output shape - torch.Size([10, 20, 704])
Output shape - [10, 20, 704]
Expected shape - [10, 220, -1]
Sequence output - torch.Size([10, 220, 64])
Masked lm labels - torch.Size([10, 220])
Prediction score shape - torch.Size([10, 220, 1062])
[0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176, 187, 198, 209] for Card
Prediction score for loss - torch.Size([200, 3])
Masked score for loss - torch.Size([200])
Masked loss - torch.Size([])
[1, 12, 23, 34, 45, 56, 67, 78, 89, 100, 111, 122, 133, 144, 155, 166, 177, 188, 199, 210] for Timestamp
Prediction score for loss - torch.Size([200, 10])
Masked score for loss - torch.Size([200])
Masked loss - torch.Size([])
[2, 13, 24, 35, 46, 57, 68, 79, 90, 101, 112, 123, 134, 145, 156, 167, 178, 189, 200, 211] for Amount
Prediction score for loss - torch.Size([200, 10])
Masked sco

In [79]:
#torch.save()

In [78]:
#trainer.train()

In [22]:
from transformers import BertModel

In [49]:
# Save the model under the 'new_mode' directory; the captured output below
# shows config.json and pytorch_model.bin being written there.
tab_net.model.save_pretrained('new_mode')

Configuration saved in new_mode/config.json
Model weights saved in new_mode/pytorch_model.bin


In [13]:
# Rebind `tab_net` to a newly constructed TabFormerBertLM, using the same
# arguments as the earlier construction cell.
tab_net = TabFormerBertLM(
    custom_special_tokens,
    vocab=vocab,
    field_ce=config['field_ce'],
    flatten=config['flatten'],
    ncols=dataset.ncols,
    field_hidden_size=config['field_hs'],
)

In [34]:
#tab_net.model.tb_model.save_pretrained('new_mode')

In [33]:
#tab_net.model.tb_model = tab_net.model.tb_model.from_pretrained('new_mode',vocab=vocab)

In [29]:
#tab_net.model.tab_embeddings

In [30]:
#tab_net.model.tb_model = tab_net.model.tb_model.from_pretrained('new_mode')

In [31]:
#summary(tab_net.model.tb_model)

In [32]:
#tab_net.model.tb_model.bert.encoder

In [20]:
from models.lstm_classifier import LSTM

In [28]:
# Keep a reference to the wrapper's model under a feature-extractor name.
bert_fe_model = tab_net.model

In [21]:
# Stand-alone LSTM classifier head.
# NOTE(review): 1062 matches the last dimension of the "Prediction score
# shape" printed earlier in this notebook — presumably the vocabulary size;
# confirm and derive it from the vocab instead of hard-coding.
classifier_model = LSTM(emb_inp_size=1062)

In [25]:
import torch.nn as nn


In [35]:
# Rebind `tab_net` once more to a newly constructed TabFormerBertLM, using
# the same arguments as the earlier construction cells.
tab_net = TabFormerBertLM(
    custom_special_tokens,
    vocab=vocab,
    field_ce=config['field_ce'],
    flatten=config['flatten'],
    ncols=dataset.ncols,
    field_hidden_size=config['field_hs'],
)

In [96]:
class Classifier(nn.Module):
    """TabFormer BERT model followed by an LSTM classification head.

    The module builds its own ``TabFormerBertLM`` and keeps references to two
    of its parts: the field-level embedding module (``tab_embeddings``) and
    the BERT model (``tb_model``). The BERT output is regrouped per
    transaction row and passed to the LSTM classifier.

    Args:
        custom_special_tokens: Special tokens forwarded to ``TabFormerBertLM``.
        vocab: Vocabulary object forwarded to ``TabFormerBertLM``.
        field_ce: Forwarded to ``TabFormerBertLM``.
        flatten: Forwarded to ``TabFormerBertLM``.
        ncols: Number of columns, forwarded to ``TabFormerBertLM``.
        field_hidden_size: Forwarded to ``TabFormerBertLM``.
        bert_feature_size: Size of the last dimension of the features fed to
            the LSTM head (passed to it as ``emb_inp_size``).
    """

    def __init__(self, custom_special_tokens,
                 vocab,
                 field_ce,
                 flatten,
                 ncols,
                 field_hidden_size,
                 bert_feature_size
                 ):
        super().__init__()
        self.tab_net = TabFormerBertLM(custom_special_tokens,
                                        vocab=vocab,
                                        field_ce=field_ce,
                                        flatten=flatten,
                                        ncols=ncols,
                                        field_hidden_size=field_hidden_size)

        # Aliases for the two stages of the wrapped model that forward() calls.
        self.field_transformer = self.tab_net.model.tab_embeddings
        self.bert = self.tab_net.model.tb_model

        self.classifier = LSTM(emb_inp_size=bert_feature_size)

    def forward(self, input_ids, input_args):
        """Run one batch through the embeddings, BERT and the LSTM head.

        Args:
            input_ids: 3-D tensor of token ids, unpacked here as
                ``(batch, seq_len, n_fields)``.
            input_args: Extra keyword arguments passed through to the BERT
                model, e.g. ``{'masked_lm_labels': ...}``.

        Returns:
            Whatever the LSTM classifier returns for the regrouped features.
        """
        field_embeddings = self.field_transformer(input_ids)
        bert_outputs = self.bert(inputs_embeds=field_embeddings, **input_args)

        # The second element of the BERT output holds the features.
        # NOTE(review): this assumes labels are supplied in `input_args`, so
        # that element 0 is the loss and element 1 the per-token scores —
        # confirm against the model's forward().
        bert_features = bert_outputs[1]

        # Take every dimension from the tensors themselves rather than
        # hard-coding (10, 20, 11, 1062), so any batch size (including a short
        # final batch), sequence length, field count and feature size works.
        batch_size, seq_len, n_fields = input_ids.shape
        feature_size = bert_features.shape[-1]

        # One row per transaction: (batch * seq_len, n_fields, feature_size).
        bert_features = bert_features.reshape(
            (batch_size * seq_len, n_fields, feature_size)
        )

        # `torch` is imported in the first cell; the `T` alias used before was
        # only defined in a later cell.
        cls_out = self.classifier(bert_features, torch.as_tensor([n_fields]))
        return cls_out
    

In [97]:
# Instantiate the combined BERT + LSTM classifier with the same settings used
# for the stand-alone TabFormer model; the feature size is fixed at 1062.
classif = Classifier(
    custom_special_tokens,
    vocab=vocab,
    field_ce=config['field_ce'],
    flatten=config['flatten'],
    ncols=dataset.ncols,
    field_hidden_size=config['field_hs'],
    bert_feature_size=1062,
)

In [98]:
#summary(classif)

In [99]:
# Bind torch under the short alias `T` and check that as_tensor turns a
# one-element list into a one-element tensor (see the output below).
import torch as T
T.as_tensor(([232]))

tensor([232])

In [100]:
# Smoke test: run a single batch through the combined classifier and print
# the result. The loop exits after the first batch.
for inps in train_dataloader:
    # Take input_ids out of the batch dict; the remaining entries are handed
    # to the classifier as its second argument.
    input_ids = inps.pop('input_ids')
    print(input_ids.shape)
    print(inps['masked_lm_labels'].shape)
    #print(inps['masked_lm_labels'], )
    print(inps.keys())
    
    class_out =classif(input_ids, inps)
    
    print(f"Class out - {class_out.shape}")
    print(class_out)
    break

torch.Size([10, 20, 11])
torch.Size([10, 20, 11])
dict_keys(['masked_lm_labels'])
Last Hidden State shape torch.Size([10, 20, 704])
Sequence Output shape - torch.Size([10, 20, 704])
Output shape - [10, 20, 704]
Expected shape - [10, 220, -1]
Sequence output - torch.Size([10, 220, 64])
Masked lm labels - torch.Size([10, 220])
Prediction score shape - torch.Size([10, 220, 1062])
[0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176, 187, 198, 209] for Card
Prediction score for loss - torch.Size([200, 3])
Masked score for loss - torch.Size([200])
Masked loss - torch.Size([])
[1, 12, 23, 34, 45, 56, 67, 78, 89, 100, 111, 122, 133, 144, 155, 166, 177, 188, 199, 210] for Timestamp
Prediction score for loss - torch.Size([200, 10])
Masked score for loss - torch.Size([200])
Masked loss - torch.Size([])
[2, 13, 24, 35, 46, 57, 68, 79, 90, 101, 112, 123, 134, 145, 156, 167, 178, 189, 200, 211] for Amount
Prediction score for loss - torch.Size([200, 10])
Masked score for loss -

1. Append the classifier to the BERT model.
2. Freeze the BERT model after the first round of training.
3. Use the BERT model to train the classifier.