In [1]:
from os import makedirs
from os.path import join
import logging
import numpy as np
import torch
import random
import sys
from tqdm import tqdm

In [9]:
!pip install torchsummary

Defaulting to user installation because normal site-packages is not writeable
Collecting torchsummary
  Using cached torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [10]:
sys.path.append('../')
from torchsummary import summary
from torch.utils.data import DataLoader

In [11]:
from args import define_main_parser

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

#from dataset.prsa import PRSADataset
from data.card import TransactionDataset
from models.modules import TabFormerBertLM
from scripts.utils import random_split_dataset
#from data.datacollator import TransDataCollatorForLanguageModeling
import data.datacollator as datacoll

In [12]:
from importlib import reload

In [13]:
reload(datacoll)

<module 'data.datacollator' from '/home/admin/murugesh/Clinical-Transformer/notebook/../data/datacollator.py'>

In [34]:
from argparse import Namespace
config = vars(Namespace(cached=False, checkpoint=0, data_extension='', data_fname='card_transaction.v3', data_root='./data/credit_card/', data_type='card', do_eval=False, do_train=True, field_ce=True, field_hs=64, flatten=False, jid=1, lm_type='bert', log_dir='sam/logs', mlm=True, mlm_prob=0.15, nrows=None, num_train_epochs=3, output_dir='sam', save_steps=500, seed=9, skip_user=False, stride=5, user_ids=None, vocab_file='vocab.nb'))
config['data_root'] = "../dataset/credit_card/"
config['output_dir'] = "sample"
config['log_dir'] = "sample/logs"
makedirs(config['output_dir'], exist_ok=True)
makedirs(config['log_dir'], exist_ok=True)

In [35]:
seed = config['seed']
random.seed(seed)  # python
np.random.seed(seed)  # numpy
torch.manual_seed(seed)  # torch
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)  # torch.cuda

In [36]:
dataset = TransactionDataset(root=config['data_root'],
                            fname=config['data_fname'],
                            fextension="",
                            vocab_dir=config['output_dir'],
                            nrows=None,
                            user_ids=None,
                            seq_len=20,
                            mlm=True,
                            cached=config['cached'],
                            stride=10,
                            flatten=config['flatten'],
                            return_labels=True,
                            skip_user=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 31.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:05<00:00,  3.38it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:03<00:00,  6.46it/s]


In [37]:
return_data = torch.tensor(dataset.data[20], dtype=torch.long)
return_data = torch.tensor(dataset.data[20], dtype=torch.long).reshape(dataset.seq_len, -1)
return_data2 = (return_data, torch.tensor(dataset.labels[20], dtype=torch.long))

  return_data2 = (return_data, torch.tensor(dataset.labels[20], dtype=torch.long))


In [38]:
#return_data2

In [39]:
b = [return_data2 for _ in range(200)]

In [40]:
labs = [torch.tensor(e[1], dtype=torch.long) for e in b]

  labs = [torch.tensor(e[1], dtype=torch.long) for e in b]


In [21]:
#labs

In [41]:
vocab = dataset.vocab
custom_special_tokens = vocab.get_special_tokens()

totalN = len(dataset)
totalN = len(dataset)
trainN = int(0.6 * totalN)

valtestN = totalN - trainN
valN = int(valtestN * 0.5)
testN = valtestN - valN
lengths = [trainN, valN, testN]
print(f"# lengths: train [{trainN}]  valid [{valN}]  test [{testN}]")
print("# lengths: train [{:.2f}]  valid [{:.2f}]  test [{:.2f}]".format(trainN / totalN, valN / totalN,
                                                                               testN / totalN))

# lengths: train [17709]  valid [5903]  test [5903]
# lengths: train [0.60]  valid [0.20]  test [0.20]


In [42]:
train_dataset, eval_dataset, test_dataset = random_split_dataset(dataset, lengths)

In [43]:
tab_net = TabFormerBertLM(custom_special_tokens,
                                  vocab=vocab,
                                  field_ce=config['field_ce'],
                                  flatten=config['flatten'],
                                  ncols=dataset.ncols,
                                  field_hidden_size=config['field_hs']
                                  )

In [44]:
collactor_cls = "TransDataCollatorForLanguageModeling"
data_collator = datacoll.TransDataCollatorForLanguageModeling(
        tokenizer=tab_net.tokenizer, mlm=True, mlm_probability=config['mlm_prob']
    )

In [52]:
train_dataloader = DataLoader(
            train_dataset,
            batch_size=100,
            collate_fn=data_collator)

In [53]:
# training_args = TrainingArguments(
#         output_dir=config['output_dir'],  # output directory
#         num_train_epochs=config['num_train_epochs'],  # total number of training epochs
#         logging_dir=config['log_dir'],  # directory for storing logs
#         save_steps=config['save_steps'],
#         do_train=config['do_train'],
#         # do_eval=args.do_eval,
#         # evaluation_strategy="epoch",
#         prediction_loss_only=True,
#         overwrite_output_dir=True,
#         # eval_steps=10000
#     )
# trainer = Trainer(
#         model=tab_net.model,
#         args=training_args,
#         data_collator=data_collator,
#         train_dataset=train_dataset,
#         eval_dataset=eval_dataset,
#     )

In [54]:
model = tab_net.model
model = model.to('cuda:3')

In [55]:
optim_params = {'betas': (0.9, 0.999), 'eps': 1e-08, 'lr': 5e-05}
optim = torch.optim.AdamW(model.parameters(), **optim_params)

In [56]:
model = model.eval()

In [57]:
for inps in train_dataloader:
    print(inps.keys())
    inps['masked_lm_labels'] = torch.clone(inps['input_ids'])
    #print(inps['input_ids'] == inps['masked_lm_labels'])
    #print(inps['Ouput'])
    break

dict_keys(['input_ids', 'masked_lm_labels', 'Ouput'])


In [61]:
total_loss = 0
for inps in tqdm(train_dataloader):
    #print(inps.keys())
    #print(inps['input_ids'].shape)
    #print(inps['masked_lm_labels'].shape)
    #print(inps['masked_lm_labels'], )
    optim.zero_grad()
    #print(inps['input_ids'].shape)
    labels = inps.pop("Ouput")
    model.train()
    inps['input_ids'] = inps['input_ids'].to('cuda:3')
    inps['masked_lm_labels'] = inps['masked_lm_labels'].to('cuda:3')
    outputs =model(**inps)
    
    loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
    print(loss)
    loss.backward()
    optim.step()
    total_loss += loss.item()
    break
    #print(loss)
    # print(len(aa))
    # print("Length of out - ", len(out))
    # print('Ouput -', out[0].shape)
    # #print('Ouput -', out[1].shape)
    # print(aa[0])
    # print(aa[1].shape)
    # #aa.last_hidden_state
    # break


  0%|                                                                                                     | 0/178 [00:00<?, ?it/s]

tensor(nan, device='cuda:3', grad_fn=<AddBackward0>)


  0%|                                                                                                     | 0/178 [00:00<?, ?it/s]


In [59]:
total_loss

nan

In [32]:
print(total_loss/len(train_dataloader))

nan


In [24]:
#torch.save()

In [25]:
#trainer.train()

In [26]:
from transformers import BertModel

In [None]:
model

In [65]:
model.save_pretrained('new_mode1')

In [None]:
# tab_net = TabFormerBertLM(custom_special_tokens,
#                                   vocab=vocab,
#                                   field_ce=config['field_ce'],
#                                   flatten=config['flatten'],
#                                   ncols=dataset.ncols,
#                                   field_hidden_size=config['field_hs']
#                                   )

In [34]:
#tab_net.model.tb_model.save_pretrained('new_mode')

In [103]:
loaded_val = torch.load('new_mode1/pytorch_model.bin')

In [104]:
model.load_state_dict(loaded_val)

<All keys matched successfully>

In [105]:
#tab_net.model.tb_model = tab_net.model.tb_model.from_pretrained('new_mode')

In [106]:
#summary(tab_net.model.tb_model)

In [107]:
#tab_net.model.tb_model.bert.encoder

In [1]:
import models.lstm_classifier as lstm_classfr
reload(lstm_classfr)

ModuleNotFoundError: No module named 'models'

In [121]:
bert_fe_model = tab_net.model

In [122]:
classifier_model = lstm_classfr.LSTM(emb_inp_size=1062)

In [123]:
import torch.nn as nn


In [124]:
tab_net = TabFormerBertLM(custom_special_tokens,
                                  vocab=vocab,
                                  field_ce=config['field_ce'],
                                  flatten=config['flatten'],
                                  ncols=dataset.ncols,
                                  field_hidden_size=config['field_hs']
                                  )

In [125]:
train_dataloader_cls = DataLoader(
            train_dataset,
            batch_size=200)

In [138]:
class Classifier(nn.Module):
    def __init__(self, custom_special_tokens,
                 vocab,
                 field_ce,
                 flatten,
                 ncols,
                 field_hidden_size,
                 bert_feature_size,
                 base_model
                 ):
        super(Classifier, self).__init__()
        '''
        self.tab_net = TabFormerBertLM(custom_special_tokens,
                                        vocab=vocab,
                                        field_ce=field_ce,
                                        flatten=flatten,
                                        ncols=ncols,
                                        field_hidden_size=field_hidden_size)
        '''
        loaded_val = torch.load('new_mode1/pytorch_model.bin')
        base_model.load_state_dict(loaded_val)
        #print(base_model)
        for p in base_model.parameters():
            p.requires_grad = True
        self.field_transformer = base_model.tab_embeddings
        self.bert = base_model.tb_model
        


        self.classifier = lstm_classfr.LSTM(emb_inp_size=bert_feature_size)

    
    def forward(self, input_ids ,input_args):
        field_embeddings = self.field_transformer(input_ids)
        #input_args['input_ids'] = input_ids
        bert_features = self.bert(inputs_embeds=field_embeddings, **input_args)
        
        bert_features = bert_features[1]
        print(bert_features.shape)
        bert_features = bert_features.reshape((50, 20, 11, 8721))
        bert_features = bert_features.reshape((50, 220, 8721))
        cls_out = self.classifier(bert_features, T.as_tensor(([11])))
        
        return cls_out
    

In [139]:
classif = Classifier(custom_special_tokens,
                 vocab=vocab,
                    field_ce=config['field_ce'],
                    flatten=config['flatten'],
                    ncols=dataset.ncols,
                    field_hidden_size=config['field_hs'],
                 bert_feature_size=8721, base_model=model)
classif = classif.to('cuda:3')

In [140]:
#model.tb_model

In [141]:
import torch as T
T.as_tensor(([232]))

tensor([232])

In [142]:
optimizer_fn = torch.optim.Adadelta(params=classif.parameters(), lr=0.1, rho=0.95, eps=1e-08)
loss_fn = T.nn.CrossEntropyLoss()

In [143]:
for inps in train_dataloader:
    print(inps.keys())
    input_ids = inps.pop('input_ids')
    out = inps.pop('Ouput')
    classif.train()
    
    #print(out)
    #print(input_ids.shape)
    #print(inps['masked_lm_labels'].shape)
    #print(inps['masked_lm_labels'], )
    #print(inps[0].shape)
    #print(inps[1].shape)
    input_ids = input_ids.to('cuda:3')
    inps['masked_lm_labels'] = inps['masked_lm_labels'].to('cuda:3')
    preds =classif(input_ids, inps)
    print(preds)
    preds = preds.view(preds.size(0))
    loss = loss_fn(preds.float(), out.float())
    loss.backward()
    optimizer.step()
    print(f"Class out - {class_out.shape}")
    print(class_out)
    #break

dict_keys(['input_ids', 'masked_lm_labels', 'Ouput'])
torch.Size([50, 220, 8721])
inp_shaep torch.Size([50, 220, 8721])
Before fc torch.Size([1, 256])
After fc torch.Size([1, 2])


TypeError: softmax() received an invalid combination of arguments - got (Tensor), but expected one of:
 * (Tensor input, int dim, torch.dtype dtype)
 * (Tensor input, name dim, *, torch.dtype dtype)


1. append classifier with bert
2. Freeze bert model after first train
3. Use bert model and train the classifier

In [50]:
rnn = T.nn.LSTM(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = rnn(input, (h0, c0))

In [52]:
print(output.shape)

torch.Size([5, 3, 20])


In [53]:
print(hn.shape)

torch.Size([2, 3, 20])


print(hn.shape)