In [1]:
#standard imports
import os, time
from tqdm.auto import tqdm
from typing import List, Union
from re_sent_splitter import split_into_sentences
from pathlib import Path
import pathlib

#distributed imports
import torch
from torch.nn.parallel import DistributedDataParallel as DDP, DataParallel
from torch.utils.data import DistributedSampler, DataLoader

#tokenizers and datasets
from datasets import load_dataset
from tokenizers import BertWordPieceTokenizer 
from tokenizers.processors import TemplateProcessing
import tokenizers

#transformer imports
from transformers import BertTokenizer, DataCollatorForWholeWordMask, DataCollatorForLanguageModeling
from transformers import BertForMaskedLM, BertConfig, AdamW, TrainingArguments, Trainer
from transformers import pipeline

In [2]:
# Query however many GPUs are actually visible instead of assuming four:
# asking for a device index that does not exist raises, and the count differs between machines.
for d in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(d))

_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)


In [2]:
torch.cuda.device_count()

2

In [3]:
!nvidia-smi

Fri Mar 25 03:41:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000001:00:00.0 Off |                    0 |
| N/A   31C    P0    26W / 250W |      2MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000002:00:00.0 Off |                    0 |
| N/A   30C    P0    25W / 250W |      2MiB / 16384MiB |      0%      Default |
|       

#### Set Tokenizer and Data paths

In [6]:
# Locations on the training VM: WordPiece vocabulary, data shards, checkpoint output directory
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/subsets/'
checkpoint_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/'

# Equivalent paths on the local machine, kept for reference
# local_tok_path = '/Users/americanthinker1/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
# local_data = '/Users/americanthinker1/aws_data/processed_data/processed_chunks/english_docs_aa.txt'

# Every entry of the data directory whose name ends in '25K', in the order os.listdir reports them
files = list(filter(lambda name: name.endswith('25K'), os.listdir(vm_data)))
files

['xabsplit_25K', 'xadsplit_25K', 'xacsplit_25K', 'xaasplit_25K']

#### Instantiate pretrained tokenizer from file

In [7]:
# `transformers` tokenizer built from the project vocab file; the fill-mask pipelines
# further down are constructed with this object.
alternative_tokenizer = BertTokenizer.from_pretrained('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt')

# `tokenizers` WordPiece tokenizer used for batch-encoding the training sentences.
# NOTE(review): the vocab path literal is repeated twice in this cell and is spelled differently
# from `vm_tok_path` in the path cell — confirm they refer to the same file.
tokenizer = BertWordPieceTokenizer('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt', strip_accents=True, lowercase=True)
# Cut every encoding to at most 50 tokens
tokenizer.enable_truncation(max_length=50)
# Called with library defaults — presumably pads each batch to its longest sequence; confirm against the tokenizers docs
tokenizer.enable_padding()
# Wrap single sentences as "[CLS] A [SEP]"; for pairs the second segment and its [SEP] get type id 1.
# The ids of the special tokens are looked up in the loaded vocabulary.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[MASK]", tokenizer.token_to_id("[MASK]"))
    ],
)



In [5]:
#tokenizer.save_model('../Preprocessing/Tokenization/', prefix='BWPTokenizer')

In [12]:
test = files[:1]
test[0]

'xabsplit_25K'

#### Load data from local
The data is a 98,000-line file; each line is one document of roughly 12,000 characters taken from PubMed articles.

In [44]:
#load data from disk
def load_data_from_disk(path: str, sample_size=None, min_tokens_per_sent: int=4) -> List[str]:
    '''
    Load a one-document-per-line text file and return its contents as a flat list of sentences.

    Steps:
       1. Read lines from `path`, stripping surrounding whitespace. Each line is treated as one document.
       2. Split every document into sentences with `split_into_sentences`.
       3. Keep only sentences with strictly MORE than `min_tokens_per_sent`
          whitespace-separated tokens (default 4, i.e. at least 5 tokens).
       4. Print how many sentences were kept and return them.

    Args:
        path: file to read.
        sample_size: read only the first `sample_size` lines; a falsy value (None or 0) reads the whole file.
            Expected to be a non-negative int.
        min_tokens_per_sent: sentences with this many tokens or fewer are dropped.

    Returns:
        List of sentence strings, in document order.
    '''
    from itertools import islice

    #load data
    with open(path) as f:
        # islice stops after `sample_size` lines, so a sample no longer reads the whole file first
        selected = islice(f, sample_size) if sample_size else f
        lines = [line.strip() for line in selected]

    #split data into sentences
    sentences = [split_into_sentences(doc) for doc in tqdm(lines, 'Sentence Splitter')]

    #drop sentences that do not have more than `min_tokens_per_sent` tokens
    all_sentences = [
        sentence
        for doc in tqdm(sentences, 'Filter Senteces')
        for sentence in doc
        if len(sentence.split()) > min_tokens_per_sent
    ]
    print(f'Return a list of {len(all_sentences)} sentences')

    return all_sentences

In [45]:
results = load_data_from_disk(os.path.join(vm_data, test[0]), sample_size=10000)

Sentence Splitter:   0%|          | 0/10000 [00:00<?, ?it/s]

Filter Senteces:   0%|          | 0/10000 [00:00<?, ?it/s]

Return a list of 719533 sentences


#### Batch encode a chunk of data

In [46]:
# Encode every sentence in `results` in one call and report how long it took
s = time.perf_counter()
batch = tokenizer.encode_batch(results)
e = time.perf_counter() - s  # elapsed seconds
print(round(e,2), 'seconds')


7.16 seconds


In [47]:
#decrease load on memory
del results

#### Create pipeline for random masking of input tokens (`mlm_prob`, default 20%)

In [48]:
def mlm_pipe(batch: 'List[tokenizers.Encoding]', mlm_prob=0.20, mask_token_id: int=4,
             max_special_id: int=4, ignore_index=None) -> dict:
    '''
    Turn a whole batch of tokenizer encodings into masked-language-model training tensors.

    Every position whose id is greater than `max_special_id` is replaced by `mask_token_id`
    with probability `mlm_prob`. Ids up to `max_special_id` are never selected, so the share of
    masked positions measured over all positions is lower than `mlm_prob`.

    Args:
        batch: encodings exposing `.ids` and `.attention_mask`; all must have the same length
            so they can be stacked into one tensor.
        mlm_prob: per-token masking probability (default 0.20).
        mask_token_id: id written into masked positions (default 4).
        max_special_id: largest id treated as a special token and therefore never masked (default 4).
        ignore_index: when not None, every label at a position that was NOT masked is set to this
            value. Passing -100 makes the Hugging Face masked-LM loss cover only the masked tokens.
            The default (None) keeps the full original ids as labels.

    Returns:
        dict with 'input_ids' (masked copy), 'attention_mask' and 'labels' tensors.
    '''

    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    input_ids = labels.detach().clone()

    #draw one uniform number per position; mask where it falls below mlm_prob, but never special tokens
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) & (input_ids > max_special_id)
    #boolean-mask assignment covers the whole batch at once instead of looping over rows
    input_ids[mask_arr] = mask_token_id

    if ignore_index is not None:
        labels[~mask_arr] = ignore_index

    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

In [49]:
encodings = mlm_pipe(batch)

Labels:   0%|          | 0/719533 [00:00<?, ?it/s]

Attention Mask:   0%|          | 0/719533 [00:00<?, ?it/s]

Masking Words:   0%|          | 0/719533 [00:00<?, ?it/s]

In [50]:
# Number of [MASK] ids (4) in the inputs divided by the number of non-[MASK] ids in the labels.
# Tensor reductions replace the nested built-in sum(), which walked the rows one by one in Python.
(encodings['input_ids'] == 4).sum() / (encodings['labels'] != 4).sum()

tensor(0.1176)

In [51]:
class Dataset(torch.utils.data.Dataset):
    '''
    Map-style dataset over a dict of row-aligned arrays.

    `encodings` maps field names to objects that are indexed by row; the entry under
    'input_ids' must expose `.shape`, which determines the dataset length.
    '''

    def __init__(self, encodings):
        # Stored by reference, not copied
        self.encodings = encodings

    def __len__(self):
        # Row count is the first dimension of the 'input_ids' entry
        input_ids = self.encodings['input_ids']
        return input_ids.shape[0]

    def __getitem__(self, i):
        # Pick row `i` out of every field, keeping the field names
        sample = {}
        for key, tensor in self.encodings.items():
            sample[key] = tensor[i]
        return sample

In [52]:
d = Dataset(encodings)
del batch


In [54]:
# Shuffled mini-batches of 384 examples drawn from `d`; pin_memory=True is forwarded to the DataLoader
loader = torch.utils.data.DataLoader(d, batch_size=384, pin_memory=True, shuffle=True)
# Drops only the notebook-level name: the Dataset `d` keeps its own reference to the tensors
del encodings

In [29]:
# Masked-LM model built from a config (not from pretrained weights): 30,500-entry vocabulary
# and 12 hidden layers; every other setting is left at the BertConfig default.
config = BertConfig(vocab_size=30500,num_hidden_layers=12)
model = BertForMaskedLM(config)

In [30]:
# Fall back to the CPU when no GPU is visible so later `.to(device)` calls do not raise
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Wrap at most once, so re-running this cell does not nest DataParallel inside DataParallel
if torch.cuda.device_count() > 1 and not isinstance(model, DataParallel):
    model = DataParallel(model)
# Move the weights in every case; previously a single-GPU machine left the model on the CPU
# while the training loop sent its batches to `device`.
model.to(device)

In [31]:
model.device_ids

[0, 1, 2, 3]

In [55]:
# model.train()
# AdamW over all parameters of `model` with learning rate 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def save_model(path: str='./', multiple_gpu: bool=True, epoch=None, step=None, loss=None):
    '''
    Write a training checkpoint for the notebook-level `model` and `optimizer`.

    The checkpoint is a dict with keys 'epoch', 'model_state_dict', 'optimizer_state_dict'
    and 'loss', saved with `torch.save` to `<path>/model_<step>.pt`.

    Args:
        path: output directory, with or without a trailing separator (default './').
        multiple_gpu: when True, `model` is expected to be a wrapper exposing the real
            network as `.module` and that network's state dict is saved; when False,
            `model` itself is saved.
        epoch, step, loss: values recorded in the checkpoint (`step` also names the file).
            When left as None, the notebook-level variable of the same name is used.

    Raises:
        NameError: if one of `epoch`, `step`, `loss` is neither passed nor defined at notebook level.
    '''
    def _resolve(name, value):
        # Prefer the explicit argument; otherwise fall back to the global of the same name
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined; pass {name}= to save_model()") from None

    epoch = _resolve('epoch', epoch)
    step = _resolve('step', step)
    loss = _resolve('loss', loss)

    # Both branches of the original differed only in which object supplies the state dict
    network = model.module if multiple_gpu else model
    torch.save({'epoch': epoch,
                'model_state_dict': network.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss},
               # os.path.join inserts the separator when `path` lacks a trailing one
               os.path.join(path, f'model_{step}.pt'))

In [58]:
from math import floor

def run():
    '''
    Train the notebook-level `model` for one epoch over the notebook-level `loader`.

    Each batch must be a dict with 'input_ids', 'attention_mask' and 'labels' tensors. They are
    moved to `device`, passed through `model`, and the summed loss is back-propagated before
    `optimizer` takes a step. About ten times per epoch the underlying network is written with
    `save_pretrained` under 'checkpoints/test_save_pretrained/'.

    Reads the globals `loader`, `model`, `optimizer` and `device`; returns None.
    '''
    num_batches = len(loader)
    epochs = 1
    step = 0
    # Checkpoint interval of a tenth of an epoch, clamped to 1: with fewer than ten batches
    # the unclamped interval is 0 and `step % 0` raises ZeroDivisionError.
    save_every = max(1, num_batches // 10)
    # A DataParallel wrapper exposes the real network as `.module`; an unwrapped model is used as is
    network = getattr(model, 'module', model)

    # Make sure dropout etc. are in training mode even if the model was switched to eval earlier
    model.train()

    for epoch in range(epochs):
        # TODO: to train on several shards, rebuild `loader` here for each entry of `files`
        # (load_data_from_disk -> tokenizer.encode_batch -> mlm_pipe -> Dataset -> DataLoader).

        loop = tqdm(loader, leave=True)
        for batch in loop:
            step += 1
            # clear gradients left over from the previous step
            optimizer.zero_grad()
            # move the three tensors of this batch to the training device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # forward pass; the model computes the loss itself because labels are supplied
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss
            # .sum() collapses a multi-element loss into a scalar before backprop
            loss.sum().backward()
            # update parameters
            optimizer.step()
            # show the current epoch on the progress bar
            loop.set_description(f'Epoch {epoch}')

            if step % save_every == 0:
                network.save_pretrained(f'checkpoints/test_save_pretrained/model-trained-{step}.pt')

        #loop.set_postfix(loss=loss.item())

In [59]:
run()

  0%|          | 0/1874 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [60]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Thu Mar 24 23:48:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000001:00:00.0 Off |                    0 |
| N/A   37C    P0    33W / 250W |  16276MiB / 16384MiB |      0%      Default |
|                               |            

In [61]:
torch.cuda.empty_cache()

In [None]:
# Alternative: train through the Hugging Face Trainer API instead of the hand-written loop.
# NOTE(review): this cell has never been executed. `model_path`, `data_collator` and `dataset`
# are not assigned in any cell visible in this notebook — define them before running.
# NOTE(review): evaluation_strategy="steps" is set, but no eval_dataset is passed to Trainer below — confirm intent.

training_args = TrainingArguments(
    output_dir=model_path,          # directory where model checkpoints are written
    evaluation_strategy="steps",    # evaluate every `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=2,            # number of training epochs
    per_device_train_batch_size=8, # training batch size per device; raise as far as GPU memory allows
    gradient_accumulation_steps=1,  # number of batches whose gradients are accumulated before each weight update
    logging_steps=500,             # log (and, via evaluation_strategy, evaluate) every 500 steps
    save_steps=500,                # save a checkpoint every 500 steps
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # keep at most 3 checkpoints on disk when space is limited
)

trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=dataset)

In [4]:
def load_checkpoint(model, checkpoint_: Union[str, pathlib.Path], parallel=False, map_location='cpu'):
    '''
    Load a checkpoint dict from disk and copy its weights into `model` in place.

    The file must contain the keys 'model_state_dict', 'optimizer_state_dict' and 'loss'.

    Args:
        model: network to receive the weights. It may already be wrapped in DataParallel, in
            which case the weights are loaded into the wrapped `.module` (the checkpoints written
            by this notebook store the unwrapped network's keys).
        checkpoint_: path of the checkpoint file.
        parallel: when True and more than one GPU is visible, return the model wrapped in
            DataParallel (it is not wrapped a second time if it already is).
        map_location: forwarded to `torch.load`. The default 'cpu' lets a checkpoint saved on a
            GPU be opened on any machine; `load_state_dict` then copies the values onto whatever
            device the model's parameters live on. Pass None for torch's default placement.

    Returns:
        (model, optimizer_state_dict, loss) — `model` is the same object that was passed in
        unless it was wrapped as described above.
    '''
    checkpoint = torch.load(checkpoint_, map_location=map_location)
    model_state = checkpoint['model_state_dict']
    opt_state = checkpoint['optimizer_state_dict']
    model_loss = checkpoint['loss']

    already_parallel = isinstance(model, DataParallel)
    # A DataParallel wrapper prefixes its keys with 'module.', so load into the inner network
    target = model.module if already_parallel else model
    target.load_state_dict(model_state)

    if parallel and not already_parallel and torch.cuda.device_count() > 1:
        model = DataParallel(model)
    return model, opt_state, model_loss

In [6]:
new_model, opt, model_loss = load_checkpoint(model, 'checkpoints/final_model_18710.pt')

In [69]:
mask = alternative_tokenizer.mask_token

In [70]:
def _print_fill_mask(title: str, predictions) -> None:
    '''Print a blank line, the title, a 150-character rule, then one prediction per line.'''
    print()
    print(title)
    print("*" * 150)
    for prediction in predictions:
        print(prediction)


def show_results(text: str, checkpoint='checkpoints/final_model_18710.pt'):
    '''
    Print fill-mask predictions for `text` from an untrained model and from a trained checkpoint.

    Args:
        text: input sentence containing the tokenizer's mask token.
        checkpoint: checkpoint file passed to `load_checkpoint`
            (default 'checkpoints/final_model_18710.pt').
    '''
    config = BertConfig(vocab_size=30500, max_position_embeddings=514, num_hidden_layers=12)
    model = BertForMaskedLM(config)
    untrained_pipe = pipeline('fill-mask', model=model, tokenizer=alternative_tokenizer)
    _print_fill_mask("Untrained Results", untrained_pipe(text))

    # load_checkpoint copies the saved weights into `model` in place, so the untrained
    # predictions have to be produced before this call.
    trained_model, opt, model_loss = load_checkpoint(model, checkpoint)
    trained_pipe = pipeline('fill-mask', model=trained_model, tokenizer=alternative_tokenizer)
    _print_fill_mask("Trained Results", trained_pipe(text))
    

In [79]:
show_results(f'Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both {mask} and exogenous sources;')


Untrained Results
******************************************************************************************************************************************************
{'sequence': 'introduction under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both fried and exogenous sources ;', 'score': 0.00021954214025754482, 'token': 13991, 'token_str': 'f r i e d'}
{'sequence': 'introduction under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both languages and exogenous sources ;', 'score': 0.00021149273379705846, 'token': 14882, 'token_str': 'l a n g u a g e s'}
{'sequence': 'introduction under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both mathematically and exogenous sources ;', 'score': 0.00019954019808210433, 'token': 27782, 'token_str': 'm a t h e m a t i c a l l y'}
{'sequence': 'introduction under normal physiological conditions, al

In [31]:
# Iterate over the file object directly; readlines() would first build a second full list of raw lines
with open('../Data/subsets/xaasplit_25K') as f:
    data = [line.strip() for line in f]

In [9]:
test = tokenizer.encode_batch(['Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources;', 'This is another sentence.'])

In [10]:
input_ids = torch.tensor([x.ids for x in test])
mask = torch.tensor([x.attention_mask for x in test])


In [11]:
# Inference only: eval() switches dropout off so the logits are deterministic,
# and no_grad() skips building the autograd graph.
new_model.eval()
with torch.no_grad():
    outputs = new_model(input_ids, attention_mask=mask)

In [15]:
outputs['logits'].shape

torch.Size([2, 25, 30500])

In [96]:
bbut = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
new_model.save_pretrained('./')

In [18]:
new_model.get_output_embeddings()

Linear(in_features=768, out_features=30500, bias=True)

In [19]:
new_model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30500, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [20]:
test = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
test

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [31]:
from transformers import BertModel, BertForMaskedLM, BertForPreTraining

In [29]:
bert = BertModel.from_pretrained('bert-base-uncased')
mlm = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
bert, mlm

(BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(30522, 768, padding_idx=0)
     (position_embeddings): Embedding(512, 768)
     (token_type_embeddings): Embedding(2, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): BertEncoder(
     (layer): ModuleList(
       (0): BertLayer(
         (attention): BertAttention(
           (self): BertSelfAttention(
             (query): Linear(in_features=768, out_features=768, bias=True)
             (key): Linear(in_features=768, out_features=768, bias=True)
             (value): Linear(in_features=768, out_features=768, bias=True)
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (output): BertSelfOutput(
             (dense): Linear(in_features=768, out_features=768, bias=True)
             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
             (dropout): Dropout(p=0.1, inp

In [32]:
bpt = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
bpt

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine