In [1]:
#standard imports
import os, time
from tqdm.auto import tqdm
from typing import List, Union
from re_sent_splitter import split_into_sentences
from pathlib import Path
import pathlib

#distributed imports
import torch
from torch.nn.parallel import DistributedDataParallel as DDP, DataParallel
from torch.utils.data import DistributedSampler, DataLoader

#tokenizers and datasets
from datasets import load_dataset
from tokenizers import BertWordPieceTokenizer 
from tokenizers.processors import TemplateProcessing
import tokenizers

#transformer imports
from transformers import BertTokenizer, DataCollatorForWholeWordMask, DataCollatorForLanguageModeling
from transformers import BertForMaskedLM, BertConfig, AdamW, TrainingArguments, Trainer
from transformers import pipeline

In [2]:
for d in range(4):
    print(torch.cuda.get_device_properties(d))

_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)


In [3]:
torch.cuda.device_count()

4

#### Set Tokenizer and Data paths

In [4]:
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/subsets/'
checkpoint_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/'
files = [f for f in os.listdir(vm_data) if f.endswith('25K')]
files
#local paths
# local_tok_path = '/Users/americanthinker1/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
# local_data = '/Users/americanthinker1/aws_data/processed_data/processed_chunks/english_docs_aa.txt'

['xabsplit_25K', 'xadsplit_25K', 'xacsplit_25K', 'xaasplit_25K']

#### Instantiate pretrained tokenizer from file

In [5]:
alternative_tokenizer = BertTokenizer.from_pretrained('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt')

tokenizer = BertWordPieceTokenizer('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt', strip_accents=True, lowercase=True)
tokenizer.enable_truncation(max_length=50)
tokenizer.enable_padding()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[MASK]", tokenizer.token_to_id("[MASK]"))
    ],
)



In [5]:
#tokenizer.save_model('../Preprocessing/Tokenization/', prefix='BWPTokenizer')

#### Load data from local
Data is a 98,000 line file with each line representing one document of length ~12,000 characters from PubMed articles

In [6]:
#load data from disk
def load_data_from_disk(path: str, sample_size=None, min_tokens_per_sent: int=4) -> List[str]:
    '''
    Utility data loading function that performs the following operations:
       1. Loads data from disk into a list. Assumes each doc is one line.
       2. Performs sentence splitting on each document.
       3. Removes all sentences with tokens < 4 (default).
       4. Returns a list of sentences 
    '''
    #load data
    with open(path) as f:
        if sample_size:
            lines = [line.strip() for line in f.readlines()[:sample_size]]
        else:
            lines = [line.strip() for line in f.readlines()]

    #split data into sentences
    sentences = [split_into_sentences(i) for i in tqdm(lines, 'Sentence Splitter')]
    
    #remove all sentences with less than 5 tokens
    all_sentences = []
    for doc in tqdm(sentences, 'Filter Senteces'):
        for sentence in doc:
            if len(sentence.split()) > 4:
                all_sentences.append(sentence)
    print(f'Return a list of {len(all_sentences)} sentences')
    
    return all_sentences

In [33]:
results = load_data_from_disk(os.path.join(vm_data, files[0]), sample_size=1000)

Sentence Splitter:   0%|          | 0/1000 [00:00<?, ?it/s]

Filter Senteces:   0%|          | 0/1000 [00:00<?, ?it/s]

Return a list of 71680 sentences


#### Batch encode a chunk of data

In [34]:
s = time.perf_counter()
batch = tokenizer.encode_batch(results)
e = time.perf_counter() - s
print(round(e,2), 'seconds')


1.16 seconds


In [35]:
#decrease load on memory
del results

#### Create pipeline for random masking of 15% of input tokens

In [73]:
def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.20) -> dict:
    '''
    Given a single instance from a batch of encodings, return masked inputs and associated arrays.
    Converts tokenizer.Encoding into a pytorch tensor.
    '''
    
    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')], dtype=torch.int16)
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')], dtype=torch.int16)
    input_ids = labels.detach().clone()
    
    #default masking prob = 15%, don't mask special tokens 
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) * (input_ids > 4)
    for i in tqdm(range(input_ids.shape[0]), 'Masking Words'):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        input_ids[i, selection] = 4
        
    # temp = input_ids.flatten()
    # percent = sum(temp == 4)/sum(labels.flatten() != 4)
    # print(percent)
    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

In [89]:
encodings = mlm_pipe(batch)

Labels:   0%|          | 0/71680 [00:00<?, ?it/s]

Attention Mask:   0%|          | 0/71680 [00:00<?, ?it/s]

Masking Words:   0%|          | 0/71680 [00:00<?, ?it/s]

In [90]:
sum(sum(encodings['input_ids'] == 4)) / sum(sum(encodings['labels'] != 4))

tensor(0.1184)

In [91]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    
    def __getitem__(self, i):
        return {key : tensor[i] for key, tensor in self.encodings.items()}

In [92]:
d = Dataset(encodings)
del batch

In [93]:
loader = torch.utils.data.DataLoader(d, batch_size=384, pin_memory=True, shuffle=True)

In [5]:
config = BertConfig(vocab_size=30500, max_position_embeddings=514, num_hidden_layers=12)
model = BertForMaskedLM(config)

In [70]:
#device = torch
if torch.cuda.device_count() > 1:
    model = DataParallel(model)
    model.to(device)

In [40]:
model.device_ids

[0, 1, 2, 3]

In [41]:
# model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def save_model(path: './', multiple_gpu: bool=True):
    if multiple_gpu:
        torch.save({'epoch': epoch,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss}, 
              f'{path}model_{step}.pt')
    else:
        torch.save({'epoch': epoch, 
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss}, 
              f'{path}model_{step}.pt')

In [None]:
from math import floor

num_batches = len(loader)
epochs = 2
step = 0

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        step += 1
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.sum().backward()
        # update parameters
        optim.step()
        # print relevant info to progress barI 
        loop.set_description(f'Epoch {epoch}')
        
        if step % floor(num_batches/10) == 0:
            save_model()
            
        #loop.set_postfix(loss=loss.item())

  0%|          | 0/9355 [00:00<?, ?it/s]



In [39]:
torch.cuda.empty_cache()

In [19]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Mar 23 01:19:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000001:00:00.0 Off |                    0 |
| N/A   31C    P0    26W / 250W |      2MiB / 16384MiB |      0%      Default |
|                               |            

In [None]:
#OOP way to train model

training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=2,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=8, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=1,  # accumulating the gradients before updating the weight
    logging_steps=500,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=500,
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=dataset)

In [14]:
def load_checkpoint(model, checkpoint_: Union[str, pathlib.Path], parallel=False):
    checkpoint = torch.load(checkpoint_)
    model_state = checkpoint['model_state_dict']
    opt_state = checkpoint['optimizer_state_dict']
    model_loss = checkpoint['loss']
    model.load_state_dict(model_state)
    if parallel:
        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
    return model, opt_state, model_loss

In [25]:
mask = alternative_tokenizer.mask_token

In [27]:
def show_results(text: str):
    config = BertConfig(vocab_size=30500, max_position_embeddings=514, num_hidden_layers=12)
    model = BertForMaskedLM(config)
    untrained_pipe = pipeline('fill-mask', model=model, tokenizer=alternative_tokenizer)
    utresult = untrained_pipe(text)
    
    print()
    print("Untrained Results")
    print("*" * 150)
    for result in utresult:
        print(result)
        
    trained_model, opt, model_loss = load_checkpoint(model, 'checkpoints/final_model_18710.pt')
    trained_pipe = pipeline('fill-mask', model=trained_model, tokenizer=alternative_tokenizer)

    tresult = trained_pipe(text)
    
    print()
    print("Trained Results")
    print("*" * 150)
    for result in tresult:
        print(result)
    

In [38]:
show_results(f'Pomegranate is a popular fruit grown in {mask} with a large annual production rate.')


Untrained Results
******************************************************************************************************************************************************
{'sequence': 'pomegranate is a popular fruit grown inaked with a large annual production rate.', 'score': 0.00029260278097353876, 'token': 12730, 'token_str': '# # a k e d'}
{'sequence': 'pomegranate is a popular fruit grown inouracil with a large annual production rate.', 'score': 0.0002666855289135128, 'token': 20742, 'token_str': '# # o u r a c i l'}
{'sequence': 'pomegranate is a popular fruit grown in paradig with a large annual production rate.', 'score': 0.0002603462780825794, 'token': 8810, 'token_str': 'p a r a d i g'}
{'sequence': 'pomegranate is a popular fruit grown in −x with a large annual production rate.', 'score': 0.0002488254103809595, 'token': 27163, 'token_str': '− x'}
{'sequence': 'pomegranate is a popular fruit grown in specially with a large annual production rate.', 'score': 0.000233137776376679

In [31]:
with open('../Data/subsets/xaasplit_25K') as f:
    data = [line.strip() for line in f.readlines()]

In [32]:
data[0]

'Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources; yet the intracellular “redox buffer” mechanism provides significant protection mainly by the antioxidant network [1]. Disturbance in the prooxidant- antioxidant balance in favor of the former leads to what is known as oxidative stress [2]. This oxidative stress and reactive oxygen species (ROS) can cause damage to DNA, proteins and lipids an d end up with an epidemic of non communicable chronic human diseases [3–5]. The prevalence of NCD are at escalating in Egypt due to activation of 64 genes involved in inflammation [6, 7] and other modifiable risk factors [8]. Medical and pharmacologic chemotherapeutic agents were reported to reduce cardio vascular mortality among individuals at risk, but they may induce oxidative stress, which increases to an invasive stage with disease progression [9]. Plant polyphenols possess the ideal chemica