In [1]:
#standard imports
import os, time, json, datetime, pytz
from tqdm.auto import tqdm
from typing import List, Union, Dict
from re_sent_splitter import split_into_sentences
from pathlib import Path
import pathlib
from multiprocessing import Pool
import string
import numpy as np
from IPython import get_ipython

#distributed imports
import torch
from torch.nn.parallel import DistributedDataParallel as DDP, DataParallel
from torch.utils.data import DistributedSampler, DataLoader
import deepspeed
from transformers.deepspeed import HfDeepSpeedConfig

#tokenizers and datasets
from datasets import load_dataset
from tokenizers import BertWordPieceTokenizer 
from tokenizers.processors import TemplateProcessing
import tokenizers

#transformer imports
from transformers import BertTokenizer, DataCollatorForWholeWordMask, DataCollatorForLanguageModeling
from transformers import BertForMaskedLM, BertConfig, AdamW, TrainingArguments, Trainer
from transformers import pipeline

In [2]:
for d in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(d))

_CudaDeviceProperties(name='Tesla V100-PCIE-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-PCIE-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)


#### Set Tokenizer and Data paths

In [3]:
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/'
checkpoint_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/'
files = [f for f in os.listdir(vm_data) if os.path.isfile(os.path.join(vm_data, f))]
files
#local paths
# local_tok_path = '/Users/americanthinker1/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
# local_data = '/Users/americanthinker1/aws_data/processed_data/processed_chunks/english_docs_aa.txt'

['xac']

#### Instantiate pretrained tokenizer from file

In [4]:
alternative_tokenizer = BertTokenizer.from_pretrained('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt')

tokenizer = BertWordPieceTokenizer('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt', strip_accents=True, lowercase=True)
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[MASK]", tokenizer.token_to_id("[MASK]"))
    ],
)



#### Load data from local
Data is a 98,000 line file with each line representing one document of length ~12,000 characters from PubMed articles

In [7]:
#load data from disk
def load_data_from_disk(path: str, sample_size:int=None, min_tokens_per_sent: int=4) -> List[str]:
    '''
    Utility data loading function that performs the following operations:
       1. Loads data from disk into a list. Assumes each doc is one line.
       2. Performs sentence splitting on each document.
       3. Removes all sentences with tokens < 4 (default).
       4. Returns a list of sentences 
    '''
    #load data
    with open(path) as f:
        if sample_size:
            lines = [line.strip() for line in f.readlines()[:sample_size]]
        else:
            lines = [line.strip() for line in f.readlines()]

    #split data into sentences
    sentences = [split_into_sentences(i) for i in tqdm(lines, 'Sentence Splitter')]
    
    #remove all sentences with less than 5 tokens
    all_sentences = []
    for doc in tqdm(sentences, 'Filter Senteces'):
        for sentence in doc:
            if len(sentence.split()) > 4:
                all_sentences.append(sentence)
    print(f'Return a list of {len(all_sentences)} sentences')
    
    return all_sentences

In [5]:
def load_data_seq_512(path: str, sample_size:int=None) -> List[str]:
    with open(path) as f:
        if sample_size:
            lines = [line.strip() for line in f.readlines()[:sample_size]]
        else:
            lines = [line.strip() for line in f.readlines()]
    
    return lines

In [6]:
#results = load_data_from_disk(os.path.join(vm_data, ))
start = time.perf_counter()
results = load_data_seq_512(os.path.join(vm_data, files[0]))
end = time.perf_counter() - start
print(round(end, 2))

0.26


In [7]:
len(results)

5000

In [57]:
# segments = []
# temp_holder = []
# def create_512_sequences(
# for sentence in tqdm(results, 'Tokenizing'):
#     tokens = alternative_tokenizer.encode(sentence)
#     if len(temp_holder) + len(tokens) < 512:
#         temp_holder.extend(tokens[1:-1])
#     else:
#         temp_holder.insert(0,2)
#         segments.append(temp_holder + [3])
#         temp_holder = []
        

#### Batch encode a chunk of data

In [8]:
s = time.perf_counter()
batch = tokenizer.encode_batch(results)
e = time.perf_counter() - s
print(round(e,2), 'seconds')


2.88 seconds


In [9]:
#decrease load on memory
del results

#### Create pipeline for random masking of 15% of input tokens

In [10]:
def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.15) -> dict:
    '''
    Given a single instance from a batch of encodings, return masked inputs and associated arrays.
    Converts tokenizer.Encoding into a pytorch tensor.
    '''
    
    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    input_ids = labels.detach().clone()
    
    #default masking prob = 15%, don't mask special tokens 
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) * (input_ids > 4)
    for i in tqdm(range(input_ids.shape[0]), 'Masking Words'):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        input_ids[i, selection] = 4
        
    # temp = input_ids.flatten()
    # percent = sum(temp == 4)/sum(labels.flatten() != 4)
    # print(percent)
    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

In [11]:
encodings = mlm_pipe(batch)

Labels:   0%|          | 0/5000 [00:00<?, ?it/s]

Attention Mask:   0%|          | 0/5000 [00:00<?, ?it/s]

Masking Words:   0%|          | 0/5000 [00:00<?, ?it/s]

In [12]:
sum(sum(encodings['input_ids'] == 4)) / sum(sum(encodings['labels'] != 4))

tensor(0.1496)

In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    
    def __getitem__(self, i):
        return {key : tensor[i] for key, tensor in self.encodings.items()}

In [14]:
d = Dataset(encodings)
del batch
BATCH_SIZE = 20

In [15]:
loader = torch.utils.data.DataLoader(d, batch_size=BATCH_SIZE, pin_memory=True, shuffle=True)
len(loader)

250

In [5]:
BATCH_SIZE = 20

ds_config = {
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 32,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "Adam",
        "params": {
          "lr": 0.001,
          "betas": [
            0.8,
            0.999
          ],
          "eps": 1e-8,
          "weight_decay": 3e-7
                    }
    },

     "scheduler": {
      "type": "WarmupLR",
      "params": {
          "warmup_min_lr": 0,
          "warmup_max_lr": 0.001,
          "warmup_num_steps": 1000
                }
    },
    
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": True
    },

    "gradient_accumulation_steps": 1,
    "gradient_clipping": 1.0,
    "train_batch_size": BATCH_SIZE
    #"train_micro_batch_size_per_gpu":BATCH_SIZE
    }
hfds_config = HfDeepSpeedConfig(ds_config)

In [6]:
config = BertConfig(vocab_size=30500,num_hidden_layers=12)
model = BertForMaskedLM(config)

In [89]:
device = 'cuda:0' 
if torch.cuda.device_count() > 1:
    model = DataParallel(model)
    model.to(device)

In [8]:
engine = deepspeed.initialize(model=model,
                              model_parameters=model.parameters(), 
                              config=ds_config)

[2022-03-26 19:21:56,991] [INFO] [logging.py:69:log_dist] [Rank -1] DeepSpeed info: version=0.6.0, git-hash=unknown, git-branch=unknown
[2022-03-26 19:21:56,993] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment...
--------------------------------------------------------------------------
[[44922,1],0]: A high-performance Open MPI point-to-point messaging module
was unable to find any relevant network interfaces:

Module: OpenFabrics (openib)
  Host: bert-pretraining-vm

Another transport will be used instead, although this may result in
lower performance.

btl_base_warn_component_unused to 0.
--------------------------------------------------------------------------
[2022-03-26 19:21:57,232] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=10.2.0.5, master_port=29500
[2022-03-26 19:21:57,234] [INFO] [distributed.py:46:init_dist

In [23]:
m = engine[0]

In [26]:
m.save_16bit_model('./checkpoints/', save_filename='pytorch_model_test_deepspeed.bin')

[2022-03-26 19:29:22,721] [INFO] [engine.py:3112:save_16bit_model] Saving model weights to ./checkpoints/pytorch_model_test_deepspeed.bin


True

In [86]:
torch.cuda.empty_cache()
!nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 25 17:18:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000001:00:00.0 Off |                  Off |
| N/A   31C    P0    39W / 250W |   1983MiB / 16384MiB |      0%      Default |
|                               |            

In [55]:
stop_counts = []

def early_stopping(total_loss, tol: float = 0.01, early_stopping: int=10):
    if total_loss < lowest_loss:
        lowest_loss = total_loss
        
    
    

In [24]:
from math import floor
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def run():
    num_batches = len(loader)
    epochs = 3
    step = 0
    lowest_loss = 10000
    tolerance = 0.01
    
    model.train()
    
    for epoch in range(epochs):
        # for file in files[:2]:
        #     # setup loop with TQDM and dataloader
        #     results = load_data_from_disk(os.path.join(vm_data, file))
        #     batch = tokenizer.encode_batch(results)
        #     del results
        #     encodings = mlm_pipe(batch)
        #     del batch
        #     d = Dataset(encodings)
        #     del encodings
        #     loader = torch.utils.data.DataLoader(d, batch_size=384, pin_memory=True, shuffle=True)

        loop = tqdm(loader, leave=True)
        for batch in loop:
            step += 1
            # initialize calculated gradients (from prev step)
            optimizer.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.sum().backward()
            # update parameters
            optimizer.step()
            # print relevant info to progress barI 
            loop.set_description(f'Epoch {epoch}')
                
            if step % 1000 == 0:
                print(f'Loss: {loss.sum()}')
                
        model.module.save_pretrained(f'checkpoints/test_save_pretrained/model-trained-{step}.pt') 
            
        #loop.set_postfix(loss=loss.item())

In [92]:
run()

  0%|          | 0/4944 [00:00<?, ?it/s]



Loss: 2.1615476608276367
Loss: 2.114736557006836
Loss: 2.025595188140869
Loss: 2.143763780593872


  0%|          | 0/4944 [00:00<?, ?it/s]

Loss: 1.9414606094360352
Loss: 1.926030158996582
Loss: 1.8185341358184814
Loss: 1.5966253280639648
Loss: 1.2994054555892944


  0%|          | 0/4944 [00:00<?, ?it/s]

Loss: 1.251365303993225
Loss: 1.2248347997665405
Loss: 1.126413106918335
Loss: 1.0757315158843994
Loss: 0.9106765985488892


In [108]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 25 20:00:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000001:00:00.0 Off |                  Off |
| N/A   30C    P0    35W / 250W |  15641MiB / 16384MiB |      0%      Default |
|                               |            

In [109]:
torch.cuda.empty_cache()

In [110]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 25 20:01:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000001:00:00.0 Off |                  Off |
| N/A   31C    P0    39W / 250W |   3701MiB / 16384MiB |      0%      Default |
|                               |            

In [94]:
mask = alternative_tokenizer.mask_token

In [106]:
def show_results(text: str):
    config = BertConfig(vocab_size=30500)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    untrained_pipe = pipeline('fill-mask', model=model, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'))
    utresult = untrained_pipe(text)
    
    print()
    print("Untrained Results")
    print("*" * 150)
    for result in utresult:
        print(result['sequence'], result['score'])
        
    lm = BertForMaskedLM.from_pretrained('checkpoints/test_save_pretrained/model-trained-14000.pt/')
    trained_pipe = pipeline('fill-mask', model=lm, tokenizer=alternative_tokenizer)

    tresult = trained_pipe(text)
    
    print()
    print("Trained Results")
    print("*" * 150)
    for result in tresult:
        print(result['sequence'], result['score'])
    

In [107]:
show_results(f'{mask} is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves.')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]


Untrained Results
******************************************************************************************************************************************************
it is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.7136348485946655
insulin is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.014016539789736271
amp is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.01275827456265688
cox is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.012082782573997974
notch is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.01053623203188181

Trained Results
****************************************************************************************************************

In [31]:
with open('../Data/subsets/xaasplit_25K') as f:
    data = [line.strip() for line in f.readlines()]

In [9]:
test = tokenizer.encode_batch(['Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources;', 'This is another sentence.'])

In [10]:
input_ids = torch.tensor([x.ids for x in test])
mask = torch.tensor([x.attention_mask for x in test])


In [11]:
outputs = new_model(input_ids, mask)

In [15]:
outputs['logits'].shape

torch.Size([2, 25, 30500])

In [96]:
bbut = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
new_model.save_pretrained('./')

In [18]:
new_model.get_output_embeddings()

Linear(in_features=768, out_features=30500, bias=True)

In [27]:
from transformers import BertModel

In [30]:
bert = BertModel.from_pretrained('./checkpoints/deepspeed_model/')

Some weights of the model checkpoint at ./checkpoints/deepspeed_model/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./checkpoints/deepspeed_model/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.poo

In [31]:
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30500, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          