In [1]:
#standard imports
import os, time, json, datetime, pytz
import itertools
import string
from tqdm.auto import tqdm
from typing import List, Union, Dict
from re_sent_splitter import split_into_sentences
from pathlib import Path
import pathlib
import numpy as np
from math import floor

#distributed imports
import torch
from torch.nn.parallel import DistributedDataParallel as DDP, DataParallel
import deepspeed

#tokenizers and datasets
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import TemplateProcessing
import tokenizers

#transformer imports
from transformers import BertTokenizer
from transformers import BertForMaskedLM, BertConfig, AdamW
from transformers import pipeline

In [2]:
#list the properties of every CUDA device torch can see
gpu_count = torch.cuda.device_count()
for d in range(gpu_count):
    device_props = torch.cuda.get_device_properties(d)
    print(device_props)

_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)


In [3]:
!nvidia-smi

Tue Mar 29 02:25:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000001:00:00.0 Off |                    0 |
| N/A   38C    P0    56W / 300W |      3MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000002:00:00.0 Off |                    0 |
| N/A   43C    P0    55W / 300W |      3MiB / 32768MiB |      0%      Default |
|       

#### Set Tokenizer and Data paths

In [5]:
#locations of the WordPiece vocab, the raw text directory and the cached encodings
tokenizer_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
text_data_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/'
encodings_data_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/Encodings/encodings_0_395390.pt'
working_data =  'combined_4Gb.txt'

#keep only regular files (sub-directories are dropped); order is whatever os.listdir returns
files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(text_data_path, name)),
        os.listdir(text_data_path),
    )
)
files

['small_sample_10000.txt', 'combined_4Gb.txt']

#### Instantiate pretrained tokenizer from file

In [6]:
#transformers tokenizer built from the same vocab file
alternative_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

#`tokenizers` WordPiece tokenizer: lowercases, strips accents, truncates to 512 tokens,
#and pads with the library's default padding settings
tokenizer = BertWordPieceTokenizer(tokenizer_path, lowercase=True, strip_accents=True)
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding()

#wrap single sequences / pairs in [CLS] ... [SEP]; ids are looked up from the loaded vocab
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        (tok, tokenizer.token_to_id(tok)) for tok in ("[CLS]", "[SEP]", "[MASK]")
    ],
)



#### Load data from local
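The data is a 98,000-line file; each line is one document of roughly 12,000 characters taken from PubMed articles. (Note: the file actually loaded below, `combined_4Gb.txt`, contains 395,390 lines, so the 98,000 figure may refer to an earlier subset.)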

In [7]:
#load data from disk
def load_data_from_disk(path: str, sample_size:int=None, min_tokens_per_sent: int=4) -> List[str]:
    '''
    Load a one-document-per-line text file and return it as a flat list of sentences.

    Steps:
       1. Read lines from `path`, stripping surrounding whitespace. Assumes each doc is one line.
       2. Split every document into sentences with `split_into_sentences`.
       3. Keep only sentences with MORE than `min_tokens_per_sent` whitespace-separated tokens.
       4. Return the surviving sentences from all documents as one list.

    Args:
        path: text file to read.
        sample_size: when truthy, only the first `sample_size` lines are read;
            None or 0 reads the whole file.
        min_tokens_per_sent: sentences with this many tokens or fewer are dropped
            (default 4, i.e. a sentence needs at least 5 tokens to be kept).

    Returns:
        List of sentence strings.
    '''
    #load data; stream the file so a small sample does not pull the whole file into memory
    with open(path) as f:
        if sample_size and sample_size > 0:
            source = itertools.islice(f, sample_size)
        elif sample_size:
            #negative sizes keep the original list-slice meaning (drop lines from the end)
            source = f.readlines()[:sample_size]
        else:
            source = f
        lines = [line.strip() for line in source]

    #split data into sentences
    sentences = [split_into_sentences(i) for i in tqdm(lines, 'Sentence Splitter')]

    #drop short sentences; the threshold now honours the min_tokens_per_sent argument
    all_sentences = [
        sentence
        for doc in tqdm(sentences, 'Filter Senteces')
        for sentence in doc
        if len(sentence.split()) > min_tokens_per_sent
    ]
    print(f'Return a list of {len(all_sentences)} sentences')

    return all_sentences

In [8]:
def load_data_seq_512(path: str, sample_size:int=None) -> List[str]:
    '''
    Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        path: text file to read.
        sample_size: when truthy, only the first `sample_size` lines are returned;
            None or 0 returns every line.

    Returns:
        List with one stripped string per line read.
    '''
    with open(path) as f:
        if sample_size and sample_size > 0:
            #stream only the requested prefix instead of reading the whole file first
            source = itertools.islice(f, sample_size)
        elif sample_size:
            #negative sizes keep the original list-slice meaning (drop lines from the end)
            source = f.readlines()[:sample_size]
        else:
            source = f
        lines = [line.strip() for line in source]

    return lines

In [10]:
#time how long it takes to read the working corpus into memory
start = time.perf_counter()
#pick the corpus by name: os.listdir() order is arbitrary, so indexing `files` by position
#is not guaranteed to select the same file on every run or machine
results = load_data_seq_512(os.path.join(text_data_path, working_data))
end = time.perf_counter() - start
print(round(end, 2))

94.25


In [11]:
len(results)

395390

In [10]:
# segments = []
# temp_holder = []
# def create_512_sequences(
# for sentence in tqdm(results, 'Tokenizing'):
#     tokens = alternative_tokenizer.encode(sentence)
#     if len(temp_holder) + len(tokens) < 512:
#         temp_holder.extend(tokens[1:-1])
#     else:
#         temp_holder.insert(0,2)
#         segments.append(temp_holder + [3])
#         temp_holder = []
        

#### Batch encode a chunk of data

In [12]:
#tokenize every loaded line in a single batch call and report the wall-clock time in seconds
s = time.perf_counter()
batch = tokenizer.encode_batch(results)
e = time.perf_counter() - s
print(round(e,2), 'seconds')


298.27 seconds


In [13]:
#decrease load on memory
del results

#### Create pipeline for random masking of 15% of input tokens

In [14]:
def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.15, mask_token_id: int=4, max_special_token_id: int=4) -> dict:
    '''
    Turn a batch of tokenizer encodings into masked-language-modelling tensors.

    Args:
        batch: list of encodings; all must have the same length so they stack into one tensor.
        mlm_prob: probability with which each eligible token is replaced by the mask id.
        mask_token_id: id written into masked positions (default 4, the value used
            throughout this notebook).
        max_special_token_id: ids less than or equal to this value are never masked
            (default 4; assumes special tokens occupy ids 0-4 -- TODO confirm against the vocab).

    Returns:
        dict with
          'input_ids'      -- copy of the token ids with the selected positions set to `mask_token_id`
          'attention_mask' -- attention masks taken from the encodings
          'labels'         -- the original, unmasked token ids for EVERY position

    NOTE(review): labels are not restricted to the masked positions. If the model's loss
    only skips a sentinel label (commonly -100), padding and unmasked tokens will also
    contribute to the loss -- confirm this is intended.
    '''

    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    input_ids = labels.detach().clone()

    #draw one uniform number per token; mask where it falls below mlm_prob, except special tokens
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) & (input_ids > max_special_token_id)
    #single boolean-index assignment replaces the per-row python loop
    input_ids[mask_arr] = mask_token_id

    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

#### Load Encodings from disk

In [15]:
#reuse encodings saved on disk instead of recomputing them with mlm_pipe(batch)
#NOTE(review): torch.load deserializes with pickle by default -- only load files you trust
#encodings = mlm_pipe(batch)
encodings = torch.load(encodings_data_path)

In [16]:
#sanity check: count of input positions equal to id 4 divided by count of label positions not equal to id 4
(encodings['input_ids'] == 4).sum() / (encodings['labels'] != 4).sum()

tensor(0.1490)

In [18]:
class Dataset(torch.utils.data.Dataset):
    '''
    Map-style dataset over a dict of tensors that share their first dimension.
    Item `i` is a dict holding row `i` of every tensor in `encodings`.
    '''
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        #number of samples = size of the first dimension of 'input_ids'
        dims = self.encodings['input_ids'].shape
        return dims[0]

    def __getitem__(self, i):
        sample = {}
        for name in self.encodings:
            sample[name] = self.encodings[name][i]
        return sample

In [19]:
#wrap the encodings dict so a DataLoader can index it
d = Dataset(encodings)
#drop the raw tokenizer output to free memory
#NOTE(review): raises NameError on a fresh kernel if the batch-encode cell was skipped
del batch


In [20]:
#shuffled batches of 112 samples; pin_memory=True requests page-locked host memory for the batches
loader = torch.utils.data.DataLoader(d, batch_size=112, pin_memory=True, shuffle=True)
#number of batches per epoch
len(loader)

3531

In [21]:
#freshly initialised (not pretrained) masked-LM model
#vocab_size presumably matches the 30500-entry WordPiece vocab loaded above -- verify
config = BertConfig(vocab_size=30500,num_hidden_layers=12)
model = BertForMaskedLM(config)

In [22]:
#first GPU when CUDA is available, otherwise CPU so the notebook still runs without a GPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
#replicate across GPUs only when there is more than one
if torch.cuda.device_count() > 1:
    model = DataParallel(model)
#always move the model: the training loop sends every batch to `device`, so leaving the
#model where it was on a single-GPU machine would cause a device mismatch
model = model.to(device)

In [23]:
model.device_ids

[0, 1, 2, 3, 4, 5, 6, 7]

In [24]:
!nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Mar 29 02:38:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000001:00:00.0 Off |                    0 |
| N/A   36C    P0    56W / 300W |   1854MiB / 32768MiB |      0%      Default |
|                               |            

In [25]:
#AdamW over all model parameters; this is torch's implementation, not the AdamW imported from transformers
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def run():
    '''
    Train the notebook-level `model` on `loader` for 50 epochs using `optimizer`.

    Reads the globals `model`, `loader`, `optimizer` and `device`. Prints the loss roughly
    ten times per epoch and saves the model with `save_pretrained` at the end of every epoch.
    '''
    num_batches = len(loader)
    epochs = 50
    step = 0

    #log about ten times per epoch; clamp to 1 so a loader with fewer than 10 batches
    #does not end up computing `step % 0`
    loss_check = max(1, floor(num_batches/10))

    #DataParallel keeps the wrapped model on `.module`; use the model itself when it is not wrapped
    model_to_save = getattr(model, 'module', model)

    model.train()

    for epoch in range(epochs):

        loop = tqdm(loader, leave=True)
        #the description only depends on the epoch, so set it once per epoch
        loop.set_description(f'Epoch {epoch}')
        for batch in loop:
            step += 1
            #reset gradients left over from the previous step
            optimizer.zero_grad()
            #move the tensors needed for this step onto the training device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            #forward pass; passing labels makes the model return a loss
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            #reduce to a scalar (presumably one loss value per replica under DataParallel)
            #NOTE(review): .sum() grows with the number of replicas; consider .mean()
            loss = outputs.loss.sum()
            #backpropagate and update the parameters
            loss.backward()
            optimizer.step()

            if step % loss_check == 0:
                print(f'Loss: {loss.item()}')

        model_to_save.save_pretrained(f'checkpoints/run_4GB_Mar28/model-trained-{epoch}-{step}.pt')

In [26]:
run()

  0%|          | 0/3531 [00:00<?, ?it/s]



Loss: 8.855552673339844


KeyboardInterrupt: 

In [19]:
!nvidia-smi

Failed to initialize NVML: Driver/library version mismatch


In [20]:
torch.cuda.empty_cache()

In [21]:
!nvidia-smi

Failed to initialize NVML: Driver/library version mismatch


In [94]:
mask = alternative_tokenizer.mask_token

In [106]:
def show_results(text: str, trained_checkpoint: str='checkpoints/test_save_pretrained/model-trained-14000.pt/'):
    '''
    Print fill-mask predictions for `text` from two models, one after the other.

    The first model is 'bert-base-uncased' with its own tokenizer (reported under the
    heading "Untrained Results"); the second is loaded from `trained_checkpoint` and uses
    the notebook-level `alternative_tokenizer`.

    Args:
        text: input sentence containing the mask token.
        trained_checkpoint: directory passed to `BertForMaskedLM.from_pretrained` for the
            second model; defaults to the checkpoint path previously hard-coded here.
    '''
    def _print_predictions(title, predictions):
        #one line per prediction: filled-in sequence followed by its score
        print()
        print(title)
        print("*" * 150)
        for result in predictions:
            print(result['sequence'], result['score'])

    baseline_name = 'bert-base-uncased'
    untrained_pipe = pipeline('fill-mask',
                              model=BertForMaskedLM.from_pretrained(baseline_name),
                              tokenizer=BertTokenizer.from_pretrained(baseline_name))
    _print_predictions("Untrained Results", untrained_pipe(text))

    lm = BertForMaskedLM.from_pretrained(trained_checkpoint)
    trained_pipe = pipeline('fill-mask', model=lm, tokenizer=alternative_tokenizer)
    _print_predictions("Trained Results", trained_pipe(text))
    

In [107]:
show_results(f'{mask} is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves.')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]


Untrained Results
******************************************************************************************************************************************************
it is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.7136348485946655
insulin is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.014016539789736271
amp is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.01275827456265688
cox is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.012082782573997974
notch is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.01053623203188181

Trained Results
****************************************************************************************************************

In [31]:
#read the subset file into a list with one whitespace-stripped string per line
with open('../Data/subsets/xaasplit_25K') as f:
    data = list(map(str.strip, f))

In [9]:
test = tokenizer.encode_batch(['Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources;', 'This is another sentence.'])

In [10]:
input_ids = torch.tensor([x.ids for x in test])
mask = torch.tensor([x.attention_mask for x in test])


In [11]:
outputs = new_model(input_ids, mask)

In [15]:
outputs['logits'].shape

torch.Size([2, 25, 30500])

In [96]:
bbut = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
new_model.save_pretrained('./')

In [18]:
new_model.get_output_embeddings()

Linear(in_features=768, out_features=30500, bias=True)

In [19]:
new_model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30500, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [20]:
test = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
test

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [31]:
from transformers import BertModel, BertForMaskedLM, BertForPreTraining

In [29]:
bert = BertModel.from_pretrained('bert-base-uncased')
mlm = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
from deepspeed_pretrain_bert import masking_function

In [9]:
masking_function(text="""
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
""", 
                 tokenizer=alternative_tokenizer, 
                 mask_prob=0.15, 
                 random_replace_prob=0.1, 
                 unmask_replace_prob=0.1,
                 max_length=512)

([None,
  2420,
  8747,
  1641,
  1634,
  2065,
  14042,
  1737,
  22511,
  17,
  4,
  17,
  4,
  4,
  1696,
  1851,
  1897,
  2290,
  3548,
  3949,
  4,
  1831,
  4,
  1633,
  27604,
  30,
  37,
  4,
  1811,
  4,
  18,
  6791,
  41,
  3259,
  18,
  2976,
  11,
  4,
  11,
  1811,
  4,
  18,
  26897,
  41,
  3259,
  18,
  5150,
  11,
  39,
  17,
  1784,
  1687,
  4178,
  2415,
  5993,
  1740,
  3548,
  3949,
  4,
  1831,
  3253,
  1633,
  27604,
  1746,
  1634,
  14042,
  1641,
  43,
  2065,
  6751,
  1702,
  4,
  3792,
  1726,
  1677,
  3609,
  8452,
  12,
  47,
  18,
  49,
  18,
  3548,
  3949,
  43,
  22511,
  1769,
  19560,
  6477,
  4,
  1654,
  16904,
  2728,
  2065,
  1746,
  43,
  22511,
  1769,
  3241,
  16302,
  3965,
  2065,
  13,
  18,
  17,
  1784,
  1687,
  1851,
  4178,
  4,
  5993,
  1740,
  3548,
  4,
  22511,
  1831,
  3253,
  1633,
  27604,
  4,
  1634,
  14042,
  1641,
  43,
  2065,
  4,
  5993,
  6547,
  1656,
  1710,
  14045,
  6794,
  12,
  3762,
  3949,
  43,
  2

In [35]:
def get_unique_identifier(length: int = 8) -> str:
    """Return a random identifier made of `length` characters.

    Characters are drawn independently (with replacement) from the lowercase
    ASCII letters and the digits 0-9 using `np.random.choice`. The result is
    random rather than guaranteed unique, and it follows numpy's global
    random state.

    Args:
        length: number of characters to generate; 0 yields an empty string.

    Returns:
        The generated identifier string.
    """
    # `string` is imported at the top of the notebook; it was previously missing (NameError)
    alphabet = string.ascii_lowercase + string.digits
    picks = np.random.choice(len(alphabet), length)
    return "".join(alphabet[ix] for ix in picks)

def create_experiment_dir(
        checkpoint_dir: pathlib.Path, all_arguments: Dict
) -> pathlib.Path:
    """Create a new, uniquely named experiment directory and save all arguments in it.

    The directory is named from the current US/Pacific time plus a random
    identifier. Inside it, `hparams.json` receives `all_arguments` as indented
    JSON and an empty `tb_dir` sub-directory is created for Tensorboard.

    Args:
        checkpoint_dir: parent directory (a `pathlib.Path` or a plain string);
            it is created, including missing parents, when it does not exist.
        all_arguments: JSON-serialisable mapping of run arguments.

    Returns:
        Path of the newly created experiment directory.

    Raises:
        FileExistsError: if a directory with the generated name already exists.
    """
    current_time = datetime.datetime.now(pytz.timezone("US/Pacific"))
    expname = f"bert_pretrain.{current_time.year}.{current_time.month}.{current_time.day}.{current_time.hour}.{current_time.minute}.{current_time.second}.{get_unique_identifier()}"
    # Accept plain strings as well as Path objects
    exp_dir = pathlib.Path(checkpoint_dir) / expname
    # parents=True: do not fail just because the checkpoint root is missing;
    # exist_ok=False: never silently reuse an existing experiment directory
    exp_dir.mkdir(parents=True, exist_ok=False)
    hparams_file = exp_dir / "hparams.json"
    with hparams_file.open("w") as handle:
        json.dump(obj=all_arguments, fp=handle, indent=2)

    # Create the Tensorboard Dir
    tb_dir = exp_dir / "tb_dir"
    tb_dir.mkdir()
    return exp_dir

In [14]:
#hyperparameters written to hparams.json below
#train_file / validation_file are left empty here -- presumably filled in before a real run
hparams = {"train_file": '',
        "validation_file": "",
        "mask_prob": 0.15,
        "epoch": 10,
        "batch_size":32,
        "checkpoint_every":1000,
        "learning_rate":1e-5,
        "weight_decay":0.001,
        "gradient_accumulation_steps":1,
        "lr_scheduler_type":'linear',
        "num_warmup_steps":1000,
        "seed":42}

In [21]:
#serialise the hyperparameter dict to hparams.json in the current working directory
with open('hparams.json', 'w') as f:
    json.dump(hparams, f)

In [None]:
# model.train()

def save_model(path: str='./', multiple_gpu: bool=True):
    '''
    Save a training checkpoint (epoch, model weights, optimizer state, loss) with torch.save.

    Args:
        path: directory the checkpoint is written to; the file is named `model_<step>.pt`.
        multiple_gpu: True when `model` is wrapped (e.g. by DataParallel) and the real
            network lives on `model.module`; False to save `model` directly.

    NOTE(review): `model`, `optimizer`, `epoch`, `loss` and `step` are read from the
    notebook's global namespace. In this notebook `epoch`, `step` and `loss` are only
    assigned inside run(), so they must be made available globally before calling this.
    '''
    #pick the network whose weights are saved, instead of duplicating the whole torch.save call
    net = model.module if multiple_gpu else model
    #os.path.join keeps the file inside `path` whether or not it ends with a separator
    torch.save({'epoch': epoch,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss},
               os.path.join(path, f'model_{step}.pt'))

In [22]:
36*72

2592

In [23]:
_/60

43.2