In [43]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertForMaskedLM, BertConfig, AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
import tokenizers
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import TemplateProcessing
from typing import List
import deepspeed
import loguru
import nvgpu
import os, time
from tqdm import tqdm
import json

In [29]:
logger = loguru.logger

for gpu in nvgpu.gpu_info():
    logger.info(gpu)
    
local_rank = 0
device = (
        torch.device("cuda", local_rank)
        if (local_rank > -1) and torch.cuda.is_available()
        else torch.device("cpu")
    )
tokenizer_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
text_data_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/'

files = [f for f in os.listdir(text_data_path) if os.path.isfile(os.path.join(text_data_path, f))]
logger.info(f'{files}')

# #### Instantiate pretrained tokenizer from file
alternative_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

tokenizer = BertWordPieceTokenizer(tokenizer_path, strip_accents=True, lowercase=True)
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[MASK]", tokenizer.token_to_id("[MASK]"))
    ],
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-03-30 01:22:30.348 | INFO     | __main__:<module>:4 - {'index': '0', 'type': 'Tesla V100-PCIE-16GB', 'uuid': 'GPU-a977becb-feef-b927-7929-3a141db5315d', 'mem_used': 4, 'mem_total': 16384, 'mem_used_percent': 0.0244140625}
2022-03-30 01:22:30.349 | INFO     | __main__:<module>:4 - {'index': '1', 'type': 'Tesla V100-PCIE-16GB', 'uuid': 'GPU-90baff9b-3e43-3db5-b771-b26da4945020', 'mem_used': 4, 'mem_total': 16384, 'mem_used_percent': 0.0244140625}
2022-03-30 01:22:30.351 | INFO     | __main__:<module>:16 - ['xac']


In [30]:
def load_data_seq_512(path: str, sample_size:int=None) -> List[str]:
    with open(path) as f:
        if sample_size:
            lines = [line.strip() for line in f.readlines()[:sample_size]]
        else:
            lines = [line.strip() for line in f.readlines()]
    
    return lines

def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.15) -> dict:
    '''
    Given a single instance from a batch of encodings, return masked inputs and associated arrays.
    Converts tokenizer.Encoding into a pytorch tensor.
    '''
    
    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    input_ids = labels.detach().clone()
    
    #default masking prob = 15%, don't mask special tokens 
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) * (input_ids > 4)
    for i in tqdm(range(input_ids.shape[0]), 'Masking Words'):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        input_ids[i, selection] = 4
        
    # temp = input_ids.flatten()
    # percent = sum(temp == 4)/sum(labels.flatten() != 4)
    # print(percent)
    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings


In [31]:
logger.info('Loading data from disk into memory...')
start = time.perf_counter()
results = load_data_seq_512(os.path.join(text_data_path, files[0]))
end = time.perf_counter() - start
logger.info(f'Loading completed: {round(end, 2)} seconds to load {len(results)} lines/documents.')

logger.info('Batch encoding data...')
s = time.perf_counter()
batch = tokenizer.encode_batch(results)
e = time.perf_counter() - s
logger.info(f'Batch encoding completed, took {round(e,2)} seconds to complete')

del results

logger.info('Masking tokens')
encodings = mlm_pipe(batch)
logger.info('Masking tokens completed')
#encodings = torch.load(encodings_data_path)

del batch

percent = sum(sum(encodings['input_ids'].detach().numpy() == 4)) / sum(sum(encodings['labels'].detach().numpy() != 4))
logger.info(f'Total of {round(percent * 100,2)}% of tokens are masked.')



2022-03-30 01:22:31.705 | INFO     | __main__:<module>:1 - Loading data from disk into memory...
2022-03-30 01:22:31.957 | INFO     | __main__:<module>:5 - Loading completed: 0.25 seconds to load 5000 lines/documents.
2022-03-30 01:22:31.958 | INFO     | __main__:<module>:7 - Batch encoding data...
2022-03-30 01:22:35.098 | INFO     | __main__:<module>:11 - Batch encoding completed, took 3.14 seconds to complete
2022-03-30 01:22:35.103 | INFO     | __main__:<module>:15 - Masking tokens
Labels: 100%|██████████| 5000/5000 [00:00<00:00, 38148.97it/s]
Attention Mask: 100%|██████████| 5000/5000 [00:00<00:00, 125266.67it/s]
Masking Words: 100%|██████████| 5000/5000 [00:00<00:00, 30068.42it/s]
2022-03-30 01:22:35.740 | INFO     | __main__:<module>:17 - Masking tokens completed
2022-03-30 01:22:36.300 | INFO     | __main__:<module>:23 - Total of 14.92% of tokens are masked.


In [32]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    
    def __getitem__(self, i):
        return {key : tensor[i] for key, tensor in self.encodings.items()}


d = Dataset(encodings)
BATCH_SIZE = 16
loader = torch.utils.data.DataLoader(d, batch_size=BATCH_SIZE, pin_memory=True, shuffle=True)

In [42]:
with open('zero2_config.json') as f:
    ds_config = json.loads(f.read())
ds_config

{'fp16': {'enabled': 'auto',
  'loss_scale': 0,
  'loss_scale_window': 1000,
  'initial_scale_power': 16,
  'hysteresis': 2,
  'min_loss_scale': 1},
 'optimizer': {'type': 'AdamW',
  'params': {'lr': 'auto',
   'betas': 'auto',
   'eps': 'auto',
   'weight_decay': 'auto'}},
 'scheduler': {'type': 'WarmupLR',
  'params': {'warmup_min_lr': 'auto',
   'warmup_max_lr': 'auto',
   'warmup_num_steps': 'auto'}},
 'zero_optimization': {'stage': 2,
  'offload_optimizer': {'device': 'cpu', 'pin_memory': True},
  'allgather_partitions': True,
  'allgather_bucket_size': 200000000.0,
  'overlap_comm': True,
  'reduce_scatter': True,
  'reduce_bucket_size': 200000000.0,
  'contiguous_gradients': True},
 'gradient_accumulation_steps': 'auto',
 'gradient_clipping': 'auto',
 'steps_per_print': 2000,
 'train_batch_size': 'auto',
 'train_micro_batch_size_per_gpu': 'auto',
 'wall_clock_breakdown': False}

In [44]:
bert_config = BertConfig(vocab_size=30500)
model = BertForMaskedLM(config=bert_config)

In [46]:


# Define Trainer
args = TrainingArguments(
    output_dir="/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/",
    do_train=True,
    evaluation_strategy="epoch",
    num_train_epochs=36,
    per_device_train_batch_size=8,
    logging_strategy="steps",
    logging_steps=1000,
    save_strategy="epoch",
    seed=0,
    local_rank=0,
    deepspeed=ds_config,
    dataloader_num_workers=os.cpu_count(),
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=d,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

[2022-03-30 01:47:31,359] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--------------------------------------------------------------------------
[[54709,1],0]: A high-performance Open MPI point-to-point messaging module
was unable to find any relevant network interfaces:

Module: OpenFabrics (openib)
  Host: bert-pretraining-vm

Another transport will be used instead, although this may result in
lower performance.

btl_base_warn_component_unused to 0.
--------------------------------------------------------------------------
huggingface/tokenizers: The current process just got forked, after parallelism has already

In [47]:
# Train pre-trained model
trainer.train()

[2022-03-30 01:48:05,666] [INFO] [logging.py:69:log_dist] [Rank 0] DeepSpeed info: version=0.6.0, git-hash=unknown, git-branch=unknown


KeyboardInterrupt: 