In [2]:
#standard imports
import torch
import os
from typing import List
import time
from tqdm.auto import tqdm

#tokenizers and datasets
import tokenizers
from tokenizers import BertWordPieceTokenizer 
from tokenizers.processors import TemplateProcessing
from transformers import BertTokenizer
from whole_word_masking import create_masked_lm_predictions

#### Set data paths

In [3]:
# Locations of the WordPiece vocabulary, the raw text shards and the checkpoint directory.
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/text/partials/'
checkpoint_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/'

# Keep only regular files from the shard directory (sub-directories are skipped).
files = list(
    filter(
        lambda entry: os.path.isfile(os.path.join(vm_data, entry)),
        os.listdir(vm_data),
    )
)
files


['xactest', 'xac', 'xab', 'xad', 'xaatest', 'xaa', 'xabtest']

#### Load data from file

In [6]:
def load_data_seq_512(path: str, sample_size:int=None) -> List[str]:
    """
    Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        path: Path of the text file to read. The file is decoded as UTF-8.
        sample_size: Maximum number of lines to read from the top of the file.
            ``None`` (the default) reads every line; ``0`` returns an empty list.

    Returns:
        One stripped string per line read, in file order.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    if sample_size is not None and sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    lines = []
    with open(path, encoding="utf-8") as f:
        # Stream the file line by line so that taking a small sample never
        # pulls the whole file into memory first.
        for line_no, line in enumerate(f):
            if sample_size is not None and line_no >= sample_size:
                break
            lines.append(line.strip())

    return lines

In [17]:
# Load the 'xaatest' shard from the data directory; no sample_size is passed,
# so every line of the file is read.
data = load_data_seq_512(os.path.join(vm_data, 'xaatest'))

In [18]:
# Number of lines loaded from the shard.
len(data)

5000

#### Load tokenizer from file

In [19]:
def load_tokenizer_from_file(vocab_path: str, max_length: int = 512) -> BertWordPieceTokenizer:
    """
    Build a lower-casing, accent-stripping BERT WordPiece tokenizer from a vocab file.

    The tokenizer truncates to ``max_length`` tokens, pads batches with the
    vocabulary's own ``[PAD]`` id, and wraps sequences as ``[CLS] A [SEP]``
    (or ``[CLS] A [SEP] B [SEP]`` for pairs, with the second segment typed 1).

    Args:
        vocab_path: Path to the WordPiece vocabulary text file.
        max_length: Maximum number of tokens kept per encoded sequence.

    Returns:
        The configured ``BertWordPieceTokenizer``.

    Raises:
        ValueError: If the vocabulary lacks ``[PAD]``, ``[CLS]``, ``[SEP]`` or ``[MASK]``.
    """
    tokenizer = BertWordPieceTokenizer(vocab_path, strip_accents=True, lowercase=True)

    # Resolve the special-token ids once and fail early with a clear message,
    # instead of handing a None id to the padding / post-processing setup.
    special_ids = {
        token: tokenizer.token_to_id(token)
        for token in ("[PAD]", "[CLS]", "[SEP]", "[MASK]")
    }
    missing = [token for token, token_id in special_ids.items() if token_id is None]
    if missing:
        raise ValueError(f"Vocabulary {vocab_path!r} is missing special tokens: {missing}")

    tokenizer.enable_truncation(max_length=max_length)
    # Pad with the id the vocabulary actually assigns to [PAD] rather than
    # relying on the library default of 0.
    tokenizer.enable_padding(pad_id=special_ids["[PAD]"], pad_token="[PAD]")
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", special_ids["[CLS]"]),
            ("[SEP]", special_ids["[SEP]"]),
            ("[MASK]", special_ids["[MASK]"]),
        ],
    )
    return tokenizer

In [20]:
# Reuse the vocab path defined in the data-paths cell instead of repeating the literal.
tokenizer = load_tokenizer_from_file(vm_tok_path)

#### Batch encode raw data

In [21]:
# Tokenize every loaded line in one call and report the wall-clock time taken.
s = time.perf_counter()
batch = tokenizer.encode_batch(data)
# e holds the elapsed time in seconds (perf_counter difference).
e = time.perf_counter() - s
print(round(e,2), 'seconds')

3.79 seconds


In [31]:
# Token strings of the first encoded line, kept for inspection.
# NOTE(review): `test` is rebound to the result of torch.load in a later cell.
test = batch[0].tokens

In [32]:
# Display the tokens of the first encoding.
test

['[CLS]',
 'introduction',
 'under',
 'normal',
 'physiological',
 'conditions',
 ',',
 'all',
 'cells',
 'in',
 'the',
 'body',
 'are',
 'exposed',
 'chronically',
 'to',
 'oxidants',
 'from',
 'both',
 'endogenous',
 'and',
 'exogenous',
 'sources',
 ';',
 'yet',
 'the',
 'intracellular',
 '“',
 'redox',
 'buffer',
 '”',
 'mechanism',
 'provides',
 'significant',
 'protection',
 'mainly',
 'by',
 'the',
 'antioxidant',
 'network',
 '[',
 '1',
 ']',
 '.',
 'disturbance',
 'in',
 'the',
 'pro',
 '##oxid',
 '##ant',
 '-',
 'antioxidant',
 'balance',
 'in',
 'favor',
 'of',
 'the',
 'former',
 'leads',
 'to',
 'what',
 'is',
 'known',
 'as',
 'oxidative',
 'stress',
 '[',
 '2',
 ']',
 '.',
 'this',
 'oxidative',
 'stress',
 'and',
 'reactive',
 'oxygen',
 'species',
 '(',
 'ros',
 ')',
 'can',
 'cause',
 'damage',
 'to',
 'dna',
 ',',
 'proteins',
 'and',
 'lipids',
 'an',
 'd',
 'end',
 'up',
 'with',
 'an',
 'epidemic',
 'of',
 'non',
 'communicable',
 'chronic',
 'human',
 'diseases',

#### Prep dataset with Masked tokens @ 15%

In [28]:
def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.15,
             mask_token_id: int = 4, max_special_token_id: int = 4) -> dict:
    '''
    Build masked-language-model tensors from a whole batch of tokenizer encodings.

    The ``ids`` and ``attention_mask`` of every encoding are stacked into
    tensors, so all encodings must share one length (e.g. via tokenizer padding).
    Each position is then selected independently with probability ``mlm_prob``;
    selected positions whose id is greater than ``max_special_token_id`` are
    overwritten with ``mask_token_id`` in the returned ``input_ids``.

    Args:
        batch: Encodings to convert.
        mlm_prob: Per-position probability of being selected for masking.
        mask_token_id: Id written at masked positions (defaults to 4).
        max_special_token_id: Ids less than or equal to this value are never
            masked (defaults to 4).

    Returns:
        dict with keys 'input_ids' (masked ids), 'attention_mask' and
        'labels' (the original, unmasked ids at every position).
    '''
    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    # Work on a copy so the labels keep the original ids.
    input_ids = labels.detach().clone()

    # One uniform draw per position; reserved ids are excluded from masking.
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) & (input_ids > max_special_token_id)
    # A single boolean-mask assignment replaces the per-row Python loop.
    input_ids[mask_arr] = mask_token_id

    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

In [29]:
# Build the masked inputs, attention mask and labels for the whole encoded
# batch, using mlm_pipe's default masking probability.
encodings = mlm_pipe(batch)

Labels:   0%|          | 0/98862 [00:00<?, ?it/s]

Attention Mask:   0%|          | 0/98862 [00:00<?, ?it/s]

Masking Words:   0%|          | 0/98862 [00:00<?, ?it/s]

In [30]:
# Sanity check: positions holding id 4 in the masked inputs, divided by the
# positions in the labels whose id is not 4. Tensor.sum() reduces in one call
# instead of iterating over the rows with Python's builtin sum.
(encodings['input_ids'] == 4).sum() / (encodings['labels'] != 4).sum()

tensor(0.1491)

#### Serialize encodings

In [32]:
# Serialize the encodings dict to encodings.pt in the current working directory.
torch.save(encodings, './encodings.pt') 

In [3]:
# Load a previously serialized encodings file. This rebinds `test`, which an
# earlier cell used for a list of token strings.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
# NOTE(review): this is a different file from the ./encodings.pt written by the
# save cell — confirm it is the intended one.
test = torch.load('/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/encodings_395390_combined4Gb_1.txt.pt')

In [149]:
# First entry of the loaded 'labels' (presumably the ids of one sequence — confirm).
tokens = test['labels'][0]

In [150]:
# Map the ids of the first loaded label row back to token strings.
# Reading from `test` (instead of rebinding `tokens` from itself) keeps the cell
# re-runnable, and .tolist() hands plain Python ints to id_to_token.
tokens = [tokenizer.id_to_token(tok) for tok in test['labels'][0].tolist()]


In [42]:
# Look up the id of the '##ate' word piece. The value is not displayed because
# this is not the last expression of the cell.
tokenizer.token_to_id('##ate')
# Hand-written word-piece split kept as a scratch example.
testrun = ['pom','##eg', '##ran','##ate']

In [47]:
# Alphabetically sorted list of every token string in the tokenizer vocabulary
# (iterating the vocab dict yields its keys).
vocab = sorted(tokenizer.get_vocab())

In [55]:
# 15% of a 512-token sequence.
# NOTE(review): presumably this motivates the 80 passed to
# create_masked_lm_predictions in other cells — confirm.
0.15 * 512

76.8

In [151]:
# Run the masking helper from the local whole_word_masking module on the token strings.
# NOTE(review): `rng` is not defined in any cell visible in this notebook —
# presumably a random number generator; define it before this cell.
# NOTE(review): 0.15 and 80 are presumably the masking probability and the
# maximum number of predictions — confirm against whole_word_masking.
output = create_masked_lm_predictions(tokens, 0.15, 80, vocab, rng)

In [134]:
# Expose the ids as a NumPy array on the CPU.
# NOTE(review): only works while `tokens` is a tensor; another cell rebinds
# `tokens` to a list of token strings.
temp = tokens.detach().cpu().numpy()

In [140]:
# Flatten a two-level iterable of ids into a single list of token strings.
# NOTE(review): the nested loop needs every element of `tokens` to be iterable
# itself (e.g. a 2-D tensor of ids) — confirm which `tokens` this cell was run against.
samples = [tokenizer.id_to_token(tok) for tensor in tokens for tok in tensor]

In [147]:
# Join all token strings with spaces, then cut the text at every '[SEP]' marker.
# This rebinds `samples` from a list of tokens to a list of text chunks, so the
# cell is not idempotent.
samples = ' '.join(samples).split('[SEP]')

In [127]:
import numpy as np

def tester(n=100):
    """
    Estimate the average share of '[MASK]' entries produced by the masking helper.

    Calls create_masked_lm_predictions ``n`` times on the notebook-level
    ``tokens``, ``vocab`` and ``rng`` (with 0.20 and 80 as its second and third
    arguments), and for each call computes the fraction of returned tokens that
    equal '[MASK]'.

    Args:
        n: Number of masking runs to average over.

    Returns:
        The mean of the per-run '[MASK]' fractions, as computed by np.mean.
    """
    ratios = []
    for _ in range(n):
        masked_tokens, _unused_labels = create_masked_lm_predictions(tokens, 0.20, 80, vocab, rng)
        n_masked = sum(1 for tok in masked_tokens if tok == '[MASK]')
        ratios.append(n_masked / len(masked_tokens))
    return np.mean(ratios)

In [148]:
output = 

tensor([[    2,  2765,  2166,  ...,  1970, 26915,     3],
        [    2,    21,  2765,  ...,    16,  1637,     3],
        [    2,  2765,  6500,  ..., 17977,  1672,     3],
        ...,
        [    2,    21,    18,  ...,    18,  1784,     3],
        [    2,  3680,  3337,  ...,    30,  1763,     3],
        [    2,  2765, 20044,  ...,    13,    18,     3]])

In [153]:
# NOTE(review): this call fails with "TypeError: TextInputSequence must be str"
# (see the output below) because `output` is not a string. Convert it to text
# first or remove this cell.
tokenizer.encode_batch([output])

TypeError: TextInputSequence must be str