In [27]:
#standard imports
import os
import time
from itertools import islice
from typing import List, Optional

import torch
from tqdm.auto import tqdm

#tokenizers and datasets
import tokenizers
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import TemplateProcessing
from transformers import BertTokenizer

#### Set data paths

In [33]:
# Absolute locations of the WordPiece vocab file, the raw-data directory and the checkpoint directory.
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/'
checkpoint_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/'

# Names of the regular files directly inside the data directory (sub-directories are filtered out).
files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(vm_data, name)),
        os.listdir(vm_data),
    )
)
files


tensor(90.)

#### Load data from file

In [9]:
def load_data_seq_512(path: str, sample_size: Optional[int] = None) -> List[str]:
    '''
    Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        path: location of the text file to read.
        sample_size: when positive, only the first `sample_size` lines are read
            (the rest of the file is never loaded). `None` or `0` returns every
            line. A negative value keeps list-slice semantics, i.e. all lines
            except the last `abs(sample_size)`.

    Returns:
        List of stripped lines, in file order.
    '''
    with open(path) as f:
        if sample_size and sample_size > 0:
            # Lazily take the head of the file instead of materialising every line first.
            selected = islice(f, sample_size)
        elif sample_size:
            # Negative sizes need the total line count, so the full read is unavoidable here.
            selected = f.readlines()[:sample_size]
        else:
            selected = f
        # Consume the iterator while the file is still open.
        return [line.strip() for line in selected]

In [10]:
# Load every line of english_docs_aa.txt from the data directory into memory.
corpus_file = os.path.join(vm_data, 'english_docs_aa.txt')
data = load_data_seq_512(corpus_file)

#### Load tokenizer from file

In [15]:
def load_tokenizer_from_file(vocab_path: str, max_length: int = 512) -> BertWordPieceTokenizer:
    '''
    Build a lower-casing, accent-stripping BERT WordPiece tokenizer from a vocab file.

    The tokenizer truncates to `max_length` tokens, pads batches with the
    vocab's own [PAD] id, and wraps sequences as `[CLS] A [SEP]` (or
    `[CLS] A [SEP] B [SEP]` for pairs).

    Args:
        vocab_path: path to the WordPiece vocab file.
        max_length: maximum sequence length after truncation (default 512).

    Returns:
        The configured BertWordPieceTokenizer.

    Raises:
        ValueError: if the vocab lacks any of [PAD], [CLS], [SEP] or [MASK].
    '''
    tokenizer = BertWordPieceTokenizer(vocab_path, strip_accents=True, lowercase=True)

    # Resolve the special-token ids once and fail early with a clear message:
    # token_to_id returns None for unknown tokens, which would otherwise surface
    # later as an obscure error or as padding with the wrong id.
    special_ids = {
        token: tokenizer.token_to_id(token)
        for token in ("[PAD]", "[CLS]", "[SEP]", "[MASK]")
    }
    missing = [token for token, token_id in special_ids.items() if token_id is None]
    if missing:
        raise ValueError(f"vocab file {vocab_path!r} is missing special tokens: {missing}")

    tokenizer.enable_truncation(max_length=max_length)
    # Pad with the id the vocab actually assigns to [PAD] instead of relying on the default of 0.
    tokenizer.enable_padding(pad_id=special_ids["[PAD]"], pad_token="[PAD]")
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", special_ids["[CLS]"]),
            ("[SEP]", special_ids["[SEP]"]),
            ("[MASK]", special_ids["[MASK]"]),
        ],
    )
    return tokenizer

In [16]:
# Reuse the vocab path defined in the "Set data paths" cell rather than repeating the literal.
tokenizer = load_tokenizer_from_file(vm_tok_path)

#### Batch encode raw data

In [17]:
# Encode every line of `data` in a single batched call and report the wall-clock time.
# `batch` is the input later handed to mlm_pipe.
s = time.perf_counter()
batch = tokenizer.encode_batch(data)
e = time.perf_counter() - s
print(round(e,2), 'seconds')

58.5 seconds


In [25]:
# Number of encoded sequences. The bare name `t` was never assigned in this notebook
# and would raise NameError on a fresh kernel.
# NOTE(review): assumes `t` held len(batch), which matches the recorded output — confirm.
len(batch)

98862

#### Prep dataset with Masked tokens @ 15%

In [28]:
def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.15,
             mask_token_id: int = 4, max_special_token_id: int = 4) -> dict:
    '''
    Build masked-language-model tensors from a whole batch of tokenizer encodings.

    Each encoding becomes one row of a PyTorch tensor, so every encoding in
    `batch` must have the same length (torch.tensor rejects ragged input).

    Args:
        batch: encodings, e.g. the result of `tokenizer.encode_batch`.
        mlm_prob: probability with which each eligible token is replaced by the mask id.
        mask_token_id: id written into the masked positions (default 4).
        max_special_token_id: tokens with an id <= this value are never masked (default 4).
            NOTE(review): the defaults assume the special tokens occupy ids 0-4
            with [MASK] at 4 — confirm against the vocab file.

    Returns:
        dict with
            'input_ids': copy of the token ids with the selected positions set to `mask_token_id`,
            'attention_mask': attention masks taken from the encodings,
            'labels': the original, unmasked token ids.
    '''
    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    # Work on a copy so that `labels` keeps the unmasked ids.
    input_ids = labels.detach().clone()

    # One uniform draw per position; a position is masked when its draw falls
    # below mlm_prob AND the token is not one of the special tokens.
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) & (input_ids > max_special_token_id)
    # Single vectorised fill instead of a Python loop over rows.
    input_ids.masked_fill_(mask_arr, mask_token_id)

    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

In [29]:
# Build the masked inputs, attention masks and labels for the whole encoded batch (default mlm_prob).
encodings = mlm_pipe(batch)

Labels:   0%|          | 0/98862 [00:00<?, ?it/s]

Attention Mask:   0%|          | 0/98862 [00:00<?, ?it/s]

Masking Words:   0%|          | 0/98862 [00:00<?, ?it/s]

In [30]:
# Sanity check on the masking rate: positions holding the mask id (4) in the inputs,
# divided by label positions that are not the mask id. The denominator counts every
# non-mask label, so special-token positions are included in it.
# Tensor.sum() reduces in one call instead of iterating the rows with Python's sum().
masked_total = (encodings['input_ids'] == 4).sum()
non_mask_label_total = (encodings['labels'] != 4).sum()
masked_total / non_mask_label_total

tensor(0.1491)

#### Serialize encodings

In [32]:
# Serialize the dict of tensors to encodings.pt, resolved relative to the current working directory.
torch.save(encodings, './encodings.pt') 

In [None]:
torch.load('