In [2]:
%load_ext autoreload
%autoreload 2

#standard imports
import torch
import os
from typing import List
import time
from tqdm.auto import tqdm
import numpy as np

#tokenizers and datasets
import tokenizers
from tokenizers import BertWordPieceTokenizer 
from tokenizers.processors import TemplateProcessing
from transformers import BertTokenizer
from whole_word_masking_ids import create_masked_lm_ids

#### Set data paths

In [5]:
# Hard-coded locations of the vocab file, the raw text directory and the checkpoint directory.
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/text/partials/xaatest'
checkpoint_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/'

# Regular files in `vm_data` whose name starts with 'combined', sorted by name; keep the first two.
files = sorted(
    name
    for name in os.listdir(vm_data)
    if name.startswith('combined') and os.path.isfile(os.path.join(vm_data, name))
)[:2]
files


['combined4Gb_1.txt', 'combined4Gb_2.txt']

#### Load data from file

In [75]:
def load_data_seq_512(path: str, sample_size:int=None) -> List[str]:
    """
    Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        path: Location of the text file to read.
        sample_size: Maximum number of leading lines to return. Any falsy value
            (``None`` or ``0``) returns every line. A negative value keeps its
            list-slice meaning (drop that many lines from the end of the file).

    Returns:
        One stripped string per line read, in file order.
    """
    from itertools import islice

    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(path, encoding='utf-8') as f:
        if sample_size and sample_size > 0:
            # Stop after `sample_size` lines instead of loading the whole file and slicing it.
            selected = islice(f, sample_size)
        elif sample_size:
            # Negative slice bounds need the full line list to count back from the end.
            selected = f.readlines()[:sample_size]
        else:
            selected = f
        lines = [line.strip() for line in selected]

    return lines

In [94]:
# No sample_size is passed, so every line of the file is loaded into memory.
data = load_data_seq_512(os.path.join(vm_data, 'english_docs_aa.txt'))

In [95]:
# Number of lines loaded from the file.
len(data)

98862

#### Load tokenizer from file

In [7]:
def load_tokenizer_from_file(vocab_path: str, max_length: int = 512) -> BertWordPieceTokenizer:
    """
    Build a lowercasing, accent-stripping WordPiece tokenizer from a vocab file.

    The tokenizer truncates to `max_length` tokens, pads batches, and wraps
    inputs as ``[CLS] A [SEP]`` (or ``[CLS] A [SEP] B [SEP]`` for pairs).

    Args:
        vocab_path: Path to the WordPiece vocabulary text file.
        max_length: Truncation length in tokens. Defaults to 512.

    Returns:
        The configured BertWordPieceTokenizer.
    """
    tokenizer = BertWordPieceTokenizer(vocab_path, strip_accents=True, lowercase=True)
    tokenizer.enable_truncation(max_length=max_length)

    # Pad with the id the vocab actually assigns to [PAD] rather than assuming it is 0.
    pad_token = "[PAD]"
    pad_id = tokenizer.token_to_id(pad_token)
    if pad_id is None:
        # The vocab has no [PAD] entry: keep the library's default padding settings.
        tokenizer.enable_padding()
    else:
        tokenizer.enable_padding(pad_id=pad_id, pad_token=pad_token)

    # Each special token named in the templates must be registered with its vocab id.
    special_tokens = [
        (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]", "[MASK]")
    ]
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=special_tokens,
    )
    return tokenizer

In [8]:
# Reuse the vocab path from the data-paths cell instead of repeating the literal.
tokenizer = load_tokenizer_from_file(vm_tok_path)

In [9]:
def create_source_ids(tokenizer):
    """
    Return every token id in the tokenizer's vocabulary.

    Args:
        tokenizer: Object whose `get_vocab()` returns a token -> id mapping.

    Returns:
        List of ids, in the iteration order of the vocabulary mapping.
    """
    # The mapping already holds the ids, so no per-token `token_to_id` lookup is needed.
    return list(tokenizer.get_vocab().values())

In [11]:
# Vocabulary ids, later handed to mlm_pipe as its `source_ids` argument.
id_list = create_source_ids(tokenizer)

#### Batch encode raw data

In [100]:
# Encode the whole corpus in one call and report the wall-clock time it took.
start = time.perf_counter()
batch = tokenizer.encode_batch(data)
elapsed = time.perf_counter() - start
print(f'{round(elapsed, 2)} seconds')

75.01 seconds


#### Prepare dataset with masked tokens (15% masking probability)

In [120]:
def mlm_pipe(batch: List[tokenizers.Encoding], source_ids: list, tokenizer, mlm_prob=0.15) -> dict:
    '''
    Turn a batch of encodings into the tensors used for masked-language-model training.

    Masking is delegated to `create_masked_lm_ids`, which is called once per
    encoding with that encoding's ids, `source_ids` and `tokenizer`.

    Args:
        batch: Encodings to convert; each must expose `.ids` and `.attention_mask`.
        source_ids: Passed through unchanged to `create_masked_lm_ids`.
        tokenizer: Passed through unchanged to `create_masked_lm_ids`.
        mlm_prob: Accepted by the signature but not read anywhere in this function.

    Returns:
        dict with three tensors:
            'input_ids'      - built from the output of `create_masked_lm_ids`,
            'attention_mask' - built from each encoding's `.attention_mask`,
            'labels'         - built from each encoding's `.ids`.
    '''

    def as_tensor(extract, description):
        # One progress-tracked pass over the batch, collected into a single tensor.
        return torch.tensor([extract(encoding) for encoding in tqdm(batch, description)])

    # Three passes, in this order: labels, attention mask, masked inputs.
    label_ids = as_tensor(lambda encoding: encoding.ids, 'Labels')
    attention = as_tensor(lambda encoding: encoding.attention_mask, 'Attention Mask')
    masked_ids = as_tensor(
        lambda encoding: create_masked_lm_ids(encoding.ids, source_ids, tokenizer),
        'Input Ids',
    )

    return {'input_ids': masked_ids, 'attention_mask': attention, 'labels': label_ids}

In [121]:
# Build the input_ids / attention_mask / labels tensors for the whole encoded batch.
encodings = mlm_pipe(batch, id_list, tokenizer)

Labels:   0%|          | 0/98862 [00:00<?, ?it/s]

Attention Mask:   0%|          | 0/98862 [00:00<?, ?it/s]

Input Ids:   0%|          | 0/98862 [00:00<?, ?it/s]

In [122]:
# Count of input positions equal to 4 divided by count of label positions not equal to 4.
# NOTE(review): 4 is presumably the [MASK] id — confirm with tokenizer.token_to_id('[MASK]').
# The denominator counts every label position, so special and padding ids are included.
# Tensor.sum() reduces in one call instead of looping over rows with the builtin sum.
(encodings['input_ids'] == 4).sum() / (encodings['labels'] != 4).sum()

tensor(0.1203)

In [123]:
# Shape of the masked-input tensor.
encodings['input_ids'].shape

torch.Size([98862, 512])

#### Serialize encodings

In [32]:
# Write the dict of tensors to a file in the current working directory.
torch.save(encodings, './encodings.pt') 

In [3]:
# Loads a different, previously written encodings file — not the './encodings.pt' saved above.
# NOTE(review): torch.load unpickles its input, so only load files from a trusted source.
# NOTE(review): this cell's execution count is out of order, and `test` is not used in the cells shown here.
test = torch.load('/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/encodings_395390_combined4Gb_1.txt.pt')

In [124]:
def mask_checker(num):
    """
    Pair each input token of example `num` with its label token.

    Reads the notebook-level `encodings` dict and `tokenizer`.

    Args:
        num: Row index into encodings['input_ids'] and encodings['labels'].

    Returns:
        List of (input_token, label_token) string tuples, one per position.
    """
    input_row = encodings['input_ids'][num].detach().cpu().numpy()
    label_row = encodings['labels'][num].detach().cpu().numpy()

    pairs = []
    for input_id, label_id in zip(input_row, label_row):
        pairs.append((tokenizer.id_to_token(input_id), tokenizer.id_to_token(label_id)))
    return pairs

In [126]:
# Show (input token, label token) pairs for the first example.
mask_checker(0)

[('[CLS]', '[CLS]'),
 ('introduction', 'introduction'),
 ('under', 'under'),
 ('normal', 'normal'),
 ('physiological', 'physiological'),
 ('conditions', 'conditions'),
 (',', ','),
 ('all', 'all'),
 ('cells', 'cells'),
 ('in', 'in'),
 ('the', 'the'),
 ('[MASK]', 'body'),
 ('are', 'are'),
 ('exposed', 'exposed'),
 ('chronically', 'chronically'),
 ('to', 'to'),
 ('oxidants', 'oxidants'),
 ('from', 'from'),
 ('both', 'both'),
 ('endogenous', 'endogenous'),
 ('and', 'and'),
 ('exogenous', 'exogenous'),
 ('sources', 'sources'),
 ('[MASK]', ';'),
 ('yet', 'yet'),
 ('the', 'the'),
 ('[MASK]', 'intracellular'),
 ('“', '“'),
 ('redox', 'redox'),
 ('buffer', 'buffer'),
 ('”', '”'),
 ('mechanism', 'mechanism'),
 ('provides', 'provides'),
 ('significant', 'significant'),
 ('protection', 'protection'),
 ('[MASK]', 'mainly'),
 ('by', 'by'),
 ('the', 'the'),
 ('antioxidant', 'antioxidant'),
 ('network', 'network'),
 ('[', '['),
 ('1', '1'),
 (']', ']'),
 ('.', '.'),
 ('disturbance', 'disturbance'),
 

In [13]:
# Full path of the first entry in `files`.
file = os.path.join(vm_data, files[0])

In [14]:
# Display the joined path.
file

'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/text/combined4Gb_1.txt'

In [16]:
# Strip the directory and the final extension to get a base name for the output file.
# os.path handles the platform's separator and keeps any dots inside the base name.
filename = os.path.splitext(os.path.basename(file))[0]
filename + '.pt'

'combined4Gb_1.pt'