In [1]:
#standard imports
import os, time, json, datetime, pytz
from tqdm.auto import tqdm
from typing import List, Union, Dict
from re_sent_splitter import split_into_sentences
from pathlib import Path
import pathlib
import numpy as np
from math import floor

#distributed imports
import torch
from torch.nn.parallel import DistributedDataParallel as DDP, DataParallel
import deepspeed

#tokenizers and datasets
from tokenizers import BertWordPieceTokenizer 
from tokenizers.processors import TemplateProcessing
import tokenizers

#transformer imports
from transformers import BertTokenizer
from transformers import BertForMaskedLM, BertConfig, AdamW
from transformers import pipeline

In [2]:
# Report the hardware properties of every CUDA device PyTorch can see.
visible_gpu_count = torch.cuda.device_count()
for d in range(visible_gpu_count):
    device_properties = torch.cuda.get_device_properties(d)
    print(device_properties)

_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)
_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)


In [3]:
!nvidia-smi

Sat Mar 26 04:11:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000001:00:00.0 Off |                  Off |
| N/A   40C    P0    67W / 300W |      3MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000002:00:00.0 Off |                  Off |
| N/A   56C    P0    77W / 300W |      3MiB / 32510MiB |      0%      Default |
|       

#### Set Tokenizer and Data paths

In [4]:
# Root of the project checkout. Every path below is derived from it, so the
# notebook can be pointed at another machine by setting the NSBERT_ROOT
# environment variable instead of editing several absolute paths.
# The default reproduces the original hard-coded location.
project_root = os.environ.get(
    'NSBERT_ROOT',
    '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT',
)

# Vocabulary file that both tokenizers are built from.
tokenizer_path = os.path.join(project_root, 'Preprocessing', 'Tokenization', 'wp-vocab-30500-vocab.txt')
# Directory holding the raw text files; the final '' keeps the trailing separator.
text_data_path = os.path.join(project_root, 'Data', 'Text', '')
# Saved encodings file that is read back with torch.load.
encodings_data_path = os.path.join(project_root, 'Data', 'Encodings', 'encodings_0_395390.pt')
working_data =  'combined_4Gb.txt'

# Regular files (sub-directories excluded) directly inside the text directory.
# NOTE: os.listdir returns entries in arbitrary order.
files = [f for f in os.listdir(text_data_path) if os.path.isfile(os.path.join(text_data_path, f))]
files

['small_sample_10000.txt',
 'english_docs_ad.txt',
 'english_docs_ac.txt',
 'combined_4Gb.txt',
 'english_docs_aa.txt',
 'english_docs_ab.txt']

#### Instantiate pretrained tokenizer from file

In [5]:
# Two tokenizers are built from the same vocabulary file:
#   * alternative_tokenizer - transformers' BertTokenizer
#   * tokenizer             - tokenizers' BertWordPieceTokenizer, configured below
alternative_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

tokenizer = BertWordPieceTokenizer(tokenizer_path, strip_accents=True, lowercase=True)
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding()

# Resolve the vocabulary id of each special token the template refers to.
special_token_ids = [
    (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]", "[MASK]")
]
# Wrap single sequences and sequence pairs in [CLS] / [SEP] markers.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=special_token_ids,
)



#### Load data from local
The data is a 98,000-line file; each line is one document of roughly 12,000 characters taken from PubMed articles.

In [6]:
#load data from disk
def load_data_from_disk(path: str, sample_size:int=None, min_tokens_per_sent: int=4) -> List[str]:
    '''
    Load one-document-per-line text from disk and return its sentences.

    Steps:
       1. Read the file line by line (each line is treated as one document),
          stopping after `sample_size` lines when that is given.
       2. Split every document into sentences with `split_into_sentences`.
       3. Keep only sentences with MORE than `min_tokens_per_sent`
          whitespace-separated tokens (default 4, i.e. at least 5 tokens).
       4. Return the surviving sentences as one flat list.

    Args:
        path: Path of the text file to read.
        sample_size: Number of leading lines to read. None or 0 reads the
            whole file; a negative value drops that many trailing lines
            (plain slice semantics).
        min_tokens_per_sent: Sentences with this many tokens or fewer are dropped.

    Returns:
        A flat list of sentence strings, in document order.
    '''
    from itertools import islice

    #load data
    with open(path) as f:
        if sample_size and sample_size > 0:
            # islice stops reading after `sample_size` lines, so a small
            # sample of a multi-gigabyte file does not load the whole file.
            selected = islice(f, sample_size)
        elif sample_size:
            # Negative sizes need the full line list to slice from the end.
            selected = f.readlines()[:sample_size]
        else:
            selected = f
        lines = [line.strip() for line in selected]

    #split data into sentences
    sentences = [split_into_sentences(i) for i in tqdm(lines, 'Sentence Splitter')]

    #keep only sentences with more than `min_tokens_per_sent` tokens
    all_sentences = []
    for doc in tqdm(sentences, 'Filter Senteces'):
        for sentence in doc:
            if len(sentence.split()) > min_tokens_per_sent:
                all_sentences.append(sentence)
    print(f'Return a list of {len(all_sentences)} sentences')

    return all_sentences

In [7]:
def load_data_seq_512(path: str, sample_size:int=None) -> List[str]:
    '''
    Read a text file and return its lines with surrounding whitespace stripped.

    No sentence splitting or filtering is applied; each line is returned as is
    (apart from the strip).

    Args:
        path: Path of the text file to read.
        sample_size: Number of leading lines to read. None or 0 reads the
            whole file; a negative value drops that many trailing lines
            (plain slice semantics).

    Returns:
        The stripped lines, in file order.
    '''
    from itertools import islice

    with open(path) as f:
        if sample_size and sample_size > 0:
            # islice stops reading after `sample_size` lines instead of
            # loading the entire file just to keep its first few lines.
            selected = islice(f, sample_size)
        elif sample_size:
            # Negative sizes need the full line list to slice from the end.
            selected = f.readlines()[:sample_size]
        else:
            selected = f
        lines = [line.strip() for line in selected]

    return lines

In [13]:
#results = load_data_from_disk(os.path.join(vm_data, ))
# Time how long it takes to read one text file into memory.
# The file is the first entry of `files`, resolved against `text_data_path`
# (both defined in the path-configuration cell).
# NOTE(review): os.listdir order is arbitrary, so files[0] may differ between
# machines - confirm this is the intended file.
start = time.perf_counter()
results = load_data_seq_512(os.path.join(text_data_path, files[0]))
end = time.perf_counter() - start  # elapsed seconds, despite the name
print(round(end, 2))

0.49


In [14]:
len(results)

10000

In [10]:
# segments = []
# temp_holder = []
# def create_512_sequences(
# for sentence in tqdm(results, 'Tokenizing'):
#     tokens = alternative_tokenizer.encode(sentence)
#     if len(temp_holder) + len(tokens) < 512:
#         temp_holder.extend(tokens[1:-1])
#     else:
#         temp_holder.insert(0,2)
#         segments.append(temp_holder + [3])
#         temp_holder = []
        

#### Batch encode a chunk of data

In [15]:
# Encode every entry of `results` in one call and report the elapsed wall-clock time.
encode_started = time.perf_counter()
batch = tokenizer.encode_batch(results)
encode_seconds = time.perf_counter() - encode_started
print(round(encode_seconds, 2), 'seconds')


7.72 seconds


In [16]:
#decrease load on memory
#del results

#### Create pipeline for random masking of 15% of input tokens

In [17]:
def mlm_pipe(batch: List[tokenizers.Encoding], mlm_prob=0.15,
             mask_token_id: int=4, max_special_token_id: int=4) -> dict:
    '''
    Build masked-language-model tensors from a batch of tokenizer encodings.

    The `ids` and `attention_mask` of every encoding are stacked into tensors.
    A copy of the ids is then corrupted: each position whose id is greater than
    `max_special_token_id` is replaced by `mask_token_id` with probability
    `mlm_prob`. Positions with an id at or below `max_special_token_id` are
    never replaced.

    Args:
        batch: Encodings to convert. They are stacked with torch.tensor, which
            presumably requires them to share one length (e.g. via padding).
        mlm_prob: Per-position probability of masking an eligible token.
        mask_token_id: Id written into masked positions.
        max_special_token_id: Largest id treated as a special token.

    Returns:
        dict with
          'input_ids'      - the ids with masked positions set to `mask_token_id`,
          'attention_mask' - the stacked attention masks,
          'labels'         - the original, unmasked ids.
    '''

    labels = torch.tensor([x.ids for x in tqdm(batch, 'Labels')])
    mask = torch.tensor([x.attention_mask for x in tqdm(batch, 'Attention Mask')])
    input_ids = labels.detach().clone()

    # One uniform draw per position; a position is masked when its draw falls
    # below mlm_prob AND it does not hold a special token.
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mlm_prob) & (input_ids > max_special_token_id)
    # Boolean-mask assignment updates all selected positions in one operation.
    input_ids[mask_arr] = mask_token_id

    # NOTE(review): `labels` keeps the original id at every position, so a loss
    # computed from it also covers unmasked and padded positions unless the
    # consumer ignores them - confirm this is intended.
    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    return encodings

#### Load Encodings from disk

In [8]:
#encodings = mlm_pipe(batch)
# Read the saved encodings from disk instead of rebuilding them with mlm_pipe.
# NOTE(review): torch.load unpickles the file by default, which can execute
# arbitrary code - only load files from a trusted source.
encodings = torch.load(encodings_data_path)

In [9]:
# Sanity check of the masking rate: number of positions holding id 4 in
# input_ids divided by the number of positions not holding id 4 in labels.
# Tensor.sum() reduces on the tensor itself rather than iterating over rows
# with Python's builtin sum.
(encodings['input_ids'] == 4).sum() / (encodings['labels'] != 4).sum()

tensor(0.1490)

In [10]:
class Dataset(torch.utils.data.Dataset):
    '''
    Dataset view over a dict of row-indexable tensors.

    `encodings` maps names to tensors; it must contain 'input_ids', whose first
    dimension defines the dataset length. Item `i` is a dict with the same keys
    holding row `i` of each tensor.
    '''

    def __init__(self, encodings):
        # Kept by reference; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        row_count = self.encodings['input_ids'].shape[0]
        return row_count

    def __getitem__(self, i):
        sample = {}
        for key, tensor in self.encodings.items():
            sample[key] = tensor[i]
        return sample

In [11]:
d = Dataset(encodings)
#del batch


In [12]:
loader = torch.utils.data.DataLoader(d, batch_size=112, pin_memory=True, shuffle=True)
len(loader)

3531

In [13]:
config = BertConfig(vocab_size=30500,num_hidden_layers=12)
model = BertForMaskedLM(config)

In [14]:
# Primary device: the first GPU when CUDA is available, otherwise the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Wrap in DataParallel only when more than one GPU is visible.
# NOTE: re-running this cell wraps an already wrapped model again.
if torch.cuda.device_count() > 1:
    model = DataParallel(model)
# Move the parameters to `device` in every case, so a single-GPU (or CPU-only)
# run does not leave the model on a different device than the batches.
model = model.to(device)

In [15]:
model.device_ids

[0, 1, 2, 3, 4, 5, 6, 7]

In [16]:
!nvidia-smi


Sat Mar 26 04:11:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000001:00:00.0 Off |                  Off |
| N/A   40C    P0    67W / 300W |   1854MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000002:00:00.0 Off |                  Off |
| N/A   55C    P0    76W / 300W |      3MiB / 32510MiB |      0%      Default |
|       

In [17]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def run(epochs: int=50, checkpoint_dir: str='checkpoints/run_4GB_Mar25_1000pm'):
    '''
    Train the module-level `model` on the module-level `loader`.

    Uses the notebook globals `model`, `loader`, `optimizer` and `device`.
    For every batch the gradients are reset, the batch tensors are moved to
    `device`, the model is called with labels, and the summed loss is
    back-propagated before the optimizer step. The loss is printed about ten
    times per epoch and the model is saved with `save_pretrained` after every
    epoch.

    Args:
        epochs: Number of passes over `loader`.
        checkpoint_dir: Directory under which one
            `model-trained-{epoch}-{step}.pt` checkpoint is written per epoch.
    '''
    num_batches = len(loader)
    # Print interval in steps. max(1, ...) avoids a modulo-by-zero when the
    # loader has fewer than 10 batches.
    loss_check = max(1, floor(num_batches/10))
    # Global step counter; it keeps increasing across epochs.
    step = 0
    # A DataParallel wrapper exposes the wrapped model as `.module`; a bare
    # model has no such attribute and is used directly.
    base_model = getattr(model, 'module', model)

    model.train()

    for epoch in range(epochs):
        loop = tqdm(loader, leave=True)
        loop.set_description(f'Epoch {epoch}')
        for batch in loop:
            step += 1
            # clear gradients left over from the previous step
            optimizer.zero_grad()
            # move the tensors needed for this step to the training device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # .sum() reduces the loss to a scalar; under DataParallel the
            # output presumably holds one loss value per replica.
            loss = outputs.loss.sum()
            loss.backward()
            optimizer.step()

            if step % loss_check == 0:
                print(f'Loss: {loss.item()}')

        base_model.save_pretrained(f'{checkpoint_dir}/model-trained-{epoch}-{step}.pt')

In [18]:
run()

  0%|          | 0/3531 [00:00<?, ?it/s]



Loss: 9.073324203491211
Loss: 8.733743667602539
Loss: 8.598007202148438
Loss: 8.500348091125488
Loss: 8.259737968444824
Loss: 8.081645965576172
Loss: 7.992838382720947
Loss: 8.103448867797852
Loss: 7.586031913757324
Loss: 6.755363464355469


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 6.1926589012146
Loss: 5.519335746765137
Loss: 5.149563789367676
Loss: 4.832999229431152
Loss: 4.671858310699463
Loss: 4.3307294845581055
Loss: 4.0236992835998535
Loss: 3.9969537258148193
Loss: 3.7827701568603516
Loss: 3.540799617767334


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 3.42087459564209
Loss: 3.4510416984558105
Loss: 3.3610424995422363
Loss: 3.31130051612854
Loss: 3.358001947402954
Loss: 3.260727882385254
Loss: 3.1341209411621094
Loss: 3.2493226528167725
Loss: 3.076169013977051
Loss: 3.1103620529174805


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.8777055740356445
Loss: 2.9155995845794678
Loss: 3.0150604248046875
Loss: 2.802856206893921
Loss: 2.9506590366363525
Loss: 2.888399600982666
Loss: 2.8464808464050293
Loss: 2.8243842124938965
Loss: 2.799969434738159
Loss: 2.7896785736083984


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.674084186553955
Loss: 2.6321210861206055
Loss: 2.7832517623901367
Loss: 2.648716449737549
Loss: 2.5958380699157715
Loss: 2.5578980445861816
Loss: 2.612293243408203
Loss: 2.604933977127075
Loss: 2.6298789978027344
Loss: 2.562743663787842


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.4964590072631836
Loss: 2.519127368927002
Loss: 2.463134765625
Loss: 2.3647708892822266
Loss: 2.466353178024292
Loss: 2.454939365386963
Loss: 2.4895360469818115
Loss: 2.3631343841552734
Loss: 2.383009910583496
Loss: 2.4516940116882324


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.341940402984619
Loss: 2.398685932159424
Loss: 2.374462366104126
Loss: 2.2744500637054443
Loss: 2.340658664703369
Loss: 2.392343044281006
Loss: 2.341154098510742
Loss: 2.3502023220062256
Loss: 2.2401556968688965
Loss: 2.4101977348327637


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.232128620147705
Loss: 2.2581076622009277
Loss: 2.121626138687134
Loss: 2.3205044269561768
Loss: 2.25950288772583
Loss: 2.2426810264587402
Loss: 2.282215118408203
Loss: 2.2590408325195312
Loss: 2.227525234222412
Loss: 2.2388219833374023


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.166822910308838
Loss: 2.127472400665283
Loss: 2.0675835609436035
Loss: 2.073528289794922
Loss: 2.1895177364349365
Loss: 2.13639497756958
Loss: 2.1755905151367188
Loss: 2.10386061668396
Loss: 2.1725924015045166
Loss: 2.134495735168457


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 2.0913610458374023
Loss: 1.9862961769104004
Loss: 2.0380358695983887
Loss: 2.0738883018493652
Loss: 2.0535573959350586
Loss: 2.035210609436035
Loss: 2.0088183879852295
Loss: 2.1173152923583984
Loss: 2.055591106414795
Loss: 2.090090036392212


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.9205766916275024
Loss: 1.8831324577331543
Loss: 2.0016884803771973
Loss: 1.9727044105529785
Loss: 1.9709312915802002
Loss: 1.9382593631744385
Loss: 1.9573805332183838
Loss: 1.945777177810669
Loss: 2.1248462200164795
Loss: 2.0124595165252686


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.8893709182739258
Loss: 1.8925068378448486
Loss: 1.9285356998443604
Loss: 1.9657697677612305
Loss: 1.8860622644424438
Loss: 1.890089750289917
Loss: 1.8555983304977417
Loss: 1.9392744302749634
Loss: 2.013216733932495
Loss: 1.9524872303009033


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.7949687242507935
Loss: 1.8288542032241821
Loss: 1.914140224456787
Loss: 1.9128665924072266
Loss: 1.8424830436706543
Loss: 1.855072021484375
Loss: 2.0316755771636963
Loss: 1.8875417709350586
Loss: 1.8139156103134155
Loss: 1.8863837718963623


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.778273582458496
Loss: 1.7716835737228394
Loss: 1.8085905313491821
Loss: 1.8155686855316162
Loss: 1.833611011505127
Loss: 1.8564906120300293
Loss: 1.8313688039779663
Loss: 1.8647329807281494
Loss: 1.8676187992095947
Loss: 1.875718355178833


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.6393511295318604
Loss: 1.6845227479934692
Loss: 1.7729411125183105
Loss: 1.7545100450515747
Loss: 1.842043399810791
Loss: 1.7828893661499023
Loss: 1.807100534439087
Loss: 1.7937674522399902
Loss: 1.8165805339813232
Loss: 1.788452386856079


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.611527919769287
Loss: 1.6836025714874268
Loss: 1.6616649627685547
Loss: 1.6664878129959106
Loss: 1.678371787071228
Loss: 1.6317059993743896
Loss: 1.7852702140808105
Loss: 1.7612553834915161
Loss: 1.7241921424865723
Loss: 1.7680954933166504


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.6179929971694946
Loss: 1.6242566108703613
Loss: 1.7293473482131958
Loss: 1.5948189496994019
Loss: 1.6286014318466187
Loss: 1.7457890510559082
Loss: 1.6680102348327637
Loss: 1.8103713989257812
Loss: 1.7303638458251953
Loss: 1.6913769245147705


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.571327567100525
Loss: 1.6406667232513428
Loss: 1.5922194719314575
Loss: 1.6698628664016724
Loss: 1.6480906009674072
Loss: 1.757521629333496
Loss: 1.6177732944488525
Loss: 1.6686928272247314
Loss: 1.6199958324432373
Loss: 1.712738275527954


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.5014357566833496
Loss: 1.588231086730957
Loss: 1.6354713439941406
Loss: 1.6255316734313965
Loss: 1.5052427053451538
Loss: 1.6267383098602295
Loss: 1.599442481994629
Loss: 1.6242671012878418
Loss: 1.584991216659546
Loss: 1.640555739402771


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.5263729095458984
Loss: 1.5712683200836182
Loss: 1.5655875205993652
Loss: 1.5478483438491821
Loss: 1.5355360507965088
Loss: 1.5883703231811523
Loss: 1.6170179843902588
Loss: 1.5031996965408325
Loss: 1.5571300983428955
Loss: 1.6782453060150146


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.4451026916503906
Loss: 1.5222151279449463
Loss: 1.4926536083221436
Loss: 1.4744653701782227
Loss: 1.5241048336029053
Loss: 1.5071446895599365
Loss: 1.5825474262237549
Loss: 1.555250644683838
Loss: 1.5118706226348877
Loss: 1.5728669166564941


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.4022725820541382
Loss: 1.3848152160644531
Loss: 1.5074362754821777
Loss: 1.4717872142791748
Loss: 1.4626399278640747
Loss: 1.5455236434936523
Loss: 1.4378373622894287
Loss: 1.491405963897705
Loss: 1.539149522781372
Loss: 1.614062786102295


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.3933844566345215
Loss: 1.3798444271087646
Loss: 1.4734952449798584
Loss: 1.5280817747116089
Loss: 1.5072996616363525
Loss: 1.5143115520477295
Loss: 1.4600484371185303
Loss: 1.602672815322876
Loss: 1.457524061203003
Loss: 1.5011769533157349


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.3895177841186523
Loss: 1.3385181427001953
Loss: 1.4840412139892578
Loss: 1.4749860763549805
Loss: 1.5414146184921265
Loss: 1.4399067163467407
Loss: 1.486703872680664
Loss: 1.5375373363494873
Loss: 1.556780219078064
Loss: 1.5649681091308594


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.3717694282531738
Loss: 1.3807822465896606
Loss: 1.4413783550262451
Loss: 1.48172926902771
Loss: 1.478743076324463
Loss: 1.440948724746704
Loss: 1.4966752529144287
Loss: 1.490187406539917
Loss: 1.5089738368988037
Loss: 1.5168275833129883


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.288806676864624
Loss: 1.3833585977554321
Loss: 1.4018089771270752
Loss: 1.4117186069488525
Loss: 1.3656630516052246
Loss: 1.3667019605636597
Loss: 1.4583852291107178
Loss: 1.4071767330169678
Loss: 1.4655795097351074
Loss: 1.431588888168335


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.2984907627105713
Loss: 1.3579798936843872
Loss: 1.3793426752090454
Loss: 1.4298369884490967
Loss: 1.3553822040557861
Loss: 1.3851230144500732
Loss: 1.5256638526916504
Loss: 1.4466924667358398
Loss: 1.3731142282485962
Loss: 1.4804524183273315


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.3200339078903198
Loss: 1.3258605003356934
Loss: 1.3825438022613525
Loss: 1.3245118856430054
Loss: 1.4039332866668701
Loss: 1.4240598678588867
Loss: 1.4047459363937378
Loss: 1.4422909021377563
Loss: 1.4619202613830566
Loss: 1.4378271102905273


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.198969841003418
Loss: 1.3331431150436401
Loss: 1.3001501560211182
Loss: 1.3124654293060303
Loss: 1.346968412399292
Loss: 1.3871517181396484
Loss: 1.3586418628692627
Loss: 1.349528193473816
Loss: 1.477169394493103
Loss: 1.4011399745941162


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.2586429119110107
Loss: 1.2579805850982666
Loss: 1.3425045013427734
Loss: 1.2826893329620361
Loss: 1.3386163711547852
Loss: 1.382401943206787
Loss: 1.3356181383132935
Loss: 1.37186598777771
Loss: 1.3913521766662598
Loss: 1.4191524982452393


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.1996655464172363
Loss: 1.294020652770996
Loss: 1.2561750411987305
Loss: 1.2840348482131958
Loss: 1.395337462425232
Loss: 1.2934131622314453
Loss: 1.3360191583633423
Loss: 1.393754482269287
Loss: 1.3611252307891846
Loss: 1.4134647846221924


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.16933274269104
Loss: 1.245877981185913
Loss: 1.287936806678772
Loss: 1.336641788482666
Loss: 1.3204429149627686
Loss: 1.3487217426300049
Loss: 1.3597822189331055
Loss: 1.3673274517059326
Loss: 1.340757966041565
Loss: 1.3570289611816406


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.2214900255203247
Loss: 1.2688748836517334
Loss: 1.324378252029419
Loss: 1.2499945163726807
Loss: 1.3282818794250488
Loss: 1.2935651540756226
Loss: 1.3365156650543213
Loss: 1.276237964630127
Loss: 1.3232009410858154
Loss: 1.3237115144729614


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.1893830299377441
Loss: 1.2278039455413818
Loss: 1.214604139328003
Loss: 1.2282532453536987
Loss: 1.2963335514068604
Loss: 1.3355989456176758
Loss: 1.2786861658096313
Loss: 1.2960171699523926
Loss: 1.3697643280029297
Loss: 1.4528422355651855


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.1777124404907227
Loss: 1.2257864475250244
Loss: 1.2208645343780518
Loss: 1.2261106967926025
Loss: 1.1927168369293213
Loss: 1.236515760421753
Loss: 1.3775222301483154
Loss: 1.2557975053787231
Loss: 1.3470258712768555
Loss: 1.2840032577514648


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.1584417819976807
Loss: 1.1697134971618652
Loss: 1.216402530670166
Loss: 1.2556936740875244
Loss: 1.2777111530303955
Loss: 1.2847025394439697
Loss: 1.3144195079803467
Loss: 1.2188503742218018
Loss: 1.2808573246002197
Loss: 1.3221169710159302


  0%|          | 0/3531 [00:00<?, ?it/s]

Loss: 1.1405360698699951
Loss: 1.131739854812622
Loss: 1.1913433074951172
Loss: 1.1170084476470947
Loss: 1.2482719421386719
Loss: 1.2935435771942139
Loss: 1.2357840538024902
Loss: 1.2813379764556885
Loss: 1.1976304054260254
Loss: 1.2979164123535156


  0%|          | 0/3531 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
!nvidia-smi

Failed to initialize NVML: Driver/library version mismatch


In [20]:
torch.cuda.empty_cache()

In [21]:
!nvidia-smi

Failed to initialize NVML: Driver/library version mismatch


In [94]:
mask = alternative_tokenizer.mask_token

In [106]:
def _print_fill_mask_results(title: str, predictions) -> None:
    '''Print a titled, star-underlined list of fill-mask predictions (sequence and score).'''
    print()
    print(title)
    print("*" * 150)
    for result in predictions:
        print(result['sequence'], result['score'])


def show_results(text: str,
                 trained_model_path: str='checkpoints/test_save_pretrained/model-trained-14000.pt/',
                 baseline_name: str='bert-base-uncased'):
    '''
    Compare fill-mask predictions of a baseline model and a locally trained one.

    Both models are loaded on every call. The baseline uses its own tokenizer
    loaded from `baseline_name`; the trained model uses the module-level
    `alternative_tokenizer`.

    Args:
        text: Input passed to both fill-mask pipelines.
            NOTE(review): presumably must contain exactly one mask token -
            confirm against the pipeline documentation.
        trained_model_path: Directory the trained BertForMaskedLM is loaded from.
        baseline_name: Name or path of the baseline model and tokenizer.
    '''
    baseline_model = BertForMaskedLM.from_pretrained(baseline_name)
    baseline_tokenizer = BertTokenizer.from_pretrained(baseline_name)
    untrained_pipe = pipeline('fill-mask', model=baseline_model, tokenizer=baseline_tokenizer)
    # Printed under the heading "Untrained", although the weights come from
    # `baseline_name` via from_pretrained.
    _print_fill_mask_results("Untrained Results", untrained_pipe(text))

    lm = BertForMaskedLM.from_pretrained(trained_model_path)
    trained_pipe = pipeline('fill-mask', model=lm, tokenizer=alternative_tokenizer)
    _print_fill_mask_results("Trained Results", trained_pipe(text))
    

In [107]:
show_results(f'{mask} is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves.')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]


Untrained Results
******************************************************************************************************************************************************
it is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.7136348485946655
insulin is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.014016539789736271
amp is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.01275827456265688
cox is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.012082782573997974
notch is a cholinergic enzyme primarily found at postsynaptic neuromuscular junctions, especially in muscles and nerves. 0.01053623203188181

Trained Results
****************************************************************************************************************

In [31]:
with open('../Data/subsets/xaasplit_25K') as f:
    data = [line.strip() for line in f.readlines()]

In [9]:
test = tokenizer.encode_batch(['Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources;', 'This is another sentence.'])

In [10]:
input_ids = torch.tensor([x.ids for x in test])
mask = torch.tensor([x.attention_mask for x in test])


In [11]:
outputs = new_model(input_ids, mask)

In [15]:
outputs['logits'].shape

torch.Size([2, 25, 30500])

In [96]:
bbut = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
new_model.save_pretrained('./')

In [18]:
new_model.get_output_embeddings()

Linear(in_features=768, out_features=30500, bias=True)

In [19]:
new_model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30500, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [20]:
test = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
test

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [31]:
from transformers import BertModel, BertForMaskedLM, BertForPreTraining

In [29]:
bert = BertModel.from_pretrained('bert-base-uncased')
mlm = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
from deepspeed_pretrain_bert import masking_function

In [9]:
masking_function(text="""
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
""", 
                 tokenizer=alternative_tokenizer, 
                 mask_prob=0.15, 
                 random_replace_prob=0.1, 
                 unmask_replace_prob=0.1,
                 max_length=512)

([None,
  2420,
  8747,
  1641,
  1634,
  2065,
  14042,
  1737,
  22511,
  17,
  4,
  17,
  4,
  4,
  1696,
  1851,
  1897,
  2290,
  3548,
  3949,
  4,
  1831,
  4,
  1633,
  27604,
  30,
  37,
  4,
  1811,
  4,
  18,
  6791,
  41,
  3259,
  18,
  2976,
  11,
  4,
  11,
  1811,
  4,
  18,
  26897,
  41,
  3259,
  18,
  5150,
  11,
  39,
  17,
  1784,
  1687,
  4178,
  2415,
  5993,
  1740,
  3548,
  3949,
  4,
  1831,
  3253,
  1633,
  27604,
  1746,
  1634,
  14042,
  1641,
  43,
  2065,
  6751,
  1702,
  4,
  3792,
  1726,
  1677,
  3609,
  8452,
  12,
  47,
  18,
  49,
  18,
  3548,
  3949,
  43,
  22511,
  1769,
  19560,
  6477,
  4,
  1654,
  16904,
  2728,
  2065,
  1746,
  43,
  22511,
  1769,
  3241,
  16302,
  3965,
  2065,
  13,
  18,
  17,
  1784,
  1687,
  1851,
  4178,
  4,
  5993,
  1740,
  3548,
  4,
  22511,
  1831,
  3253,
  1633,
  27604,
  4,
  1634,
  14042,
  1641,
  43,
  2065,
  4,
  5993,
  6547,
  1656,
  1710,
  14045,
  6794,
  12,
  3762,
  3949,
  43,
  2

In [35]:
def get_unique_identifier(length: int = 8) -> str:
    """Return `length` characters drawn at random from lowercase ASCII letters and digits.

    Characters are drawn with replacement using NumPy's global random state,
    so the result is random rather than guaranteed unique, and it is
    reproducible after `np.random.seed`.

    Args:
        length: Number of characters to generate; 0 yields an empty string.

    Returns:
        The generated identifier.
    """
    # Imported here because the notebook's import cell does not import `string`.
    import string

    alphabet = string.ascii_lowercase + string.digits
    chosen_positions = np.random.choice(len(alphabet), length)
    return "".join(alphabet[ix] for ix in chosen_positions)

def create_experiment_dir(
        checkpoint_dir: pathlib.Path, all_arguments: Dict
) -> pathlib.Path:
    """Create a fresh experiment directory and save the run arguments in it.

    The directory name is
    `bert_pretrain.<year>.<month>.<day>.<hour>.<minute>.<second>.<random id>`,
    using the current US/Pacific time and `get_unique_identifier()`.
    Inside it, `hparams.json` receives `all_arguments` and an empty `tb_dir`
    sub-directory is created.

    Args:
        checkpoint_dir: Parent directory; created if it does not exist yet.
        all_arguments: JSON-serialisable mapping written to `hparams.json`.

    Returns:
        Path of the newly created experiment directory.

    Raises:
        FileExistsError: If a directory with the generated name already exists.
    """
    current_time = datetime.datetime.now(pytz.timezone("US/Pacific"))
    time_fields = (
        current_time.year, current_time.month, current_time.day,
        current_time.hour, current_time.minute, current_time.second,
    )
    timestamp = ".".join(str(field) for field in time_fields)
    expname = f"bert_pretrain.{timestamp}.{get_unique_identifier()}"
    exp_dir = checkpoint_dir / expname
    # parents=True lets a first run create checkpoint_dir itself;
    # exist_ok=False still refuses to reuse an existing experiment directory.
    exp_dir.mkdir(parents=True, exist_ok=False)
    hparams_file = exp_dir / "hparams.json"
    with hparams_file.open("w") as handle:
        json.dump(obj=all_arguments, fp=handle, indent=2)

    # Create the Tensorboard Dir
    tb_dir = exp_dir / "tb_dir"
    tb_dir.mkdir()
    return exp_dir

In [14]:
hparams = {"train_file": '',
        "validation_file": "",
        "mask_prob": 0.15,
        "epoch": 10,
        "batch_size":32,
        "checkpoint_every":1000,
        "learning_rate":1e-5,
        "weight_decay":0.001,
        "gradient_accumulation_steps":1,
        "lr_scheduler_type":'linear',
        "num_warmup_steps":1000,
        "seed":42}

In [21]:
# Serialise the hyper-parameter dict to hparams.json in the current working directory.
with open('hparams.json', 'w') as hparams_handle:
    json.dump(hparams, hparams_handle)

In [None]:
# model.train()

def save_model(path: str='./', multiple_gpu: bool=True):
    '''
    Save a training checkpoint to `<path>/model_<step>.pt` with torch.save.

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss'.

    Reads the names `model`, `optimizer`, `epoch`, `loss` and `step` from the
    enclosing (module) scope; they are not parameters.
    NOTE(review): in the visible notebook `epoch`, `loss` and `step` are only
    assigned inside `run()` - confirm they exist at module level before calling.

    Args:
        path: Directory the checkpoint file is written to.
        multiple_gpu: When True the weights are taken from `model.module`
            (the model inside a parallel wrapper); otherwise from `model`.
    '''
    # Pick the object whose state_dict is saved.
    network = model.module if multiple_gpu else model
    checkpoint = {'epoch': epoch,
                  'model_state_dict': network.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': loss}
    # os.path.join inserts the separator when `path` has no trailing slash.
    torch.save(checkpoint, os.path.join(path, f'model_{step}.pt'))

In [22]:
36*72

2592

In [23]:
_/60

43.2