# Train LLM with MLM

In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import numpy as np
import time

In [2]:
from torch.cuda.amp import GradScaler #, autocast
from torch import autocast

scaler = GradScaler()

In [3]:
# from transformers import AdamW
from torch.optim import AdamW
from tqdm import tqdm

In [4]:
from llm_funcs import dataset_obj

In [5]:
batch_size=3*8 #24
epochs = 20

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Using device:', device)

Using device: cuda


In [7]:
# device = (torch.device('cuda') if torch.cuda.is_available() 
#           else torch.device('cpu'))
# device

In [8]:
# # download
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model     = BertForMaskedLM.from_pretrained('bert-base-uncased')

# load locally pretrained
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased_local')
# model   = BertForMaskedLM.from_pretrained('bert-base-uncased_local')

#load init model (Not trained)
model     = BertForMaskedLM.from_pretrained('bert-base-init_local')

In [9]:
with open('mlm_text.text', 'r') as fp:
    text = fp.read().split('\n')

In [10]:
text[:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

In [11]:
inputs = tokenizer(text, return_tensors='pt', max_length=512,
                  truncation=True, padding='max_length')

In [12]:
inputs

{'input_ids': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0]])}

In [13]:
inputs.input_ids[0]

tensor([  101,  2013,  2026,  5615,  2310,  7946,  1045,  4342,  2204, 25288,
         1998,  1996,  2231,  1997,  2026, 12178,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [14]:
inputs.token_type_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
inputs.attention_mask[0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
type(inputs.input_ids[0]), type(inputs.input_ids), 

(torch.Tensor, torch.Tensor)

In [17]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs

{'input_ids': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0]]), 'labels': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013, 

In [18]:
# Special tokens
PAD  = 0
CLS  = 101
SEP  = 102
MASK = 103

In [19]:
rand = torch.rand(inputs.input_ids.shape)
rand.shape

torch.Size([508, 512])

In [20]:
rand = torch.rand(inputs.input_ids.shape)

# select 15%, remove special tokens from mask
mask_arr = ((rand < 0.15)*
            (inputs.input_ids != CLS)*
            (inputs.input_ids != SEP)*
            (inputs.input_ids != PAD))

mask_arr.shape

torch.Size([508, 512])

In [21]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [22]:
# index position of true values to be masked --> selection
selection = []

for i in np.arange(mask_arr.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())

In [23]:
selection[:5]

[[6, 8, 9],
 [9],
 [2, 3, 8, 9, 19, 23, 26, 33, 34, 38, 39, 40, 45],
 [3, 19, 36],
 [16, 21, 22, 29, 36, 37, 44, 52, 53, 56, 68, 87]]

In [24]:
# replace mask token with selection 
for i in np.arange(mask_arr.shape[0]):
    inputs.input_ids[i, selection[i]] = MASK

In [25]:
inputs.input_ids

tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,   103,  ...,     0,     0,     0],
        ...,
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,   103,  3288,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]])

In [26]:
# convert dataset to pytorch data object
dataset = dataset_obj(inputs)

In [27]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, # 16
                                        shuffle=True)

### training

In [28]:
model.to(device);
model.train();

In [29]:
optimizer = AdamW(model.parameters(), lr=1e-4); # 1e-5

In [30]:
scaler    = GradScaler()
clip_grad = 0.1

In [31]:
st = time.time()
model.zero_grad()
optimizer.zero_grad()

for epoch in np.arange(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        model.zero_grad()
   
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Enables autocasting for the forward pass (model + loss)
        with autocast(device_type=device, enabled=True, dtype=torch.float16):
            outputs = model(input_ids, 
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        
        # clip gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_grad)
        scaler.step(optimizer)
        scaler.update()
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|████████████████████████████████████████████████████| 22/22 [00:07<00:00,  3.09it/s, loss=3.27]
Epoch 1: 100%|███████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.28it/s, loss=0.782]
Epoch 2: 100%|███████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.27it/s, loss=0.726]
Epoch 3: 100%|████████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.26it/s, loss=1.14]
Epoch 4: 100%|████████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.26it/s, loss=1.07]
Epoch 5: 100%|███████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.26it/s, loss=0.963]
Epoch 6: 100%|█████████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.25it/s, loss=1.6]
Epoch 7: 100%|███████████████████████████████████████████████████| 22/22 [00:06<00:00,  3.25it/s, loss=0.552]
Epoch 8: 100%|████████████████████████████

In [32]:
ed = time.time()
print(f'time: {np.round((ed-st),2)} sec')
# 19.7G GPU, 136.18 sec

time: 136.18 sec
