- https://www.youtube.com/watch?v=R6hcxMMOrPE
- https://github.com/jamescalam/transformers/blob/main/course/training/03_mlm_training.ipynb

In [2]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# inizialize two models the tokenizer and model:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
#we take a text and split every sentence
with open('./clean.txt') as fp:
    text = fp.read().split('\n')
text[:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

In [5]:
#tokenizer
inputs = tokenizer(text, return_tensors='pt', max_length = 512, truncation = True, padding = 'max_length')
inputs

{'input_ids': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [6]:
#we need 2 thing to train bert: input_ids and labels
#before we mask our input_ids we need to create a copy to use it as our labels 

inputs['labels'] = inputs.input_ids.detach().clone()
input
#so now we have a copy of the labels in 'labels': tensor

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x105821730>>

In [7]:
#now we have to create our mask
#each token that is not a special token has 15% to be our mask
rand = torch.rand(inputs.input_ids.shape)
rand.shape
#where 507 is the number of sequence that we have and 512 is the number of token each sequence has 

torch.Size([507, 512])

In [8]:
rand
#are floats between 0 and 1 

tensor([[0.0797, 0.6405, 0.5635,  ..., 0.2143, 0.8775, 0.4861],
        [0.8883, 0.7885, 0.5845,  ..., 0.1055, 0.1883, 0.9770],
        [0.7949, 0.9166, 0.7290,  ..., 0.5169, 0.5044, 0.4071],
        ...,
        [0.2696, 0.1537, 0.9163,  ..., 0.8324, 0.5453, 0.0084],
        [0.8650, 0.1827, 0.2898,  ..., 0.6017, 0.2992, 0.3094],
        [0.8651, 0.0256, 0.1906,  ..., 0.1943, 0.4622, 0.6535]])

In [9]:
#we want to mask everything that has the value under 0.15 and is not a padding token or classifier token or sequence token
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_arr
#now we have the mask array

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False]])

In [10]:
#now we want to take the indicies of each value that has true label for each sequence
selection = []
for i in range(mask_arr.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist()
)

selection[:5]

[[9, 11],
 [3, 5, 6, 7, 13],
 [16, 19, 30, 39, 45, 46],
 [3, 10, 12, 21, 26, 33],
 [1, 8, 9, 11, 38, 41, 51, 52, 67, 73, 74, 86, 89]]

In [11]:
#now we want to set the values equal to 103 for each indices we have in each sequence 
for i in range(mask_arr.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

inputs.input_ids

tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,   103,  3288,  ...,     0,     0,     0]])

In [12]:
#we need to process them into a dateloader during training, but first we need to convert them into pytorch objects 
class MeditationsDataset(torch.utils.data.Dataset): #pass the class
    def __init__(self, encodings): #inizialization function
        self.encodings = encodings #assign encodins to attributes 
    #the dataloader expects 2 additional functions for methods
    #the get tems that you can get a dictionary formatted batch of those items
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    #check the lenght of the dataset that it's looking at
    def __len__(self):
        return len(self.encodings.input_ids)

#so this is our class that handle the formatting our data in data objects

In [13]:
#create a new data variable 
dataset = MeditationsDataset(inputs)

In [14]:
#inizialize the dataloader
dataloader = torch.utils.data.DataLoader(dataset, batch_size = 16, shuffle = True)

In [15]:
#now we are ready to actually training 
#first we need to set the training paramters 

#we check if we have a gpu
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [16]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [17]:
#we have to activate our model in training mode
model.train()

#we need to inizialize an optimizer - in this case AdamW
from transformers import AdamW
optim = AdamW(model.parameters(), lr=1e-5)



In [19]:
from tqdm import tqdm  # for our progress bar #allows to create a progress bar during training otherwise we just sat down we don't see any updates

epochs = 2 #we don't want to trian that much

for epoch in range(epochs): #set the tain loop
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad() #first we want to calculate our gradient
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward() #calculate the loss for every parameter in our model
        # update parameters
        optim.step() #we want to optimize every parameter based on our loss 
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 32/32 [10:03<00:00, 18.87s/it, loss=4.48]
Epoch 1: 100%|██████████| 32/32 [10:03<00:00, 18.87s/it, loss=1.45]
