- https://www.youtube.com/watch?v=q9NS5WpfkrU
- https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

In [3]:
# Load the two pretrained pieces we need from the same checkpoint:
# the tokenizer and the masked-language model.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint)

# Sample passage in which two words have already been replaced by [MASK].
text = (
    "After Abraham Lincoln won the November 1860 presidential [MASK] on an "
    "anti-slavery platform, an initial seven slave states declared their "
    "secession from the country to form the Confederacy. War broke out in "
    "April 1861 when secessionist forces [MASK] Fort Sumter in South "
    "Carolina, just over a month after Lincoln's inauguration. "
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Tokenize the text; return_tensors='pt' asks for PyTorch tensors.
inputs = tokenizer(text, return_tensors='pt')
# Show which fields the tokenizer returned.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [6]:
# Display the token ids of the encoded text; id 103 appears wherever the text contains [MASK].
inputs.input_ids

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,   103,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,   103,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

101 = [CLS], the classification token placed at the start of the sequence
102 = [SEP], the separator token placed at the end of the sequence
103 = [MASK], the mask token

Here, however, the words were already masked by hand in the text. So now we run the test again starting from the unmasked text and choose the tokens to mask at random in code.

In [7]:

# Same passage, this time with the real words ("election", "attacked") in place of [MASK].
text = ("After Abraham Lincoln won the November 1860 presidential election on an "
        "anti-slavery platform, an initial seven slave states declared their "
        "secession from the country to form the Confederacy. War broke out in "
        "April 1861 when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's inauguration. ")
# Tokenize the unmasked text.
inputs = tokenizer(text, return_tensors='pt')
inputs.keys()
# Display the token ids; the two positions that were 103 ([MASK]) before now hold real word ids.
inputs.input_ids

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

In [8]:
# Create the target labels: an independent copy of the (still unmasked) token ids,
# so that overwriting input_ids with mask tokens later does not change the labels.
inputs['labels'] = inputs.input_ids.detach().clone()
# NOTE(review): every position keeps a real token id as its label, so the loss is
# presumably computed over all tokens rather than only the masked ones. Setting the
# labels of unmasked positions to -100 would restrict it — confirm against the
# BertForMaskedLM documentation.
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([

In [10]:
# To mask a random subset of the token ids, first draw one random number per position
# (same shape as input_ids).
rand = torch.rand(inputs.input_ids.shape)
rand.shape
rand
# Every value is a float between 0 and 1.

tensor([[0.0640, 0.9273, 0.7990, 0.2019, 0.4416, 0.7705, 0.3014, 0.7743, 0.1864,
         0.7891, 0.8058, 0.4950, 0.4786, 0.9443, 0.6866, 0.5617, 0.3762, 0.3233,
         0.9271, 0.2694, 0.5437, 0.3370, 0.1466, 0.2732, 0.3592, 0.7236, 0.7103,
         0.1276, 0.3324, 0.7691, 0.5724, 0.7222, 0.7576, 0.7438, 0.2254, 0.9400,
         0.9525, 0.4063, 0.8413, 0.5576, 0.5895, 0.3944, 0.8323, 0.9092, 0.3823,
         0.0206, 0.3786, 0.4398, 0.0536, 0.3325, 0.1558, 0.0403, 0.2798, 0.1299,
         0.0833, 0.6052, 0.3130, 0.9727, 0.8925, 0.2651, 0.7212, 0.0032]])

In [12]:
# Select roughly 15% of the positions: every draw below 0.15 becomes True.
mask_arr = rand < 0.15
mask_arr
# The True positions are the tokens we want to mask later on.

tensor([[ True, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False, False, False, False,  True, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False,  True, False, False,  True, False,
         False,  True, False,  True,  True, False, False, False, False, False,
         False,  True]])

In [14]:
# Never mask the special tokens that frame the sequence: [CLS] at the start and
# [SEP] at the end. Their ids are read from the tokenizer (101 and 102 for
# bert-base-uncased) instead of being hard-coded, so the cell keeps working with
# a different vocabulary. `&` is the element-wise logical AND for boolean tensors.
(inputs.input_ids != tokenizer.cls_token_id) & (inputs.input_ids != tokenizer.sep_token_id)
# The first and last positions are now False, so they can never be selected for masking.

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True, False]])

In [17]:
# Build the final mask in a single step: a position is selected only if its random
# draw is below 0.15 AND it is neither [CLS] nor [SEP]. The special-token ids come
# from the tokenizer (101 and 102 for bert-base-uncased) rather than magic numbers.
mask_arr = (
    (rand < 0.15)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
)
mask_arr

tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False, False, False, False,  True, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False,  True, False, False,  True, False,
         False,  True, False,  True,  True, False, False, False, False, False,
         False, False]])

In [18]:
# Get the index positions of all True values in the first (and only) sequence.
mask_arr[0].nonzero().tolist()

[[22], [27], [45], [48], [51], [53], [54]]

In [21]:
# nonzero() returns one single-element row per hit, i.e. a list nested inside
# another list; flatten it to get a plain list of positions.
selection = mask_arr[0].nonzero().flatten().tolist()
selection

[22, 27, 45, 48, 51, 53, 54]

In [25]:
# Use the selected positions to overwrite those token ids in the first (and only)
# sequence with the [MASK] token id. The id is read from the tokenizer (103 for
# bert-base-uncased) instead of being hard-coded.
inputs.input_ids[0, selection] = tokenizer.mask_token_id
inputs.input_ids

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,   103,  2037, 22965,  2013,  1996,   103,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,   103,  3334,  1999,   103,  3792,
          1010,   103,  2058,   103,   103,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

In [27]:
# Pass everything (input_ids, token_type_ids, attention_mask, labels) to the model;
# the model will calculate the loss and the logits, as we saw before.
outputs = model(**inputs)
outputs.keys()
# Display the loss computed against inputs['labels'].
outputs.loss

tensor(0.9852, grad_fn=<NllLossBackward0>)