In [1]:
from transformers import BertTokenizer , BertForMaskedLM
import torch
import warnings
warnings.filterwarnings('ignore')

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
#open book and read the text data
with open('clean.txt','r') as f:
    text = f.read().split('\n')

In [4]:
len(text)

507

In [5]:
text[:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

In [6]:
#tokenizing text with parameters suitable for the used bert version 
# - return_tensors="pt" -->returns tokenized text as pytorch tensors
# - max_length = 512 --> sets the sequence lenght as 512 as the 'bert-base-uncased' model handle inputs up to 512 tokens in length.
# - padding = 'max_length'-->If the sequence is shorter than the required maximum sequence length, padding is added (commonly using a special [PAD] token) to extend the sequence to the required length.
# - truncation = True --> If the sequence is longer than the maximum allowed length, it is truncated to the maximum length to ensure it fits the model's input size requirement.
inputs = tokenizer(text , return_tensors = "pt" , max_length = 512 , padding = 'max_length' ,truncation = True)

In [7]:
inputs

{'input_ids': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [8]:
#cloning the input ids in order to apply masking without changing the original data 
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [9]:
'''create random list of floats between [0,1] , (uniform distribution) with same size of input_ids ,
in order to define later the selected 15% tokens to mask'''
rand = torch.rand(inputs.input_ids.shape)
rand

tensor([[0.1394, 0.4252, 0.5837,  ..., 0.4468, 0.0633, 0.8044],
        [0.3702, 0.2954, 0.4539,  ..., 0.0396, 0.9964, 0.6642],
        [0.0957, 0.1077, 0.7318,  ..., 0.7677, 0.8156, 0.7363],
        ...,
        [0.0503, 0.0248, 0.1981,  ..., 0.5050, 0.8080, 0.2994],
        [0.9397, 0.9986, 0.4246,  ..., 0.7890, 0.1979, 0.3807],
        [0.0980, 0.6100, 0.3070,  ..., 0.4779, 0.1429, 0.8417]])

In [10]:
#masking tokens with 15% probability , excepting CLS --> 101 , SEP --> 102 AND PAD -->0
masks = (rand < .15) & (inputs.input_ids != 101)& (inputs.input_ids != 102)& (inputs.input_ids != 0)
masks

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

- each True value represents a token to be masked , now , we get the indices of each True value within each vector

In [11]:
inputs.input_ids.shape

torch.Size([507, 512])

In [12]:
#selecting indices
indices = []
for i in range(inputs.input_ids.shape[0]):
    indices.append(
    torch.flatten(masks[i].nonzero()).tolist()
    )

In [13]:
indices[:5]

[[15],
 [8, 15],
 [1, 8, 14, 26, 29, 30, 32, 34],
 [6, 9, 10, 12, 13, 18, 31],
 [1, 12, 17, 33, 35, 51, 53, 60, 64, 72]]

In [14]:
#replace the input ids of the previous indices with mask id 103
for i in range (inputs.input_ids.shape[0]):
    inputs.input_ids[i, indices[i]] = 103
    
inputs.input_ids

tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,   103,  2026,  ...,     0,     0,     0],
        ...,
        [  101,   103,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]])

In [15]:
#converting the inputs into torch dataset for better processing 
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)
    
#initializing the data 
data = MeditationsDataset(inputs)
#intializing data loader which is used to load our data into the model 
data_loader = torch.utils.data.DataLoader(data , batch_size=10 , shuffle = True)

In [16]:
#setting up resources 
#checking wether or not we have accessibility to GPU , if not use CPU 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [17]:
#make the model work on the available device 
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [18]:
from transformers import AdamW
#activate the training mode for model
model.train()
#setup optimizer
#lr=5e-5: This specifies the learning rate of the optimizer, set to 0.00005 (5e-5). 
optimizer = AdamW(model.parameters(), lr =5e-5 )

In [19]:
#import tqdm for creating progress bars
from tqdm import tqdm

#set the number of training epochs
epochs = 2

#loop over the number of epochs
for epoch in range(epochs):
    #set up the progress bar for tracking the training loop
    loop = tqdm(data_loader, leave=True)
    #iterate over each batch of data
    for batch in loop:
        #clear any previously calculated gradients before performing a backward pass,to avoid accumulating gradients
        optimizer.zero_grad()
        #load 'input_ids' from the batch to the specified device (GPU/CPU)
        input_ids = batch['input_ids'].to(device)
        #load 'attention_mask' from the batch to the specified device
        attention_mask = batch['attention_mask'].to(device)
        #load 'labels' from the batch to the specified device
        labels = batch['labels'].to(device)
        #forward pass: Compute predicted outputs by passing inputs to the model
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        #retrieve the loss from model outputs
        loss = outputs.loss
        #perform backpropagation: compute gradient of the loss with respect to model parameters
        loss.backward()
        #perform a single optimization step (parameter update)
        optimizer.step()
        #update the progress bar with epoch number and current loss
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


Epoch 0: 100%|█████████████████████████████████████████████████████████████| 51/51 [15:10<00:00, 17.85s/it, loss=0.158]
Epoch 1: 100%|█████████████████████████████████████████████████████████████| 51/51 [14:59<00:00, 17.64s/it, loss=0.134]
