# Building and Training an LLM from Scratch

## import packages

In [1]:
# Imports
import re
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import *

## Load data

In [2]:
# Load text data
text = open('text.txt', 'r').read()

print(text)

'Hello, how are you? I'm Adriana.\n'
'Hello, Adriana, my name is Henrique. Nice to meet you too. How are you today?\n'
'Great. My soccer team won the competition.\n'
'Wow, congratulations Henrique!\n'
'Thanks Adriana.\n'
'Shall we go out for pizza later to celebrate?\n'
'Sure. Do you recommend any restaurants Adriana?\n'
'Yes, a new restaurant has opened and they say the banana pizza is phenomenal.\n'
'Ok. Let's meet at the restaurant at seven tonight, okay?\n'
'Sure. See you later then.'


## Preprocessing of Text Data and Vocabulary Construction

In [3]:
# We filter special characters: '.', ',', '?', '!'
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')
print(sentences)

["'hello how are you i'm adriana\\n'", "'hello adriana my name is henrique nice to meet you too how are you today\\n'", "'great my soccer team won the competition\\n'", "'wow congratulations henrique\\n'", "'thanks adriana\\n'", "'shall we go out for pizza later to celebrate\\n'", "'sure do you recommend any restaurants adriana\\n'", "'yes a new restaurant has opened and they say the banana pizza is phenomenal\\n'", "'ok let's meet at the restaurant at seven tonight okay\\n'", "'sure see you later then'"]


In [4]:
# We divide the sentences into words and create a word list
word_list = list(set(" ".join(sentences).split()))
print(word_list)

['nice', 'do', "'ok", "then'", "competition\\n'", 'to', 'opened', 'we', 'new', "adriana\\n'", 'later', "'thanks", 'the', 'meet', "celebrate\\n'", "'shall", 'at', 'henrique', 'congratulations', 'recommend', 'restaurant', 'out', 'adriana', 'team', 'and', 'for', 'pizza', 'say', 'banana', "let's", 'any', 'see', 'name', "phenomenal\\n'", "'hello", 'my', 'seven', 'they', 'soccer', "henrique\\n'", "today\\n'", 'are', 'how', "i'm", "'great", "'wow", 'a', 'won', "'yes", "'sure", "okay\\n'", 'you', 'tonight', 'restaurants', 'has', 'go', 'is', 'too']


In [5]:
# Initialize the word dictionary with BERT's special tokens
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
print(word_dict)

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}


In [6]:
# Add the words to the dictionary and create indexes
for i, w in enumerate(word_list):
    word_dict[w] = i + 4

print(word_dict)

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, 'nice': 4, 'do': 5, "'ok": 6, "then'": 7, "competition\\n'": 8, 'to': 9, 'opened': 10, 'we': 11, 'new': 12, "adriana\\n'": 13, 'later': 14, "'thanks": 15, 'the': 16, 'meet': 17, "celebrate\\n'": 18, "'shall": 19, 'at': 20, 'henrique': 21, 'congratulations': 22, 'recommend': 23, 'restaurant': 24, 'out': 25, 'adriana': 26, 'team': 27, 'and': 28, 'for': 29, 'pizza': 30, 'say': 31, 'banana': 32, "let's": 33, 'any': 34, 'see': 35, 'name': 36, "phenomenal\\n'": 37, "'hello": 38, 'my': 39, 'seven': 40, 'they': 41, 'soccer': 42, "henrique\\n'": 43, "today\\n'": 44, 'are': 45, 'how': 46, "i'm": 47, "'great": 48, "'wow": 49, 'a': 50, 'won': 51, "'yes": 52, "'sure": 53, "okay\\n'": 54, 'you': 55, 'tonight': 56, 'restaurants': 57, 'has': 58, 'go': 59, 'is': 60, 'too': 61}


In [8]:
# We invert the order and put the indexes as key and the words as value in the dictionary
number_dict = {i: w for i, w in enumerate(word_dict)}

print(number_dict)

{0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[MASK]', 4: 'nice', 5: 'do', 6: "'ok", 7: "then'", 8: "competition\\n'", 9: 'to', 10: 'opened', 11: 'we', 12: 'new', 13: "adriana\\n'", 14: 'later', 15: "'thanks", 16: 'the', 17: 'meet', 18: "celebrate\\n'", 19: "'shall", 20: 'at', 21: 'henrique', 22: 'congratulations', 23: 'recommend', 24: 'restaurant', 25: 'out', 26: 'adriana', 27: 'team', 28: 'and', 29: 'for', 30: 'pizza', 31: 'say', 32: 'banana', 33: "let's", 34: 'any', 35: 'see', 36: 'name', 37: "phenomenal\\n'", 38: "'hello", 39: 'my', 40: 'seven', 41: 'they', 42: 'soccer', 43: "henrique\\n'", 44: "today\\n'", 45: 'are', 46: 'how', 47: "i'm", 48: "'great", 49: "'wow", 50: 'a', 51: 'won', 52: "'yes", 53: "'sure", 54: "okay\\n'", 55: 'you', 56: 'tonight', 57: 'restaurants', 58: 'has', 59: 'go', 60: 'is', 61: 'too'}


In [9]:
# Vocabulary size
vocab_size = len(word_dict)
print(vocab_size)

62


In [10]:
# We create a list for the tokens
token_list = list()

# Loop through the sentences to create the list of tokens
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

token_list

[[38, 46, 45, 55, 47, 13],
 [38, 26, 39, 36, 60, 21, 4, 9, 17, 55, 61, 46, 45, 55, 44],
 [48, 39, 42, 27, 51, 16, 8],
 [49, 22, 43],
 [15, 13],
 [19, 11, 59, 25, 29, 30, 14, 9, 18],
 [53, 5, 55, 23, 34, 57, 13],
 [52, 50, 12, 24, 58, 10, 28, 41, 31, 16, 32, 30, 60, 37],
 [6, 33, 17, 20, 16, 24, 20, 40, 56, 54],
 [53, 35, 55, 14, 7]]

In [15]:
# First sentence
text[0:33]

"'Hello, how are you? I'm Adriana."

In [16]:
# First sentence in token format (what will be used to train the BERT model)
token_list[0]

[38, 46, 45, 55, 47, 13]

## Define the Hyperparameters

In [17]:
# Hyperparameters
batch_size = 6
n_segments = 2
dropout = 0.2

# Maximum length
maxlen = 100

# Maximum number of tokens that will be predicted
max_pred = 7

# Number of layers
n_layers = 6

# Number of heads in multi-head attention
n_heads = 12

# Embedding size
d_model = 768

# Feedforward dimension size: 4 * d_model
d_ff = d_model * 4

# Dimension of K(=Q)V
d_k = d_v = 64

# Epochs
NUM_EPOCHS = 50

## Creating Data Batches and Applying Special Tokens

The make_batch() function below creates batches of data for training the BERT model. It is responsible for generating the correct input required for training BERT, which includes the input tokens, the masked tokens, the masked token positions, the segment IDs, and a label indicating whether the second sentence immediately follows the first. Let’s describe each part of the function and use images to make it easier to understand.

**Initialization**: The function starts by initializing an empty batch and counters for positive and negative sentences. Positive sentences are pairs of sentences where the second sentence immediately follows the first, while negative sentences are pairs where it does not. The batch should be balanced between positive and negative sentences.

**Sentence pair generation**: For each instance in the batch, the function randomly selects two sentences from the dataset. Each sentence is then converted to a list of token IDs and the special tokens [CLS] and [SEP] are added in the appropriate places.

**Segment IDs**: For each pair of sentences, the function generates segment IDs, which are 0 for tokens in the first sentence and 1 for tokens in the second sentence.

**Masked Language Model (MLM)**: The function then randomly selects 15% of the tokens to mask for the MLM task, ensuring that the [CLS] and [SEP] tokens are not masked. These tokens are either replaced with the [MASK] token, a random token, or left unchanged, depending on a random draw.

<!-- ![ERROR](images/bert1.png) -->
<img src="images/bert1.png" width="800px">


**Padding**: The function adds padding to the input IDs, segment IDs, masked tokens, and masked positions to ensure that all lists are the same length.

**Next Sentence Prediction**: Finally, the function checks whether the second sentence immediately follows the first. If so, it adds a True label to the instance and increments the positives counter. If not, it adds a False label and increments the negative count.

<!-- ![ERROR](images/bert2.png) -->
<img src="images/bert2.png" width="800px">

This function continues generating instances until the batch is full and contains an equal amount of positive and negative instances. Then, the batch is returned.

Note that this function is just an example of how data can be prepared for BERT training. Depending on the dataset and the specific task, it may be necessary to adjust this function.


The main technical innovation of BERT is that it applies bidirectional training of the Transformer, a popular attention model, to language modeling. This contrasts with previous efforts that analyzed a text sequence from left to right or combined left-to-right and right-to-left training. The results of the paper show that a language model that is trained bidirectionally can have a deeper sense of context and language flow than single-direction language models.

In the paper, the researchers detail a new technique called Masked LM (MLM), which enables bidirectional training in models where it was previously impossible.

Link to the BERT paper: https://arxiv.org/abs/1810.04805

In [18]:
# Defines the function to create batches of data
def make_batch():
    
    # Initialize the batch as an empty list
    batch = []
    
    # Initialize counters for positive and negative examples
    positive = negative = 0
    
    # Continue until half of the batch is positive examples and half is negative examples
    while positive != batch_size/2 or negative != batch_size/2:
        
        # Choose random indices for two sentences
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        
        # Retrieves the tokens corresponding to the indexes
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        
        # Prepare input ids by adding special tokens [CLS] and [SEP]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        
        # Set the segment ids to differentiate the two sentences
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        # Calculate the number of predictions to make (15% of tokens)
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) 
        
        # Identifies candidate positions for masking that are not [CLS] or [SEP]
        cand_maked_pos = [i for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        
        # Shuffles the candidate positions
        shuffle(cand_maked_pos)
        
        # Initialize lists for masked tokens and their positions
        masked_tokens, masked_pos = [], []
        
        # Mask tokens until you reach the desired number of predictions
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            
            # Random mask
            if random() < 0.8:  
                input_ids[pos] = word_dict['[MASK]'] 
            
            # Replace with another token 10% of the time (20% of the remaining time)
            elif random() < 0.5:  
                index = randint(0, vocab_size - 1) 
                input_ids[pos] = word_dict[number_dict[index]] 
        
        # Add zero padding to input ids and segment ids to reach maximum length
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        
        # Add zero padding to the masked tokens and their positions if necessary
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        
        # Add to the batch as a positive example if the sentences are consecutive
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) 
            positive += 1
        
        # Add to the batch as a negative example if the sentences are not consecutive
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) 
            negative += 1
    
    # Returns the complete batch
    return batch

In [19]:
# Function for padding
def get_attn_pad_masked(seq_q, seq_k):
    
    batch_size, len_q = seq_q.size()
    
    batch_size, len_k = seq_k.size()
    
    pad_attn_masked = seq_k.data.eq(0).unsqueeze(1)
    
    return pad_attn_masked.expand(batch_size, len_q, len_k)

The above function creates an attention mask for padding tokens in a sequence.

**Inputs**: The function accepts two sequences, seq_q and seq_k. These are typically the query sequence and the key sequence in an attention operation.

**Size Extraction**: The function extracts the batch size (batch_size) and sequence lengths (len_q and len_k) from the dimensions of the input sequences.

**Mask Creation**: The attention mask is created by checking which elements in seq_k are equal to zero (which indicates a padding token). This produces a boolean array of the same size as seq_k, where True indicates a padding token and False indicates an actual token.

**Adding a dimension**: The dimension is added to the mask using the unsqueeze(1) method, which adds an extra dimension at index 1. This is necessary because the attention mask must have the same dimension as the attention matrices in the Transformer.

**Expanding the mask**: Finally, the mask is expanded to have the same size as the attention matrix, which has dimensions (batch_size, len_q, len_k). The expanded mask is returned by the function.

In short, the function creates a mask that can be used to prevent the model from paying attention to padding tokens when calculating attention. Padding tokens are used to pad sequences so that they are all the same length, but they do not carry any useful information, so it is important to ensure that the model ignores them.


In [20]:
# Create a batch
batch = make_batch()

# Extract the elements from the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

# Ids das sentenças
input_ids

tensor([[ 1,  3,  5, 55, 23, 34, 57,  3,  2, 53,  5, 55, 23, 34,  3, 13,  2,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 53,  5,  3, 23,  3, 57, 13,  2,  6, 33, 17,  3, 16, 24, 20, 40, 56,
         54,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 38, 26, 39, 36, 60, 21,  4,  9, 17,  3, 61, 46, 45, 55,  3,  2,  3,
         

In [21]:
# Input ids of the first sentence
input_ids[0]

tensor([ 1,  3,  5, 55, 23, 34, 57,  3,  2, 53,  5, 55, 23, 34,  3, 13,  2,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [22]:
segment_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

In [23]:
masked_tokens[0]

tensor([57, 53, 13,  0,  0,  0,  0])

In [24]:
masked_pos[0]

tensor([14,  1,  7,  0,  0,  0,  0])

In [25]:
isNext[0]

tensor(0)

In [26]:
# Applies the padding function
get_attn_pad_masked(input_ids, input_ids)[0][0], input_ids[0]

(tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]),
 tensor([ 1,  3,  5, 55, 23, 34, 57,  3,  2, 53,  5, 55, 23, 34,  3, 13,  2,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  

## Model Building

The image below is a high-level description of the Transformer encoder. The input is a sequence of tokens, which are first embedded into vectors and then processed in the neural network. The output is a sequence of vectors of size H, where each vector corresponds to an input token with the same index.

In technical terms, predicting the output words requires:

- 1- Adding a classification layer on top of the encoder output.
- 2- Multiplying the output vectors by the embedding matrix, transforming them into the vocabulary dimension.
- 3- Calculating the probability of each word in the vocabulary with softmax.

The loss function in the BERT model only considers the prediction of masked values ​​and ignores the prediction of unmasked words. As a consequence, the model converges more slowly than directional models, a characteristic that is compensated by its greater context awareness.

<!-- ![ERROR](images/bert3.png) -->
<img src="images/bert3.png" width="600px">

In the BERT training process, the model is given sentence pairs as input and learns to predict whether the second sentence in the pair is the subsequent sentence in the original document. During training, 50% of the inputs are a pair where the second sentence is the subsequent sentence in the original document, while for the other 50% a random sentence from the corpus is chosen as the second sentence.

To help the model distinguish between the two sentences being trained, the input is processed as follows before being fed to the model:

- 1- A [CLS] token is inserted at the beginning of the first sentence and a [SEP] token is inserted at the end of each sentence.

- 2- A sentence embedding indicating Sentence A or Sentence B is added to each token. Sentence embeddings are similar in concept to token embeddings with a vocabulary of 2.

- 3- A positional embedding is added to each token to indicate its position in the sequence. The concept and implementation of positional embedding are presented in the Transformer article.

In fact, the embedding used to train the model is a combination of several embeddings.

<!-- ![ERROR](images/bert4.png) -->
<img src="images/bert4.png" width="800px">

In [27]:
# GeLu activation function
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

### 1- Embedding Module

The Embedding class below is part of the BERT architecture. Individual Components of the Class:

Initialization (def init(self)): The class constructor initializes the components required for the embeddings.

self.tok_embed: This is the token embedding layer that maps each token to a vector of dimension d_model.

self.pos_embed: This is the position embedding layer that maps the position of a token within a sequence to a vector of dimension d_model.

self.seg_embed: This is the segment embedding layer that maps the token type (0 for the first sentence and 1 for the second sentence) to a vector of dimension d_model.

self.norm: This is the normalization component of the layer that is used to normalize the embedding vectors.

Forward method (def forward(self, x, seg)): The forward method is where the actual embedding happens.

- First, it calculates the position of each token in the sequence.
- Next, it creates a position matrix of the same shape as the input x using pos.unsqueeze(0).expand_as(x).
- Then, it calculates the total embedding as the sum of the token, position, and segment embeddings.
- Finally, it normalizes the embedding using the normalization layer and returns the result.

The combination of these three embeddings allows BERT to take into account both the individual meaning of the tokens and the order in which they appear in the sequence, as well as whether the token belongs to the first or second sentence. This makes BERT embedding very powerful and flexible.

In [28]:
# Embedding class
class Embedding(nn.Module):
    
    
    def __init__(self):
        # Constructor method
        super(Embedding, self).__init__()
        
        # Token embedding
        self.tok_embed = nn.Embedding(vocab_size, d_model)  
        
        # Position embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  
        
        # Segment (token type) embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  
        
        # Layer normalization
        self.norm = nn.LayerNorm(d_model)

    # Forward Method
    def forward(self, x, seg):
        
        seq_len = x.size(1)
        
        pos = torch.arange(seq_len, dtype = torch.long)
        
        # (seq_len,) -> (batch_size, seq_len)
        pos = pos.unsqueeze(0).expand_as(x)  
        
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        
        return self.norm(embedding)

### 2- Scaled Dot Product Attention Module

Below is the implementation of the Scaled Dot-Product Attention mechanism, which is a key part of the Transformer model used in BERT and other natural language processing models.

Here is a line-by-line explanation of the forward method:

**Scores**: The dot product of Q (query matrix) and K (key matrix) is calculated to determine the score for each key-query pair. These scores determine how well each element of the input sequence should be satisfied in producing the output representation for a given element. The score is then scaled by the square root of the dimension of the keys (d_k) to prevent the dot product values ​​from becoming too large in high-dimensional environments.

**Attention Mask**: The attention mask is applied to the scores by filling in the locations where the mask has a value of 1 with a very large negative number (-1e9). This ensures that these locations are given a weight close to zero when softmax is applied.

**Softmax**: The softmax function is applied to the last axis of the scores to obtain the attention weights. This ensures that all the weights are positive and sum to 1, so they can be interpreted as probabilities.

**Context**: The attention weights are then multiplied by the value matrix V (value) to obtain the output of the attention mechanism. Each value is weighted by the amount we should "attend" to that value, as determined by the attention weights.

The method returns the context (the weighted output) and the attention matrix.

In the Transformer model, Normalized Dot Product Attention is used multiple times in each layer, allowing the model to pay attention to different parts of the input when producing each element of the output. This allows the Transformer to effectively handle long-range dependencies between words in the input sequences.

In [29]:
# Defines the class to perform normalized dot product attention
class ScaledDotProductAttention(nn.Module):
    
    # Class initialization method
    def __init__(self):
        
        # Initialize the base class
        super(ScaledDotProductAttention, self).__init__()

    # Forward method to define the forward passage of data
    def forward(self, Q, K, V, attn_mask):
        
        # Compute attention scores as the product of Q and K, and normalize by key size
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        
        # Applies attention mask to avoid attention to certain tokens
        scores.masked_fill_(attn_mask, -1e9)
        
        # Apply softmax to obtain normalized attention weights
        attn = nn.Softmax(dim = -1)(scores)
        
        # Multiply the attention weights by V to get the context
        context = torch.matmul(attn, V)
        
        # Returns the context and attention weights
        return context, attn

### 3- Multi-Head Attention Module

Below is the implementation of Multi-Head Attention, which is a key component of the Transformer architecture used in models like BERT. The idea of ​​multi-head attention is to apply normalized dot product attention multiple times in parallel, each with different learned weights. This allows the model to focus on different positions and capture different types of information.

Let's take a line-by-line look at the forward method:

**Initialization**: residual and batch_size are initialized with Q and the size of the first axis of Q, respectively. The residual will be used later for the residual connection path.

**Linear Transformations**: We apply linear transformations to the input data (Q, K, and V) using different weights. These transformations generate multiple "heads" of attention.

**Reshaping**: The outputs of these linear transformations are then reshaped and transposed to have the appropriate shape for normalized dot product attention.

**Attention Mask**: The attention mask is adjusted to match the shape of the attention heads.

**Normalized Dot Product Attention**: Normalized dot product attention is then applied to each of the attention heads.

**Context Reshaping**: The output (context) of each attention head is then reshaped and concatenated.

**Linear Transformation and Normalization**: A linear transformation is applied to the concatenated context, followed by a layer-wise normalization.

**Residual Connection**: The final result is obtained by summing the output of the layer-wise normalization to the residual connection path (original input Q).

Finally, the function returns the normalized output and the attention matrix. Multi-head attention allows the model to consider information from different parts of the input sequence, in different representation subspaces, at the same time, which improves the model's ability to capture multiple features of the text.

In [30]:
# Defines the class to perform multi-head attention
class MultiHeadAttention(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(MultiHeadAttention, self).__init__()
        
        # Define the weight matrix for Q queries
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        
        # Define the weight matrix for K keys
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        
        # Define the weight matrix for the V values
        self.W_V = nn.Linear(d_model, d_v * n_heads)

    # Forward method to define the forward passage of data
    def forward(self, Q, K, V, attn_mask):
        
        # Save the input Q for use in the residual and get the batch size
        residual, batch_size = Q, Q.size(0)
        
        # Process Q through W_Q and arrange the result to have [n_heads] in the second dimension
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        
        # Process K through W_K and arrange the result to have [n_heads] in the second dimension
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        
        # Process V through W_V and arrange the result to have [n_heads] in the second dimension
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)
        
        # Adapt attn_mask to be compatible with the dimensions of q_s, k_s, v_s
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        
        # Compute the scaled attention from the dot product and context for each attention head
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        
        # Rearrange the context to match the attention heads and return to the original format
        context = context.transpose(1,2).contiguous().view(batch_size, -1, n_heads * d_v)
        
        # Applies a linear transformation to the combined context
        output = nn.Linear(n_heads * d_v, d_model)(context)
        
        # Normalize the output layer and add the residual
        return nn.LayerNorm(d_model)(output + residual), attn

In [31]:
# Creates the Embedding object
emb = Embedding()

# Generates the Embeddings
embeds = emb(input_ids, segment_ids)

# Generates the attention mask
attenM = get_attn_pad_masked(input_ids, input_ids)

# Generates MultiHeadAttention
MHA = MultiHeadAttention()(embeds, embeds, embeds, attenM)

# Output
output, A = MHA

A[0][0]

tensor([[0.0585, 0.0581, 0.0577,  ..., 0.0000, 0.0000, 0.0000],
        [0.0744, 0.0675, 0.0640,  ..., 0.0000, 0.0000, 0.0000],
        [0.0478, 0.0491, 0.0376,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0614, 0.0333, 0.0486,  ..., 0.0000, 0.0000, 0.0000],
        [0.0790, 0.0253, 0.0508,  ..., 0.0000, 0.0000, 0.0000],
        [0.0563, 0.0374, 0.0608,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<SelectBackward0>)

### 4- Positional Feedforward Module

This is the implementation of the Positional Feedforward Network (PoswiseFeedForward), which is a component of the Transformer architecture, used in models like BERT.

The Positional Feedforward Network is composed of two linear layers with a GELU (Gaussian Error Linear Unit) activation between them.

Here is the detailed explanation of the forward method:

**First Linear Layer (self.fc1)**: The input x is passed through a linear layer (also called a fully connected layer). This layer has a linear transformation with d_model inputs and d_ff outputs, where d_model is the dimension of the embedding space and d_ff is the dimension of the hidden layer of the feed-forward network. This allows the model to learn non-linear representations.

**GELU Activation**: Next, the GELU activation is applied. The GELU function allows the model to learn more complex and non-linear transformations. It helps to deal with the problem of vanishing gradients, allowing more information to pass through the network.

**Second Linear Layer (self.fc2)**: Finally, the output of the GELU activation is passed through a second linear layer, which transforms the output back to the original d_model dimension. This is done so that the output of this feedforward network can be summed with the original input (residual connection) in the Transformer.

The return of the function is therefore the output of this second linear layer, which has undergone the transformation of the first linear layer, GELU activation, and the second linear layer.

Positional feed-forward networks are an important part of Transformer models, allowing them to learn more complex representations and perform non-linear transformations of the input data.

In [32]:
# Defines the class for the Positional Feed Forward network
class PoswiseFeedForward(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(PoswiseFeedForward, self).__init__()
        
        # First linear layer that increases the dimension of the data from d_model to d_ff
        self.fc1 = nn.Linear(d_model, d_ff)
        
        # Second linear layer that reduces the dimension back from d_ff to d_model
        self.fc2 = nn.Linear(d_ff, d_model)

    # Forward method to define the forward passage of data
    def forward(self, x):
        
        # Apply the first linear transformation, followed by the GELU activation function
        # and then the second linear transformation
        return self.fc2(gelu(self.fc1(x)))

### 5- Módulo Encoder Layer

Esta classe define uma Camada de Codificador (EncoderLayer), que é um componente da arquitetura Transformer e também é usado em modelos como BERT. Cada camada de codificador no Transformer contém duas subcamadas: uma camada de Atenção Multi-Cabeças e uma Rede Feed-Forward Posicional.

Aqui está a explicação detalhada do método forward:

**Atenção Multi-Cabeças (self.enc_self_attn)**: A entrada enc_inputs passa por uma camada de Atenção Multi-Cabeças, que é usada para que cada palavra na entrada tenha atenção direcionada a todas as outras palavras. Essa camada também recebe uma máscara (enc_self_attn_mask), que é usada para evitar que o modelo preste atenção a certas palavras (como as de preenchimento). A saída da Atenção Multi-Cabeças é outra sequência de representações vetoriais, com a mesma dimensão da entrada. A matriz de atenção que mostra como cada palavra se atentou a todas as outras também é retornada.

**Rede Feed-Forward Posicional (self.pos_ffn)**: A saída da camada de Atenção Multi-Cabeças passa então por uma Rede Feed-Forward Posicional. Esta é uma rede neural simples que opera independentemente em cada posição da sequência (ou seja, a mesma rede é aplicada a cada posição). Isso permite ao modelo aprender representações mais complexas e realizar transformações não-lineares dos dados.

A função retorna a saída desta camada de codificador, que é a saída da Rede Feed-Forward Posicional, junto com a matriz de atenção. Portanto, a entrada e a saída desta camada do codificador têm a mesma dimensão, o que permite que várias dessas camadas de codificador sejam empilhadas para formar o codificador completo do Transformer.

In [33]:
# Defines the class for the encoder layer
class EncoderLayer(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(EncoderLayer, self).__init__()
        
        # Instantiate multi-head attention for encoder self-attention
        self.enc_self_attn = MultiHeadAttention()
        
        # Instantiate the Positional Feed Forward network for use after self-attention
        self.pos_ffn = PoswiseFeedForward()

    # Forward method to define the forward passage of data
    def forward(self, enc_inputs, enc_self_attn_mask):
        
        # Apply self-attention to input data
        enc_inputs, atnn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        
        # After self-attention, pass the result through the Positional Feed Forward network
        enc_inputs = self.pos_ffn(enc_inputs)
        
        # Returns the encoder output and attention weights
        return enc_inputs, atnn

### 6- Arquitetura Final do LLM (Modelo BERT)

Esta classe define o modelo BERT (Bidirectional Encoder Representations from Transformers), um modelo de linguagem de última geração que usa transformers e atenção bidirecional para entender a semântica das palavras dentro de um contexto.

Vamos analisar em detalhes o método forward:

Embedding (self.embedding): Transforma as entradas (input_ids e segment_ids) em vetores densos (embeddings).

Máscara de Atenção (get_attn_pad_masked): Gera uma máscara de atenção para ignorar os tokens de preenchimento (pad) nas entradas.

Camadas de Codificação (self.layers): Passa a saída do embedding e a máscara de atenção através de várias camadas do codificador. Cada camada de codificador é composta por uma camada de atenção multi-cabeças e uma rede feed-forward posicional.

Pooling (self.activ1(self.fc(output[:, 0]))): Aplica uma camada totalmente conectada e uma ativação tangente hiperbólica à primeira posição (o token de classificação) de cada sequência na saída do codificador. Isso resulta em um vetor de representação de sequência.

Classificador (self.classifier): Uma camada totalmente conectada que gera os logits para a tarefa de classificação de próxima sentença.

Extração de Tokens Mascarados (torch.gather(output, 1, masked_pos)): Selecione os vetores de saída correspondentes aos tokens mascarados.

Transformação dos Tokens Mascarados (self.norm(self.activ2(self.linear(h_masked)))): Aplica uma transformação linear, uma ativação GELU e normalização à saída dos tokens mascarados.

Decoder (self.decoder): Uma camada linear que gera os logits para a tarefa de modelagem de linguagem mascarada. Usa os mesmos pesos que a camada de embedding de tokens para a consistência no espaço de representação. Esta função decoder é usada somente para gerar os logits finais e não é usada no processo de aprendizado do modelo.

O método retorna os logits para a tarefa de modelagem de linguagem mascarada e a tarefa de classificação de próxima sentença. Esses logits podem então ser usados para calcular as perdas para ambas as tarefas durante o treinamento.

In [41]:
# BERT Model
class BERT(nn.Module):

    def __init__(self) -> None:
        
        super(BERT, self).__init__()
        
        # Embedding layer to generate input embeddings for tokens and positions
        self.embedding = Embedding()
        
        # Stack of encoder layers, where n_layers defines the number of encoder layers in the transformer model
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        
        # Fully connected layer for pooling the hidden state output from the first token (CLS token)
        self.fc = nn.Linear(d_model, d_model)
        
        # Activation function (Tanh) for the output of the pooled hidden state
        self.activ1 = nn.Tanh()
        
        # Linear layer and GELU activation for masked language model predictions
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        
        # Layer normalization applied to the masked language model predictions
        self.norm = nn.LayerNorm(d_model)
        
        # Final classification layer to output binary classification logits
        self.classifier = nn.Linear(d_model, 2)
        
        # Link embedding weights between the decoder and the token embedding layer to share weights
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        
        # Decoder layer to predict the vocabulary distribution for each masked token
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight  # Sharing weights
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))  # Adding bias to decoder output

    def forward(self, input_ids, segment_ids, masked_pos):
        
        # Generate embeddings for tokens and segments
        output = self.embedding(input_ids, segment_ids)
        
        # Generate attention mask to avoid attending to padding tokens
        enc_self_attn_mask = get_attn_pad_masked(input_ids, input_ids)
        
        # Pass the embeddings through each encoder layer in the stack
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        
        # Pooling the first token (CLS token) representation for classification
        h_pooled = self.activ1(self.fc(output[:, 0]))
        
        # Output classification logits for the CLS token
        logits_clsf = self.classifier(h_pooled)
        
        # Gather the masked positions for masked language modeling (MLM) predictions
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        
        # Extract the masked token representations from the output
        h_masked = torch.gather(output, 1, masked_pos)
        
        # Apply linear transformation, activation, and layer normalization
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        
        # Decoder output to get logits for the vocabulary prediction of masked tokens
        logits_lm = self.decoder(h_masked) + self.decoder_bias
        
        # Return logits for masked language modeling and classification
        return logits_lm, logits_clsf


## LLM Training and Evaluation

In [35]:
# Create the model
model = BERT()

# Error function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr = 0.001)

batch = make_batch()

input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

Below is a typical one-epoch training loop for a machine learning model. Let’s break it down into steps:

**optimizer.zero_grad()**: Zeroes the gradients of all optimized variables. This is done because gradients in PyTorch are cumulative, meaning that each time we call .backward(), the gradients are added together instead of being replaced. So, we need to clear these cumulative gradients before each optimization step.

**logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)**: Feeds the input data into the model and gets the output of the model. The output is composed of logits_lm and logits_clsf, which are the raw unnormalized results for the language modeling task and the classification task, respectively.

**loss_lm = criterion(logits_lm.transpose(1,2), masked_tokens)**: Computes the loss for the masked language modeling task. criterion is the loss function, logits_lm.transpose(1,2) are the model predictions, and masked_tokens are the ground truth targets.

**loss_lm = (loss_lm.float()).mean()**: Converts the loss to a floating point data type (if it isn't already) and then computes the mean of the loss.

**loss_clsf = criterion(logits_clsf, isNext)**: Computes the loss for the next sentence classification task.

**loss = loss_lm + loss_clsf**: Combines the two losses into a single scalar loss.

**loss.backward()**: Computes the gradients of all optimized variables. The gradients are computed with respect to the loss.

**optimizer.step()**: Updates the model parameters using the calculated gradients.

These steps are repeated for each training epoch. Each epoch is a complete cycle through the training set. Therefore, if NUM_EPOCHS is 10, then the complete training process is executed 10 times.

In [37]:
%%time

# Start the training loop for a defined number of epochs
for epoch in range(NUM_EPOCHS):
    
    # Resets the optimizer's gradients to avoid accumulation of gradients from previous epochs
    optimizer.zero_grad()
    
    # Pass the input data through the model and get the logits for language masking
    # and next sentence classification
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    
    # Compute the loss for the language masking task by comparing the predicted logits
    # with the actual tokens
    loss_lm = criterion(logits_lm.transpose(1,2), masked_tokens)
    
    # Calculate the average loss to normalize
    loss_lm = (loss_lm.float()).mean()
    
    # Compute the loss for the next sentence classification task
    loss_clsf = criterion(logits_clsf, isNext)
    
    # Add the losses of the two tasks to get the total loss
    loss = loss_lm + loss_clsf
    
    # Displays the current epoch and total loss
    print(f'Epoch: {epoch + 1} | Loss {loss:.4f}')
    
    # Perform backpropagation to calculate gradients
    loss.backward()
    
    # Update model parameters based on calculated gradients
    optimizer.step()

Epoch: 1 | Loss 45.8627
Epoch: 2 | Loss 133.2765
Epoch: 3 | Loss 289.1216
Epoch: 4 | Loss 127.0441
Epoch: 5 | Loss 34.0239
Epoch: 6 | Loss 96.2355
Epoch: 7 | Loss 75.0159
Epoch: 8 | Loss 45.9389
Epoch: 9 | Loss 55.3239
Epoch: 10 | Loss 62.1528
Epoch: 11 | Loss 61.9698
Epoch: 12 | Loss 57.7580
Epoch: 13 | Loss 51.7607
Epoch: 14 | Loss 45.6472
Epoch: 15 | Loss 54.8989
Epoch: 16 | Loss 46.7972
Epoch: 17 | Loss 44.6054
Epoch: 18 | Loss 38.1895
Epoch: 19 | Loss 38.3959
Epoch: 20 | Loss 40.2805
Epoch: 21 | Loss 40.1282
Epoch: 22 | Loss 38.2638
Epoch: 23 | Loss 38.9440
Epoch: 24 | Loss 37.3044
Epoch: 25 | Loss 36.6660
Epoch: 26 | Loss 36.0146
Epoch: 27 | Loss 35.3904
Epoch: 28 | Loss 35.2495
Epoch: 29 | Loss 33.0177
Epoch: 30 | Loss 32.5312
Epoch: 31 | Loss 33.3304
Epoch: 32 | Loss 30.2600
Epoch: 33 | Loss 30.6710
Epoch: 34 | Loss 30.4666
Epoch: 35 | Loss 29.3109
Epoch: 36 | Loss 28.9362
Epoch: 37 | Loss 27.4900
Epoch: 38 | Loss 26.6409
Epoch: 39 | Loss 26.0127
Epoch: 40 | Loss 25.4436
Epoch:

## Extracting Predictions from the Trained LLM

In [38]:
# Extract the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
print(text)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

'Hello, how are you? I'm Adriana.\n'
'Hello, Adriana, my name is Henrique. Nice to meet you too. How are you today?\n'
'Great. My soccer team won the competition.\n'
'Wow, congratulations Henrique!\n'
'Thanks Adriana.\n'
'Shall we go out for pizza later to celebrate?\n'
'Sure. Do you recommend any restaurants Adriana?\n'
'Yes, a new restaurant has opened and they say the banana pizza is phenomenal.\n'
'Ok. Let's meet at the restaurant at seven tonight, okay?\n'
'Sure. See you later then.'
['[CLS]', "'yes", 'a', 'new', 'restaurant', 'has', 'opened', 'and', 'they', 'say', '[MASK]', 'banana', 'pizza', 'is', "phenomenal\\n'", '[SEP]', '[MASK]', '[MASK]', '[MASK]', 'team', 'won', 'the', "competition\\n'", '[SEP]']


In [39]:
# Extract token predictions
logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('List of Real Masked Tokens: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])
print('List of Predicted Masked Tokens: ', [pos for pos in logits_lm if pos != 0])

Lista de Masked Tokens Reais:  [39, 16, 42, 48]
Lista de Masked Tokens Previstos:  [np.int64(34), np.int64(34), np.int64(34), np.int64(34), np.int64(34), np.int64(34), np.int64(34)]


In [40]:
# Extract next token predictions
logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext (Real value): ', True if isNext else False)
print('isNext (Predicted value): ', True if logits_clsf else False)

isNext (Real value):  False
isNext (Predicted value):  True


## The end