# Building the Transformer Architecture from Attention-is-all-you-need Paper
<html><img src="C:\Users\THiNKBooK\Downloads\transformer achitectire.png"></html>

## Dependencies 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

## 1. Defining the basic building blocks: Multi-Head Attention, Position-wise Feed-Forward Networks, Positional Encoding
### 1.1 Multi-head Attention

<html><img src="C:\Users\THiNKBooK\Downloads\Figure_1_Multi_Head_Attention_source_image_created_by_author_653bad32f1.avif"></html>

In [None]:
class MultiHeadAttention(nn.Module):
    '''
    Multi-head scaled dot-product attention.

    The input is projected into queries, keys and values, split into
    num_heads independent heads of size d_k = d_model // num_heads,
    attended per head, then merged back and passed through an output projection.
    '''
    def __init__(self, d_model, num_heads):
        '''
        d_model: Dimensionality of the input (size of the last dimension of Q, K and V).
        num_heads: The number of attention heads to split the input into.

        Raises AssertionError if d_model is not divisible by num_heads.
        '''
        super(MultiHeadAttention, self).__init__()
        # Each head receives an equal slice of the model dimension, so the split must be exact
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Initialize dimensions
        self.d_model = d_model # Model's dimension
        self.num_heads = num_heads # Number of attention heads processed in parallel
        self.d_k = d_model // num_heads # Dimension of each head's key, query, and value

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model) # Query transformation
        self.W_k = nn.Linear(d_model, d_model) # Key transformation
        self.W_v = nn.Linear(d_model, d_model) # Value transformation
        self.W_o = nn.Linear(d_model, d_model) # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        '''
        Compute softmax(Q K^T / sqrt(d_k)) V for every head.

        Q, K, V: Per-head tensors as produced by split_heads, i.e. (batch_size, num_heads, seq_length, d_k).
        mask: Optional tensor broadcastable to the attention scores
              (batch_size, num_heads, query_length, key_length). Positions where mask == 0
              are filled with -1e9 before the softmax, so they receive (numerically) zero weight.

        Returns the attention output with the same leading dimensions as Q and last dimension d_k.
        '''
        # Q is (..., query_length, d_k) and K is (..., key_length, d_k); transposing the last two
        # dimensions of K gives (..., d_k, key_length), so the product is (..., query_length, key_length).
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Softmax over the key dimension turns the scores into weights that sum to 1 per query
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Weighted sum of the values
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        '''
        Reshape x from (batch_size, seq_length, d_model) to (batch_size, num_heads, seq_length, d_k)
        so that all heads can be processed in a single batched matrix multiplication.
        '''
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        '''
        Inverse of split_heads: merge (batch_size, num_heads, seq_length, d_k)
        back into a single tensor of shape (batch_size, seq_length, d_model).
        '''
        batch_size, _, seq_length, d_k = x.size()
        # transpose() returns a non-contiguous view; contiguous() is required before view()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        '''
        Q, K, V: Tensors of shape (batch_size, seq_length, d_model). K and V must share
                 their sequence length; Q may have a different one (cross-attention).
        mask: Optional mask forwarded to scaled_dot_product_attention.

        Steps:
        - Apply Linear Transformations: Q, K and V are passed through W_q, W_k and W_v.
        - Split Heads: the projections are split into heads using split_heads.
        - Apply Scaled Dot-Product Attention on the split heads.
        - Combine Heads: the per-head results are merged using combine_heads.
        - Apply Output Transformation: the merged tensor is passed through W_o.

        Returns a tensor of shape (batch_size, query_length, d_model).
        '''
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

### 1.2 Position-wise Feed-Forward Networks
- In summary, the PositionWiseFeedForward class defines a position-wise feed-forward neural network that consists of two linear layers with a ReLU activation function in between. In the context of transformer models, this feed-forward network is applied to each position separately and identically. It helps in transforming the features learned by the attention mechanisms within the transformer, acting as an additional processing step for the attention outputs.

In [21]:
class PositionWiseFeedForward(nn.Module):
    '''
    Two-layer fully connected network: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).

    nn.Linear acts on the last dimension only, so every position of the input
    is transformed independently with the same weights.
    '''
    def __init__(self, d_model, d_ff):
        '''
        d_model: Size of the last dimension of both the input and the output.
        d_ff: Width of the hidden layer between the two linear maps.
        '''
        super().__init__()
        # Expansion to the hidden width, followed by the projection back to the model width
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        # Non-linearity applied between the two linear maps
        self.relu = nn.ReLU()

    def forward(self, x):
        '''
        x: Input tensor whose last dimension is d_model.

        Returns fc2(relu(fc1(x))), a tensor with the same shape as x.
        '''
        hidden = self.fc1(x)
        activated = self.relu(hidden) # negative activations are zeroed
        projected = self.fc2(activated)
        return projected

### 1.3 Positional Encoding

- **Positional Encoding** is used to inject positional information of each token in the input sequence. It leverages sine and cosine functions with different frequencies to compute the positional encodings.
- Unlike CNNs and RNNs, the self-attention mechanism used in Transformers enables parallel computation but lacks inherent word order information. To address this, **positional encoding** is used to provide positional context to the model. Specifically, a position-dependent signal is added to each word embedding in the input sequence. This helps the model incorporate the order of words in the sequence.
- The output of positional encoding has the same dimension as the embedding layer, allowing it to be added directly to the word embeddings. This ensures that both positional information (from positional encoding) and semantic information (from embeddings) are integrated and passed to subsequent layers.

### Variants of Positional Encoding

There are several approaches to implementing positional encoding, but the original Transformer model uses sine and cosine functions as defined in the following equations:


$$
PE(pos, 2i) = \sin\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)
$$

$$
PE(pos, 2i+1) = \cos\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)
$$
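Because these encodings are fixed functions of the position, they add order information while keeping the benefits of parallel computation.

For any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$.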

In [24]:
class PositionalEncoding(nn.Module):
    '''
    Adds fixed sinusoidal positional encodings to a batch of embeddings.

    The table is computed once in __init__ for positions 0 .. max_seq_length - 1 and
    stored as the non-trainable buffer `pe` of shape (1, max_seq_length, d_model).
    '''
    def __init__(self, d_model, max_seq_length):
        '''
        - d_model: The dimension of the model's input (even or odd).

        - max_seq_length: The maximum length of the sequence for which positional encodings are pre-computed.

        Internals:

        - pe: A tensor filled with zeros, which is populated with the positional encodings.

        - position: A column vector containing the position indices 0 .. max_seq_length - 1.

        - div_term: The per-frequency scale 10000^(-2i / d_model), computed through exp/log.

        - The sine function fills the even columns and the cosine function the odd columns of pe.

        - Finally, pe is registered as a buffer, which means it is part of the module's state
          but is not a trainable parameter.
        '''
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        # (max_seq_length, ceil(d_model / 2)) matrix of pos * 10000^(-2i / d_model)
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # There are d_model // 2 odd columns, one fewer than the even columns when d_model is odd,
        # so the angles are sliced to match (a no-op for even d_model).
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Leading dimension of 1 lets the table broadcast over the batch dimension in forward()
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        '''
        x: Tensor of shape (batch_size, seq_length, d_model) with seq_length <= max_seq_length.

        Returns x with the encodings of the first seq_length positions added (same shape as x).
        '''
        return x + self.pe[:, :x.size(1)]

## 2. Building the Encoder Blocks
- The EncoderLayer class defines a single layer of the transformer's encoder. It encapsulates a multi-head self-attention mechanism followed by position-wise feed-forward neural network, with residual connections, layer normalization, and dropout applied as appropriate. These components together allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks. Typically, multiple such encoder layers are stacked to form the complete encoder part of a transformer model.

<html><img src="C:\Users\THiNKBooK\Downloads\Figure_2_The_Encoder_part_of_the_transformer_network_Source_image_from_the_original_paper_b0e3ac40fa.avif"></html>

In [29]:
class EncoderLayer(nn.Module):
    '''
    One encoder layer: multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer output goes through dropout, is added to
    the sub-layer input (residual connection) and is then layer-normalized.
    '''
    def __init__(self, d_model, num_heads, d_ff, dropout):
        '''
        d_model: The dimensionality of the input.
        num_heads: The number of attention heads in the multi-head attention.
        d_ff: The dimensionality of the inner layer in the position-wise feed-forward network.
        dropout: The dropout probability, shared by both sub-layers.

        Components:

        self.self_attn: Multi-head attention mechanism.
        self.feed_forward: Position-wise feed-forward neural network.
        self.norm1 and self.norm2: Layer normalization after the first and second sub-layer.
        self.dropout: Dropout layer applied to each sub-layer output before the residual addition.
        '''
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model=d_model, d_ff=d_ff)
        self.norm1 = nn.LayerNorm(normalized_shape=d_model)
        self.norm2 = nn.LayerNorm(normalized_shape=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        '''
        x: The input to the encoder layer.
        mask: Mask passed to the self-attention (positions where it is 0 are not attended to).

        - Self-Attention: x serves as query, key and value.
        - Add & Normalize: norm1(x + dropout(attention output)).
        - Feed-Forward Network: applied to the result of the previous step.
        - Add & Normalize: norm2(previous result + dropout(feed-forward output)).
        - Output: The processed tensor is returned as the output of the encoder layer.
        '''
        # Sub-layer 1: self-attention with residual connection
        attended = self.dropout(self.self_attn(x, x, x, mask))
        after_attention = self.norm1(x + attended)

        # Sub-layer 2: feed-forward network with residual connection
        transformed = self.dropout(self.feed_forward(after_attention))
        return self.norm2(after_attention + transformed)

## 3. Building the Decoder Blocks
- The DecoderLayer class defines a single layer of the transformer's decoder. It consists of a multi-head self-attention mechanism, a multi-head cross-attention mechanism (that attends to the encoder's output), a position-wise feed-forward neural network, and the corresponding residual connections, layer normalization, and dropout layers. This combination enables the decoder to generate meaningful outputs based on the encoder's representations, taking into account both the target sequence and the source sequence. As with the encoder, multiple decoder layers are typically stacked to form the complete decoder part of a transformer model.

<html><img src="C:\Users\THiNKBooK\Downloads\Figure_3_The_Decoder_part_of_the_Transformer_network_Souce_Image_from_the_original_paper_b90d9e7f66.avif"></html>

In [33]:
class DecoderLayer(nn.Module):
    '''
    One decoder layer: self-attention over the target sequence, cross-attention over the
    encoder output, then a position-wise feed-forward network. Each sub-layer output goes
    through dropout, is added to the sub-layer input and is then layer-normalized.
    '''
    def __init__(self, d_model, num_heads, d_ff, dropout):
        '''
        d_model: The dimensionality of the input.
        num_heads: The number of attention heads in each multi-head attention.
        d_ff: The dimensionality of the inner layer in the feed-forward network.
        dropout: The dropout probability, shared by all three sub-layers.

        Components:

        self.self_attn: Multi-head self-attention mechanism for the target sequence.
        self.cross_attn: Multi-head attention mechanism that attends to the encoder's output.
        self.feed_forward: Position-wise feed-forward neural network.
        self.norm1, self.norm2, self.norm3: Layer normalization, one per sub-layer.
        self.dropout: Dropout layer applied to each sub-layer output before the residual addition.
        '''
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.cross_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model=d_model, d_ff=d_ff)
        self.norm1 = nn.LayerNorm(normalized_shape=d_model)
        self.norm2 = nn.LayerNorm(normalized_shape=d_model)
        self.norm3 = nn.LayerNorm(normalized_shape=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        '''
        x: The input to the decoder layer.
        enc_output: The encoder output, used as key and value in the cross-attention step.
        src_mask: Mask applied in the cross-attention over enc_output.
        tgt_mask: Mask applied in the self-attention over x.

        Processing Steps:

        - Self-Attention on Target Sequence: x is query, key and value; masked by tgt_mask.
        - Add & Normalize: norm1(x + dropout(self-attention output)).
        - Cross-Attention with Encoder Output: the previous result is the query,
          enc_output is key and value; masked by src_mask.
        - Add & Normalize: norm2(previous result + dropout(cross-attention output)).
        - Feed-Forward Network: applied to the result of the previous step.
        - Add & Normalize: norm3(previous result + dropout(feed-forward output)).
        - Output: The processed tensor is returned as the output of the decoder layer.
        '''
        # Sub-layer 1: masked self-attention over the target sequence
        self_attended = self.dropout(self.self_attn(x, x, x, tgt_mask))
        after_self = self.norm1(x + self_attended)

        # Sub-layer 2: cross-attention, queries from the decoder and keys/values from the encoder
        cross_attended = self.dropout(self.cross_attn(after_self, enc_output, enc_output, src_mask))
        after_cross = self.norm2(after_self + cross_attended)

        # Sub-layer 3: position-wise feed-forward network
        transformed = self.dropout(self.feed_forward(after_cross))
        return self.norm3(after_cross + transformed)

## 4. Combining the Encoder and Decoder layers to create the complete Transformer network
<html><img src="C:\Users\THiNKBooK\Downloads\transformer achitectire.png"></html>

In [36]:
class Transformer(nn.Module):
    '''
    Encoder-decoder Transformer built from stacked EncoderLayer / DecoderLayer modules.

    Token id 0 is treated as padding when the attention masks are generated.
    '''
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        '''
        src_vocab_size: Source vocabulary size.
        tgt_vocab_size: Target vocabulary size.
        d_model: The dimensionality of the model's embeddings.
        num_heads: Number of attention heads in the multi-head attention mechanism.
        num_layers: Number of layers for both the encoder and the decoder.
        d_ff: Dimensionality of the inner layer in the feed-forward network.
        max_seq_length: Maximum sequence length for positional encoding.
        dropout: Dropout rate for regularization.

        And it defines the following components:

        - self.encoder_embedding: Embedding layer for the source sequence.
        - self.decoder_embedding: Embedding layer for the target sequence.
        - self.positional_encoding: Positional encoding component (shared by source and target).
        - self.encoder_layers: A ModuleList of encoder layers.
        - self.decoder_layers: A ModuleList of decoder layers.
        - self.fc: Final fully connected (linear) layer mapping to target vocabulary size.
        - self.dropout: Dropout layer applied to the embedded inputs.
        '''
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        '''
        Build the boolean attention masks from integer token ids (0 = padding).

        src: Source token ids of shape (batch_size, src_length).
        tgt: Target token ids of shape (batch_size, tgt_length).

        Returns (src_mask, tgt_mask):
        - src_mask: shape (batch_size, 1, 1, src_length); False at padded source positions.
        - tgt_mask: shape (batch_size, 1, tgt_length, tgt_length); entry [b, 0, i, j] is True
          only if target position i is not padding and j <= i (no attention to later positions).
        '''
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Create the no-peek mask on the same device as tgt; a default-device (CPU) mask
        # would make the `&` below fail once the inputs live on a GPU.
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        '''
        src: Source token ids of shape (batch_size, src_length).
        tgt: Target token ids of shape (batch_size, tgt_length).

        - Input Embedding and Positional Encoding: The source and target sequences are embedded using
          their respective embedding layers, positional encodings are added, and dropout is applied.
        - Encoder Layers: The source sequence is passed through the encoder layers in order.
        - Decoder Layers: The target sequence and the final encoder output are passed through the decoder layers.
        - Final Linear Layer: The decoder's output is mapped to the target vocabulary size.

        Returns unnormalized scores of shape (batch_size, tgt_length, tgt_vocab_size).
        '''
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            # Every decoder layer attends to the output of the last encoder layer
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Training the PyTorch Transformer Model

In [37]:
# Fix the RNG seed so that the weight initialisation and the random sample data below
# are identical on every Restart & Run All.
torch.manual_seed(42)

# Model hyperparameters
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

# Number of sequences in the synthetic batch
batch_size = 64

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data. Ids are drawn from [1, vocab_size), so the padding id 0 never occurs.
src_data = torch.randint(1, src_vocab_size, (batch_size, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (batch_size, max_seq_length))  # (batch_size, seq_length)

In [38]:
# NOTE(review): this repeats the instantiation from the previous cell with the same arguments;
# running it replaces `transformer` with a freshly initialised model.
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

In [40]:
# Targets equal to 0 are excluded from the loss; 0 is the id that generate_mask treats as padding.
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Adam over all model parameters with a fixed learning rate.
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Switch to training mode (enables dropout). train() returns the module itself, so as the
# last expression of the cell it causes the model's repr to be displayed as the cell output.
transformer.train()

# The training loop below is currently disabled (commented out).
# When enabled, the decoder input is tgt_data[:, :-1] and the prediction target is
# tgt_data[:, 1:], i.e. each position is trained to predict the next target token.
# for epoch in range(100):
#     optimizer.zero_grad()
#     output = transformer(src_data, tgt_data[:, :-1])
#     loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
#     loss.backward()
#     optimizer.step()
#     print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Transformer(
  (encoder_embedding): Embedding(5000, 512)
  (decoder_embedding): Embedding(5000, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(
 