# Building Transformer Model - Pytorch

![Transformer Architecture](./Transformer_Understanding/img/transformerblock.png)

Based on the picture above, we will build the components of our model following the Transformer architecture, applying what we studied in the "[Transformer Understanding](./Transformer_Understanding/TransformerNeuralNetworks.ipynb)" part.

In [1]:
# import library
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import warnings

In [2]:
# Install a global "ignore" filter: every Python warning raised after this line is suppressed
# (presumably to keep the notebook output tidy -- note it also hides deprecation warnings).
warnings.filterwarnings('ignore')

In [3]:
# Report which device type PyTorch can see; this only prints, it does not move anything to the GPU.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using CPU.")
else:
    print("CUDA is available. PyTorch is using GPU.")
    print("Number of GPUs available: ", torch.cuda.device_count())
    # Index 0 is the first visible CUDA device.
    print("GPU name: ", torch.cuda.get_device_name(0))

CUDA is available. PyTorch is using GPU.
Number of GPUs available:  1
GPU name:  NVIDIA GeForce GTX 1650


## Start with these small blocks, functions, ...

- **Token Embedding**

In [4]:
class TokenEmbedding(nn.Module):
    """Lookup table that maps integer token ids to dense vectors of size ``d_model``."""

    def __init__(self, vocab_size, d_model):
        """
        Build the embedding table.

        :param vocab_size: Number of distinct words / tokens in the vocabulary (rows of the table)
        :param d_model: The embedding dimension (columns of the table)

        Example: a vocabulary of 1000 tokens with an embedding dimension of 512 gives a 1000x512 table
        """
        super().__init__()
        self.vocab_size, self.d_model = vocab_size, d_model
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """
        Replace every token id in ``x`` by its embedding vector.

        :param x: integer tensor of token ids
        :return: tensor with one extra trailing dimension of size d_model

        Example:
        Input: (Batch_size, Sequence of words) - (30x100)
        Output: (Batch_size, Sequence of words, d_model) - (30x100x512)
        """
        return self.embedding_layer(x)

In [5]:
# Example: embed a batch of 30 sequences, each holding 100 random token ids.
vocab_size = 1000
d_model = 512

embedding_layer = TokenEmbedding(vocab_size, d_model)
input_data = torch.randint(0, vocab_size, (30, 100))  # ids drawn from [0, vocab_size)
embedding_layer(input_data).shape  # one d_model-sized vector per token id

torch.Size([30, 100, 512])

- **Positional Encoding**

In [6]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table (sine on even feature indices, cosine on odd ones)."""

    def __init__(self, d_model, max_sequence_length, dropout=0):
        """
        Positional Encoding layer for adding positional information to token embeddings.

        :param d_model: The embedding dimension.
        :param max_sequence_length: The maximum length of the input sequences.
        :param dropout: Dropout rate applied to the table returned by ``forward``.
        """
        super(PositionalEncoding, self).__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # The table depends only on the constructor arguments, so it is built once here.
        # Registering it as a buffer makes it follow .to()/.cuda() with the module;
        # persistent=False keeps it out of state_dict so checkpoint keys are unchanged.
        self.register_buffer(
            "pe", self._build_table(d_model, max_sequence_length), persistent=False
        )

    @staticmethod
    def _build_table(d_model, max_sequence_length):
        """Return the (1, max_sequence_length, d_model) sinusoidal table."""
        even_i = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_i / d_model)
        position = torch.arange(max_sequence_length).reshape(max_sequence_length, 1)
        angles = position / denominator
        # Interleave sin/cos along the feature axis: [sin_0, cos_0, sin_2, cos_2, ...]
        stacked = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        table = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For an odd d_model the interleaving yields d_model + 1 columns; drop the surplus one.
        return table[:, :d_model].unsqueeze(0)

    def forward(self):
        """
        :return: the positional-encoding table after dropout,
                 shape (1, max_sequence_length, d_model)
        """
        return self.dropout(self.pe)

In [7]:
# Example: d_model=512, max_sequence_length=100, dropout=0.1
PE = PositionalEncoding(512,100,0.1)
PE().shape  # forward() takes no input; it returns the whole table with a leading batch axis of 1

torch.Size([1, 100, 512])

- **Multi-Head Attention**

In [8]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over ``num_heads`` slices of the embedding."""

    def __init__(self, d_model, num_heads=8):
        """
        Multi-Head Attention
        :param d_model: the embedding dimension
        :param num_heads: the number of heads, default equals 8

        :raises ValueError: if num_heads is not positive or d_model is not divisible by num_heads
        """
        super(MultiHeadAttention, self).__init__()
        # Fail early with a clear message instead of a cryptic reshape error in forward().
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Query / key / value projections. Each is head_dim x head_dim and is applied to the
        # last axis of the per-head tensors, so the same weights are shared by all heads.
        self.query = nn.Linear(self.head_dim, self.head_dim, bias=False)  # the Query matrix
        self.key = nn.Linear(self.head_dim, self.head_dim, bias=False)  # the Key matrix
        self.value = nn.Linear(self.head_dim, self.head_dim, bias=False)  # the Value matrix

        # Output projection applied after the heads are concatenated back to d_model.
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        """
        Perform forward pass of the multi-head attention mechanism.

        :param query: Query tensor of shape (batch_size, q_len, d_model)
        :param key: Key tensor of shape (batch_size, k_len, d_model)
        :param value: Value tensor of shape (batch_size, v_len, d_model); v_len must equal k_len
        :param mask: Optional tensor broadcastable to (batch_size, num_heads, q_len, k_len);
                     positions where it equals 0 receive no attention

        :return: Output tensor of shape (batch_size, q_len, d_model)
        """
        batch_size = key.size(0)
        k_len, q_len, v_len = key.size(1), query.size(1), value.size(1)

        # Split the embedding into heads: (batch, seq_len, d_model) -> (batch, seq_len, heads, head_dim)
        # example: from (30x10x512) -> (30x10x8x64)
        key = self.key(key.reshape(batch_size, k_len, self.num_heads, self.head_dim))
        query = self.query(query.reshape(batch_size, q_len, self.num_heads, self.head_dim))
        value = self.value(value.reshape(batch_size, v_len, self.num_heads, self.head_dim))

        # Scaled dot products: (batch, heads, q_len, k_len), e.g: (30x8x10x10)
        product = torch.einsum("bqhd,bkhd->bhqk", query, key) / math.sqrt(self.head_dim)

        if mask is not None:
            # Fill with the most negative value representable in the current dtype so the
            # softmax weight becomes 0; a hard-coded -1e20 overflows in half precision.
            product = product.masked_fill(mask == 0, torch.finfo(product.dtype).min)

        scores = F.softmax(product, dim=-1)

        # Weighted sum of values, then merge the heads back:
        # (batch, heads, q_len, k_len) x (batch, k_len, heads, head_dim) -> (batch, q_len, d_model)
        output = torch.einsum("bhqk,bkhd->bqhd", scores, value).reshape(
            batch_size, q_len, self.num_heads * self.head_dim
        )

        return self.linear_layer(output)  # (30x10x512) -> (30x10x512)

In [9]:
# Example: self-attention-sized inputs, batch of 30 sequences of length 10.
d_model = 512
num_heads = 8

mha_layer = MultiHeadAttention(d_model, num_heads)

# Three independent random tensors stand in for query / key / value.
query = torch.rand(30, 10, d_model)
key = torch.rand(30, 10, d_model)
value = torch.rand(30, 10, d_model)

mha_layer(query, key, value).shape  # no mask passed

torch.Size([30, 10, 512])

- **Layer Normalization Block**

In [10]:
class LayerNormalization(nn.Module):
    """Hand-written layer normalization over the trailing ``len(parameters_shape)`` dimensions."""

    def __init__(self, parameters_shape, eps=1e-5):
        """
        :param parameters_shape: shape of the learnable scale/shift; its length decides how many
                                 trailing dimensions of the input are normalized
        :param eps: small constant added to the variance before the square root
        """
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Learnable scale (starts at 1) and shift (starts at 0).
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """
        :param inputs: tensor whose trailing dimensions match parameters_shape
        :return: normalized tensor with the same shape as inputs
        """
        # Trailing axes to reduce over: -1, -2, ..., -len(parameters_shape)
        n_trailing = len(self.parameters_shape)
        dims = [-k for k in range(1, n_trailing + 1)]

        mean = inputs.mean(dim=dims, keepdim=True)
        centered = inputs - mean
        # Biased (population) variance: mean of squared deviations.
        var = (centered ** 2).mean(dim=dims, keepdim=True)
        normalized = centered / (var + self.eps).sqrt()
        return self.gamma * normalized + self.beta

# Or using nn.LayerNorm(d_model)

In [11]:
# Example: parameters_shape has 3 entries, so all three axes of the input are normalized together.
ln = LayerNormalization((1,2,3))
ln(torch.randn(1,2,3)).shape  # normalization keeps the input shape

torch.Size([1, 2, 3])

- **Positionwise Feed Forward Block**

In [12]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        :param d_model: input and output feature size
        :param hidden: feature size of the inner layer
        :param drop_prob: dropout probability applied after the activation
        """
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=hidden)
        self.linear2 = nn.Linear(in_features=hidden, out_features=d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """
        :param x: tensor whose last dimension is d_model
        :return: tensor with the same shape as x
        """
        expanded = self.relu(self.linear1(x))
        return self.linear2(self.dropout(expanded))

# feed_forward = nn.Sequential(
#     nn.Linear(d_model, expansion_factor * d_model),  # e.g: 512x(4*512) -> (512, 2048)
#     nn.ReLU(),  # ReLU activation function
#     nn.Linear(d_model * expansion_factor, d_model),  # e.g: 4*512)x512 -> (2048, 512)
# )

In [13]:
# Example: d_model=512 with an inner layer of 300 units, applied to 1 sequence of 5 positions.
ff = PositionwiseFeedForward(512, 300)
ff(torch.randn(1,5,512)).shape  # the last dimension is mapped 512 -> 300 -> 512

torch.Size([1, 5, 512])

- **Copy Block Function**: we could use `nn.Sequential` instead, but I don't think that is necessary here, because the module parameters are not changed.

In [14]:
def replicate(block, N=6) -> nn.ModuleList:
    """
    Build a stack of N independent deep copies of ``block``.

    :param block: an nn.Module instance, mainly the encoder or decoder block of the architecture
    :param N: how many copies to stack; the original paper used 6
    :return: an nn.ModuleList holding the N copies (the passed-in block itself is not included)
    """
    clones = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every layer its own parameters, initialised to the same values.
        clones.append(copy.deepcopy(block))
    return clones

## With those small blocks and functions, let's build these important blocks!

- **Transformer Block** includes: **Multi-Head Attention**, **Add & Norm**, **Feed Forward** and **Dropout**

In [15]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer, each wrapped in Add & Norm."""

    def __init__(self,
                 d_model=512,
                 num_heads=8,
                 expansion_factor=4,
                 dropout=0.1
                ):
        """
        The Transformer Block used in the encoder and decoder as well

        :param d_model: the embedding dimension
        :param num_heads: the number of heads
        :param expansion_factor: the inner feed-forward width is expansion_factor * d_model
        :param dropout: probability dropout (between 0 and 1)
        """
        super().__init__()

        hidden_dim = expansion_factor * d_model  # e.g. 4 * 512 = 2048

        self.multihead_attention = MultiHeadAttention(d_model, num_heads)
        # A single LayerNorm instance serves both Add & Norm steps in forward().
        self.norm = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, hidden_dim),  # Ex: (512, 2048)
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model),  # Ex: (2048, 512), back to the input width
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """
        :param query: tensor used as attention queries
        :param key: tensor used as attention keys
        :param value: tensor used as attention values; it is also the residual input
        :param mask: optional attention mask forwarded to the multi-head attention
        :return: the block output, last dimension d_model
        """
        # Sub-layer 1: multi-head attention
        attended = self.multihead_attention(query, key, value, mask)

        # Add & Norm -- the residual connection is taken from `value`
        hidden = self.dropout(self.norm(attended + value))

        # Sub-layer 2: feed-forward network
        projected = self.feed_forward(hidden)

        # Add & Norm again, reusing the same normalization layer
        return self.dropout(self.norm(projected + hidden))
        

- **Encoder** includes Input Pre-processing (**Token Embedding** & **Positional Encoding**) and **Transformer Block** (Encode block in the picture on the top)

In [16]:
class Encoder(nn.Module):
    """Encoder stack: token embedding + positional encoding, then ``num_blocks`` Transformer blocks."""

    def __init__(self,
                 max_length_seq,
                 vocab_size,
                 d_model=512,
                 num_blocks=6,
                 expansion_factor=4,
                 num_heads=8,
                 dropout=0.1
                ):
        """
        The Encoder part of the Transformer architecture

        :param max_length_seq: the max length of the sequence
        :param vocab_size: the total size of the vocabulary
        :param d_model: the embedding dimension
        :param num_blocks: the number of blocks (encoders), 6 by default
        :param expansion_factor: the factor that determines the output dimension of the feed forward layer in each encoder
        :param num_heads: the number of heads in each encoder
        :param dropout: probability dropout (between 0 and 1)
        """
        super(Encoder, self).__init__()

        # Dropout applied to the summed embeddings
        self.dropout = nn.Dropout(dropout)

        # Token Embedding
        self.token_emb = TokenEmbedding(vocab_size, d_model)
        # Positional Encoding (created with its default dropout of 0)
        self.pos_encode = PositionalEncoding(d_model, max_length_seq)

        # Transformer Blocks: num_blocks deep copies of one freshly built block
        self.transformer_blocks = replicate(
            TransformerBlock(d_model, num_heads, expansion_factor, dropout), num_blocks
        )

    def forward(self, x):
        """
        :param x: integer token ids of shape (batch_size, seq_len), seq_len <= max_length_seq
        :return: encoded sequence of shape (batch_size, seq_len, d_model)
        """
        # Input Pre-processing: Token Embedding + Positional Encoding
        token_vectors = self.token_emb(x)
        # Keep only the first seq_len positions and place them on the embeddings' device,
        # so the addition also works after the model has been moved to a GPU.
        # The table is a constant computed without parameters, so it carries no gradient.
        positions = self.pos_encode()[:, :x.size(1), :].to(token_vectors.device)
        output = self.dropout(positions + token_vectors)

        # Go through the Transformer Blocks as self-attention (query = key = value)
        for block in self.transformer_blocks:
            output = block(output, output, output)

        return output
                 

In [17]:
# Test: run a random batch through a freshly initialised Encoder.
max_length_seq = 100
vocab_size = 10000
d_model = 512
num_blocks = 6
expansion_factor = 4
num_heads = 8
dropout = 0.1

encoder = Encoder(max_length_seq, vocab_size, d_model, num_blocks, expansion_factor, num_heads, dropout)

batch_size = 32
sequence_length = 50  # shorter than max_length_seq, so only the first 50 positions are used
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length))

# encoder_output is reused by the decoder test cell further down.
encoder_output = encoder(input_ids)
encoder_output.shape

torch.Size([32, 50, 512])

- **Decoder Block**: because the architecture of the **Decoder Block** differs slightly from the Transformer Block (the Encoder Block in the picture), we need to build it separately!

In [18]:
class DecoderBlock(nn.Module):
    """Decoder layer: masked self-attention, then encoder-decoder attention and feed-forward."""

    def __init__(self,
                 d_model=512,
                 num_heads=8,
                 expansion_factor=4,
                 dropout=0.1
                ):
        """
        The DecoderBlock which will consist of the TransformerBlock used in the encoder, plus a decoder multi-head attention
        :param d_model: the embedding dimension
        :param num_heads: the number of heads
        :param expansion_factor: the factor that determines the output dimension of the feed forward layer
        :param dropout: probability dropout (between 0 and 1)
        """
        super(DecoderBlock, self).__init__()

        # Masked Multi-Head Attention
        self.attention = MultiHeadAttention(d_model, num_heads)

        # Normalization in Add & Norm
        self.norm = nn.LayerNorm(d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Transformer Block: supplies the cross-attention, norm and feed-forward sub-layers
        self.transformer_block = TransformerBlock(d_model, num_heads, expansion_factor, dropout)

    def forward(self, query, key, x, mask): # Different from Encoder
        """
        :param query: encoder output. Kept for backward compatibility and not read here;
                      Decoder.forward passes the same tensor as ``key``.
        :param key: encoder output of shape (batch_size, src_len, d_model); it provides both
                    the keys and the values of the encoder-decoder attention
        :param x: decoder input of shape (batch_size, trg_len, d_model)
        :param mask: mask for the decoder self-attention (0 = position hidden)
        :return: tensor of shape (batch_size, trg_len, d_model)
        """
        # Masked Multi-Head Attention over the decoder input
        self_attended = self.attention(x, x, x, mask)

        # Add & Norm
        decoder_state = self.dropout(self.norm(self.dropout(self_attended + x)))

        # Encoder-decoder attention: queries come from the decoder, keys and values from
        # the encoder. The previous wiring (queries and keys from the encoder, values from
        # the decoder) mixed target positions without a mask, so a position could see later
        # target tokens, and it only ran when src_len == trg_len.
        # TransformerBlock.forward adds its residual to `value`, which is only right for
        # self-attention, so its sub-layers are driven directly here.
        block = self.transformer_block
        cross_attended = block.multihead_attention(decoder_state, key, key)

        # Add & Norm around the decoder state
        hidden = block.dropout(block.norm(cross_attended + decoder_state))

        # Feed Forward, then Add & Norm
        fed_forward = block.feed_forward(hidden)
        return block.dropout(block.norm(fed_forward + hidden))

- **Decoder** includes **Output Pre-processing** (**Token Embedding** & **Positional Encoding**), **Decoder Block**

In [19]:
class Decoder(nn.Module):
    """Decoder stack: token embedding + positional encoding, then ``num_blocks`` decoder blocks."""

    def __init__(self,
                 target_vocab_size,
                 max_length_seq,
                 d_model=512,
                 num_blocks=6,
                 expansion_factor=4,
                 num_heads=8,
                 dropout=0.1
                ):
        """
        The Decoder part of the Transformer architecture

        :param target_vocab_size: the size of the target vocabulary
        :param max_length_seq: the maximum length of the sequence
        :param d_model: the embedding dimension
        :param num_blocks: the number of blocks (decoders), 6 by default
        :param expansion_factor: the factor that determines the output dimension of the feed forward layer in each decoder
        :param num_heads: the number of heads in each decoder
        :param dropout: probability dropout (between 0 and 1)
        """
        super(Decoder, self).__init__()

        # Dropout applied to the summed embeddings
        self.dropout = nn.Dropout(dropout)

        # Token Embedding
        self.token_emb = TokenEmbedding(target_vocab_size, d_model)
        # Positional Encoding (created with its default dropout of 0)
        self.pos_encode = PositionalEncoding(d_model, max_length_seq)

        # Decoder Blocks: num_blocks deep copies of one freshly built block
        self.decoder_blocks = replicate(
            DecoderBlock(d_model, num_heads, expansion_factor, dropout), num_blocks
        )

    def forward(self, x, encoder_output, mask):
        """
        :param x: integer target token ids of shape (batch_size, trg_len), trg_len <= max_length_seq
        :param encoder_output: output of the encoder, last dimension d_model
        :param mask: mask handed to every decoder block for its self-attention
        :return: tensor of shape (batch_size, trg_len, d_model)
        """
        # Output Pre-processing: Token Embedding + Positional Encoding
        token_vectors = self.token_emb(x)
        # Keep only the first trg_len positions and place them on the embeddings' device,
        # so the addition also works after the model has been moved to a GPU.
        # The table is a constant computed without parameters, so it carries no gradient.
        positions = self.pos_encode()[:, :x.size(1), :].to(token_vectors.device)
        output = self.dropout(positions + token_vectors)

        # Go through the Decoder Blocks; every block receives the same encoder output
        for block in self.decoder_blocks:
            output = block(encoder_output, encoder_output, output, mask)

        return output

In [20]:
# Test
def make_trg_mask(trg):
    """
    Build the causal (look-ahead) mask for a batch of target sequences.

    :param trg: tensor of shape (batch_size, trg_len); only its shape and device are used
    :return: tensor of shape (batch_size, 1, trg_len, trg_len) holding a lower-triangular
             matrix of ones (1 = may attend, 0 = hidden), on the same device as trg
    """
    batch_size, trg_len = trg.shape
    # Create the mask on trg's device so it can be combined with attention scores on a GPU.
    lower_triangle = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
    # expand() broadcasts the single matrix over the batch without copying it.
    return lower_triangle.expand(batch_size, 1, trg_len, trg_len)

# Same hyper-parameters as the encoder test, so the decoder matches encoder_output's d_model.
max_length_seq = 100
vocab_size = 10000
d_model = 512
num_blocks = 6
expansion_factor = 4
num_heads = 8
dropout = 0.1

decoder = Decoder(vocab_size, max_length_seq, d_model, num_blocks, expansion_factor, num_heads, dropout)

batch_size = 32
sequence_length = 50
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length))  # random target ids

# encoder_output is the tensor produced by the encoder test cell above.
decoder(input_ids,encoder_output,make_trg_mask(input_ids)).shape

torch.Size([32, 50, 512])

## Finally, The Transformer Architecture is complete!

In [21]:
class Transformer(nn.Module):
    """Full encoder-decoder Transformer with a final projection onto the target vocabulary."""

    def __init__(self,
                 d_model,
                 vocab_size,
                 target_vocab_size,
                 max_length_seq,
                 num_blocks=6,
                 expansion_factor=4,
                 num_heads=8,
                 dropout=0.1
                ):
        """
        :param d_model: the embedding dimension
        :param vocab_size: the size of the source vocabulary
        :param target_vocab_size: the size of the target vocabulary
        :param max_length_seq: the maximum sequence length, shared by encoder and decoder
        :param num_blocks: the number of encoder blocks and of decoder blocks
        :param expansion_factor: the factor that determines the inner feed forward width
        :param num_heads: the number of attention heads
        :param dropout: probability dropout (between 0 and 1)
        """
        super(Transformer, self).__init__()

        self.target_vocab_size = target_vocab_size

        self.encoder = Encoder(max_length_seq=max_length_seq,
                               vocab_size=vocab_size,
                               d_model=d_model,
                               num_blocks=num_blocks,
                               expansion_factor=expansion_factor,
                               num_heads=num_heads,
                               dropout=dropout)

        self.decoder = Decoder(target_vocab_size=target_vocab_size,
                               max_length_seq=max_length_seq,
                               d_model=d_model,
                               num_blocks=num_blocks,
                               expansion_factor=expansion_factor,
                               num_heads=num_heads,
                               dropout=dropout)

        # Projects every decoder position onto the target vocabulary
        self.linear_layer = nn.Linear(d_model, target_vocab_size)

    def make_trg_mask(self, trg):
        """
        Build the causal (look-ahead) mask for the target batch.

        :param trg: tensor of shape (batch_size, trg_len); only its shape and device are used
        :return: tensor of shape (batch_size, 1, trg_len, trg_len) holding a lower-triangular
                 matrix of ones, on the same device as trg
        """
        batch_size, trg_len = trg.shape
        # Create the mask on trg's device; a CPU-only mask cannot be combined with
        # attention scores that live on a GPU.
        lower_triangle = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        return lower_triangle.expand(batch_size, 1, trg_len, trg_len)

    def forward(self, source, target):
        """
        :param source: integer source token ids of shape (batch_size, src_len)
        :param target: integer target token ids of shape (batch_size, trg_len)
        :return: probabilities over the target vocabulary,
                 shape (batch_size, trg_len, target_vocab_size)
        """
        trg_mask = self.make_trg_mask(target)
        enc_out = self.encoder(source)
        outputs = self.decoder(target, enc_out, trg_mask)
        # NOTE(review): softmax is applied here, so the result is probabilities, not logits;
        # a loss that expects raw logits should not be fed this output directly.
        output = F.softmax(self.linear_layer(outputs), dim=-1)
        return output

In [22]:
# Test: tiny end-to-end run with an 11-token vocabulary and two sequences of length 12.
src_vocab_size = 11
target_vocab_size = 11
num_blocks = 6
seq_len = 12

# let 0 be sos token and 1 be eos token
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1],
                    [0, 2, 8, 7, 3, 4, 5, 6, 7, 2, 10, 1]])
target = torch.tensor([[0, 1, 7, 4, 3, 5, 9, 2, 8, 10, 9, 1],
                       [0, 1, 5, 6, 2, 4, 7, 6, 2, 8, 10, 1]])

print(src.shape, target.shape)
# max_length_seq equals the sequence length, so the whole positional table is used.
model = Transformer(d_model=512,
                    vocab_size=src_vocab_size,
                    target_vocab_size=target_vocab_size,
                    max_length_seq=seq_len,
                    num_blocks=num_blocks,
                    expansion_factor=4,
                    num_heads=8
                   )

print(model)  # module tree, useful for checking layer sizes
out = model(src, target)
print(f"Output Shape: {out.shape}")

torch.Size([2, 12]) torch.Size([2, 12])
Transformer(
  (encoder): Encoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (token_emb): TokenEmbedding(
      (embedding_layer): Embedding(11, 512)
    )
    (pos_encode): PositionalEncoding(
      (dropout): Dropout(p=0, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0-5): 6 x TransformerBlock(
        (multihead_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=False)
          (key): Linear(in_features=64, out_features=64, bias=False)
          (value): Linear(in_features=64, out_features=64, bias=False)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        