# Building a Transformer Model - PyTorch

![Transformer Architecture](./Transformer_Understanding/img/transformerblock.png)

Based on the picture above, we will build our model with the Transformer architecture, following what we researched in the "[Transformer Understanding](./Transformer_Understanding/TransformerNeuralNetworks.ipynb)" part.

In [1]:
# import library
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import warnings

In [2]:
# Silence every warning category so the notebook output stays readable.
warnings.filterwarnings(action="ignore")

In [3]:
# Report which device PyTorch can use before building the model.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using CPU.")
else:
    print("CUDA is available. PyTorch is using GPU.")
    print("Number of GPUs available: ", torch.cuda.device_count())
    print("GPU name: ", torch.cuda.get_device_name(0))

CUDA is available. PyTorch is using GPU.
Number of GPUs available:  1
GPU name:  NVIDIA GeForce GTX 1650


## Start with the small building blocks and functions

- **Token Embeddings**

In [4]:
class TokenEmbedding(nn.Module):
    """
    Lookup table that maps integer token ids to learned vectors of size d_model
    """
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.vocab_size, self.d_model = vocab_size, d_model
        # One trainable row of width d_model per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """
        Look up the embedding vector of every token id in x
        :param x: Tensor of integer token ids, e.g. with shape (sequence_length)
        :return: Tensor of embeddings with a trailing dimension of size d_model, e.g. (sequence_length, d_model)
        """
        embedded_tokens = self.embedding(x)
        return embedded_tokens

In [28]:
# Example: embed three token ids from a 1000-word vocabulary into 512-d vectors
ex_token_emb = TokenEmbedding(vocab_size=1000, d_model=512)
ex_token_ids = torch.tensor([1, 3, 5])
ex_token_emb(ex_token_ids)

tensor([[-1.1881,  1.4106, -0.0508,  ..., -0.0776,  0.0657, -1.6559],
        [ 0.2590, -0.9922,  1.7796,  ..., -0.6304,  0.6049, -1.0139],
        [-1.2311,  1.1663,  0.7043,  ..., -0.4209,  0.2823, -0.1984]],
       grad_fn=<EmbeddingBackward0>)

- **Positional Encodings**

In [6]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal Positional Encoding Layer

    The table is built once in the constructor and stored as a non-persistent
    buffer: it is not recomputed on every call, it follows the module through
    ``.to(...)`` / ``.cuda()`` (device and floating dtype), and it adds no
    entry to ``state_dict()``.
    """
    def __init__(self, max_sequence_length, d_model):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        self.register_buffer(
            "pe_table",
            self._build_table(max_sequence_length, d_model),
            persistent=False,
        )

    @staticmethod
    def _build_table(max_sequence_length, d_model):
        """
        Compute the sin/cos table
        :param max_sequence_length: number of positions (rows)
        :param d_model: encoding width (columns)
        :return: float Tensor with shape (max_sequence_length, d_model)
        """
        # Column 2k (sin) and column 2k+1 (cos) share the denominator
        # 10000^(2k / d_model), so a single vector serves both.
        even_index = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_index / d_model)
        position = torch.arange(0, max_sequence_length, 1).float().unsqueeze(1)
        angles = position / denominator
        table = torch.zeros(max_sequence_length, d_model)
        table[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cos column fewer than sin columns.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return table

    def forward(self):
        """
        Return the Positional Encoding table
        :return: Tensor contain Positional Encoding with shape (max_sequence_length, d_model)
        """
        expected_shape = (self.max_sequence_length, self.d_model)
        if tuple(self.pe_table.shape) != expected_shape:
            # The size attributes were changed after construction: rebuild so
            # the result still reflects them, keeping the buffer's device/dtype.
            self.pe_table = self._build_table(*expected_shape).to(self.pe_table)
        # Hand out a copy so in-place edits by the caller cannot corrupt the cache.
        return self.pe_table.clone()

In [27]:
# Example: positional encodings for 3 positions of width 512
ex_pe = PositionalEncoding(max_sequence_length=3, d_model=512)
ex_pe()

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00]])

- **Query-Key-Value Layer**

In [10]:
class QKVLayer(nn.Module):
    """
    Joint Query/Key/Value projection, split per attention head
    """
    def __init__(self, d_model, num_heads):
        """
        :param d_model: size of the last dimension of the input
        :param num_heads: number of attention heads; must divide d_model evenly
        :raises ValueError: if d_model is not a multiple of num_heads
        """
        super().__init__()
        # Fail early: otherwise the per-head reshape in forward() breaks later
        # with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # A single projection produces q, k and v for all heads at once.
        self.linear_layer = nn.Linear(d_model, d_model * 3)

    def forward(self, x):
        """
        Project the input and split it into per-head query, key and value
        :param x: Tensor with shape (batch_size, sequence_length, d_model)
        :return: Tuple (q, k, v), each with shape (batch_size, num_heads, sequence_length, head_dim)
        """
        batch_size, sequence_length, _ = x.size()
        qkv = self.linear_layer(x)
        # Each head owns a contiguous slice of width 3 * head_dim.
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        # Move the head axis in front of the sequence axis.
        qkv = qkv.permute(0, 2, 1, 3)
        # Cut every head's slice into its q, k and v thirds.
        q, k, v = qkv.chunk(3, dim=-1)
        return q, k, v

In [26]:
# Example: split a (2, 10, 10) batch into per-head q, k, v for 2 heads
ex_qkv = QKVLayer(d_model=10, num_heads=2)
ex_qkv_input = torch.randn(2, 10, 10)
q, k, v = ex_qkv(ex_qkv_input)
tuple(t.shape for t in (q, k, v))

(torch.Size([2, 2, 10, 5]),
 torch.Size([2, 2, 10, 5]),
 torch.Size([2, 2, 10, 5]))

- **Multi-Head Attention**

In [19]:
class MultiHeadAttention(nn.Module):
    """
    Multi-Head Self-Attention Layer
    """
    def __init__(self, d_model, num_heads):
        """
        :param d_model: size of the last dimension of the input and output
        :param num_heads: number of attention heads
        """
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = QKVLayer(d_model,num_heads)
        # Output projection applied after the heads are concatenated.
        self.linear_layer = nn.Linear(d_model, d_model)

    @staticmethod
    def _scaled_dot_product(q, k, v, mask=None):
        """
        Scaled dot-product attention
        :param q: Tensor with shape (batch_size, num_heads, sequence_length, head_dim)
        :param k: Tensor with the same shape as q
        :param v: Tensor with the same shape as q
        :param mask: optional Tensor added to the attention scores before the softmax
        :return: Tuple (values, attention)
        """
        d_k = q.size(-1)
        scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
        if mask is not None:
            # Swap batch and head axes for the addition so the mask is
            # broadcast against (num_heads, batch_size, seq, seq), then swap back.
            scaled = scaled.permute(1, 0, 2, 3) + mask
            scaled = scaled.permute(1, 0, 2, 3)
        attention = F.softmax(scaled, dim=-1)
        values = torch.matmul(attention, v)
        return values, attention

    def forward(self,x, mask=None):
        """
        Apply multi-head self-attention
        :param x: Tensor with shape (batch_size, sequence_length, d_model)
        :param mask: optional Tensor added to the attention scores before the softmax
        :return: Tensor with shape (batch_size, sequence_length, d_model)
        """
        batch_size, sequence_length, _ = x.size()
        # Call the sub-module itself (not .forward) so registered hooks run.
        q, k, v = self.qkv_layer(x)
        values, _ = self._scaled_dot_product(q, k, v, mask)
        # Put the head axis back next to head_dim and merge the heads.
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out

In [25]:
# Example: 2-head self-attention over a (2, 10, 10) random batch
ex_multihead = MultiHeadAttention(d_model=10, num_heads=2)
ex_multihead(torch.randn(2, 10, 10))

tensor([[[-0.3140, -0.3788, -0.5676, -0.1718,  0.1243,  0.1269, -0.2300,
           0.1349, -0.4820,  0.2326],
         [-0.2262, -0.2845, -0.4488, -0.3127,  0.0116,  0.0524, -0.0241,
           0.1813, -0.4163,  0.0696],
         [-0.2650, -0.3136, -0.4473, -0.2717,  0.0507,  0.0525, -0.0499,
           0.2105, -0.4399,  0.0838],
         [-0.4085, -0.3837, -0.4819, -0.1319,  0.1462,  0.0694, -0.1749,
           0.1999, -0.5573,  0.1795],
         [-0.3296, -0.2547, -0.4988, -0.1962, -0.0279,  0.1040, -0.0832,
           0.0714, -0.4798,  0.0864],
         [-0.3184, -0.2965, -0.4904, -0.2107,  0.0581,  0.0861, -0.1365,
           0.1213, -0.4857,  0.1305],
         [-0.2521, -0.3771, -0.5989, -0.2193,  0.0902,  0.1414, -0.2108,
           0.1242, -0.4361,  0.2406],
         [-0.2354, -0.3520, -0.5022, -0.2717,  0.0781,  0.0773, -0.0992,
           0.2167, -0.4219,  0.1233],
         [-0.4018, -0.1970, -0.5149, -0.1610, -0.0563,  0.0993, -0.1292,
          -0.0429, -0.5624,  0.0882],
 

- **Normalization Layer**

In [21]:
class LayerNormalization(nn.Module):
    """
    Custom Layer Normalization over the trailing dimensions of the input
    """
    def __init__(self, parameters_shape, eps=1e-5):
        """
        :param parameters_shape: shape of the trailing dimensions to normalize; an int or a sequence of ints
        :param eps: constant added to the variance before the square root
        """
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        # Learnable scale and shift, starting as the identity transform.
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """
        Normalize the input, then scale by gamma and shift by beta
        :param inputs: Tensor whose trailing dimensions match parameters_shape
        :return: Tensor with the same shape as inputs
        """
        # Take the number of normalized axes from gamma rather than from
        # len(parameters_shape), so a plain int shape works as well.
        dims = [-(i + 1) for i in range(self.gamma.dim())]
        mean = inputs.mean(dim=dims, keepdim=True)
        # Biased variance: mean of squared deviations.
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [24]:
# Example: normalize each (3, 4) slice of a random (2, 3, 4) batch
ex_layer_norm = LayerNormalization(parameters_shape=(3, 4))
ex_layer_norm(torch.randn(2, 3, 4))

tensor([[[-0.9756, -1.4953,  0.8961, -0.1627],
         [ 0.5595, -0.1139,  1.2191,  0.5873],
         [ 1.6807,  0.1400, -1.5243, -0.8109]],

        [[-1.9675, -0.3757,  0.2315,  0.0231],
         [-0.4022,  1.2492,  0.9361, -0.5800],
         [ 1.8513,  0.1362,  0.1366, -1.2385]]], grad_fn=<AddBackward0>)

- **Feed Forward Layer**