# Building Transformer Model - Pytorch

![Transformer Architecture](./Transformer_Understanding/img/transformerblock.png)

Based on the picture above, we will build our model following the Transformer architecture, applying what we studied in the "[Transformer Understanding](./Transformer_Understanding/TransformerNeuralNetworks.ipynb)" part.

In [1]:
# import library
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import warnings

In [2]:
# Suppress every Python warning from here on so the cell outputs stay uncluttered.
warnings.filterwarnings('ignore')

In [3]:
# Report which device PyTorch will use; the GPU details are printed only when CUDA is present.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using CPU.")
else:
    print("CUDA is available. PyTorch is using GPU.")
    print("Number of GPUs available: ", torch.cuda.device_count())
    print("GPU name: ", torch.cuda.get_device_name(0))

CUDA is available. PyTorch is using GPU.
Number of GPUs available:  1
GPU name:  NVIDIA GeForce GTX 1650


## Start with these small blocks, functions, ...

- **Get Using Device** (PyTorch requires all tensors used in one operation to be on the same device.)

In [4]:
def get_device():
    """Return the torch device to compute on: CUDA when it is available, otherwise CPU."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

- **Token Embedding**

In [25]:
class TokenEmbedding(nn.Module):
    """Lookup table that maps token indices to dense vectors of size ``d_model``."""

    def __init__(self, vocab_size, d_model):
        """
        Create the embedding table.

        :param vocab_size: Number of distinct words / tokens in the vocabulary
        :param d_model: Size of each embedding vector

        Example: a vocabulary of 1000 tokens with d_model=512 gives a 1000x512 table.
        """
        super(TokenEmbedding, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """
        Embed a tensor of token indices.

        :param x: integer tensor of token indices
        :return: the embedded tokens, moved to the device returned by get_device()

        Example:
        Input: (Batch_size, Sequence of words) - (30x100)
        Output: (Batch_size, Sequence of words, d_model) - (30x100x512)
        """
        embedded = self.embedding_layer(x)
        target_device = get_device()
        return embedded.to(target_device)

# Or just Simple
# token_embedding = nn.Embedding(vocab_size, d_model)

In [26]:
# For Example
# A vocabulary of 1000 tokens, each embedded into a 512-dimensional vector.
vocab_size = 1000
d_model = 512

embedding_layer = TokenEmbedding(vocab_size, d_model)
# A batch of 30 sequences, each holding 100 random token indices in [0, vocab_size).
input_data = torch.randint(0, vocab_size, (30, 100))
# The embedding appends a d_model axis to the input shape.
embedding_layer(input_data).shape

torch.Size([30, 100, 512])

- **Positional Encoding**

In [27]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (sine on even feature indices, cosine on odd ones)."""

    def __init__(self, d_model, max_sequence_length, dropout=0.1):
        """
        Positional Encoding layer for adding positional information to token embeddings.

        :param d_model: The embedding dimension.
        :param max_sequence_length: The maximum length of the input sequences.
        :param dropout: Dropout rate applied to the encoding returned by forward().
        """
        super(PositionalEncoding, self).__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # The table depends only on the constructor arguments, so build it once.
        # persistent=False keeps it out of state_dict, so existing checkpoints
        # still load, while the buffer still follows the module across devices.
        self.register_buffer(
            '_encoding',
            self._build_encoding(d_model, max_sequence_length),
            persistent=False,
        )

    @staticmethod
    def _build_encoding(d_model, max_sequence_length):
        """Return the (1, max_sequence_length, d_model) sinusoidal table."""
        even_i = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_i / d_model)
        position = torch.arange(max_sequence_length).reshape(max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Interleave sin/cos so feature 2i is the sine and feature 2i+1 the cosine.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For an odd d_model the interleaving yields d_model + 1 columns;
        # drop the surplus one so the result always matches the embedding size.
        PE = PE[:, :d_model]
        return PE.unsqueeze(0)

    def forward(self):
        """
        :return: the positional encoding with dropout applied,
                 shape (1, max_sequence_length, d_model)
        """
        return self.dropout(self._encoding)

In [28]:
# d_model=512, max_sequence_length=100, dropout=0.1
PE = PositionalEncoding(512,100,0.1)
# forward() takes no input; the result carries a leading axis of size 1.
PE().shape

torch.Size([1, 100, 512])

- **Multi-Head Attention**

Two modes: self-attention (used in both the encoder and the decoder) and **Multi-Head Cross Attention** (used only in the decoder, enabled with `cross=True`)

Options for the mask: **None**, **Self-Attention Mask** (**Causal Mask** or **Look-Ahead Mask**), **Padding Mask**

In [29]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention, usable as self- or cross-attention."""

    def __init__(self, d_model, num_heads=8, cross=False):
        """
        Multi-Head Attention
        :param d_model: the embedding dimension
        :param num_heads: the number of heads, default equals 8
        :param cross: True for Multi-Head Cross Attention, False for Multi-Head Attention only
        :raises ValueError: if d_model is not divisible by num_heads

        # note: The embedding dimension must be divisible by the number of heads
        """
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Fail here with a clear message instead of a reshape error in forward().
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.cross = cross

        # One fused projection produces all heads at once, which is cheaper
        # than a separate small linear layer per head.
        if self.cross:  # Multi-Head Cross Attention
            self.kv_layer = nn.Linear(d_model, 2 * d_model)
            self.q_layer = nn.Linear(d_model, d_model)
        else:
            self.qkv_layer = nn.Linear(d_model, 3 * d_model)

        # Output projection applied after the heads are concatenated
        self.linear_layer = nn.Linear(d_model, d_model)

    def scaled_dot_product(self, q, k, v, mask=None):
        """
        Compute softmax(q @ k^T / sqrt(d_k)) @ v.

        :param q: queries, (batch, heads, q_len, head_dim)
        :param k: keys, (batch, heads, k_len, head_dim)
        :param v: values, (batch, heads, k_len, head_dim)
        :param mask: optional tensor ADDED to the scores before the softmax; it must
                     broadcast against (heads, batch, q_len, k_len), e.g. (batch, q_len, k_len).
                     Presumably 0 where attention is allowed and -inf where it is not.
        :return: (values, attention weights)
        """
        d_k = q.size()[-1]
        scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
        if mask is not None:
            # Put the batch axis second so a (batch, q_len, k_len) mask
            # broadcasts over the heads, then restore the layout.
            scaled = scaled.permute(1, 0, 2, 3) + mask
            scaled = scaled.permute(1, 0, 2, 3)
        attention = F.softmax(scaled, dim=-1)
        values = torch.matmul(attention, v)
        return values, attention

    def _split_heads(self, projected, parts):
        """Reshape a fused projection to (batch, heads, seq, parts*head_dim) and split it into `parts` tensors."""
        batch_size, length_seq, _ = projected.size()
        projected = projected.reshape(
            batch_size, length_seq, self.num_heads, parts * self.head_dim
        )
        return projected.permute(0, 2, 1, 3).chunk(parts, dim=-1)

    def forward(self, x, mask=None):
        """
        Perform forward pass of the multi-head attention mechanism.

        :param x: a (batch_size, length_seq, d_model) tensor; if cross is True, a dictionary
                  holding 'encoder_output' (source of keys and values) and 'w' (the decoder
                  stream, source of the queries). The two may differ in sequence length.
        :param mask: Optional mask tensor, see scaled_dot_product

        :return: Output tensor of shape (batch_size, length_seq, d_model), where length_seq
                 is the length of the query sequence ('w' for cross attention)
        """
        if self.cross:
            # Encoder-decoder attention: queries come from the decoder stream,
            # keys and values from the encoder output.
            query_source = x['w']
            (q,) = self._split_heads(self.q_layer(query_source), 1)
            k, v = self._split_heads(self.kv_layer(x['encoder_output']), 2)
        else:
            query_source = x
            q, k, v = self._split_heads(self.qkv_layer(x), 3)

        batch_size, length_seq, _ = query_source.size()
        values, _ = self.scaled_dot_product(q, k, v, mask)
        # Concatenate the heads back into one d_model-wide vector per position.
        values = values.permute(0, 2, 1, 3).reshape(
            batch_size, length_seq, self.num_heads * self.head_dim
        )
        return self.linear_layer(values)

In [30]:
# For Example
d_model = 512
num_heads = 8


# Attention
# Self-attention (cross defaults to False) over one sequence of 10 random d_model-sized vectors.
mha_layer = MultiHeadAttention(d_model, num_heads)
mha_layer(torch.randn(1,10,d_model))

tensor([[[-0.0229,  0.3181, -0.0215,  ...,  0.0259,  0.1319, -0.0412],
         [-0.0419,  0.2770,  0.0670,  ...,  0.0602,  0.1198, -0.0771],
         [-0.0457,  0.2849,  0.0391,  ...,  0.0590,  0.0696, -0.0897],
         ...,
         [ 0.0452,  0.2738, -0.0414,  ...,  0.0301,  0.0869, -0.0696],
         [ 0.0180,  0.2401, -0.0527,  ...,  0.0451,  0.0974, -0.0906],
         [ 0.0140,  0.3105, -0.0263,  ...,  0.0064,  0.0948, -0.0760]]],
       grad_fn=<ViewBackward0>)

In [31]:
# Cross Attention
# With cross=True the layer takes a dict with the keys 'encoder_output' and 'w' instead of a tensor.
mha_layer = MultiHeadAttention(d_model, num_heads,cross=True)
mha_layer({'encoder_output':torch.randn(1,10,d_model),'w':torch.randn(1,10,d_model)})

tensor([[[-0.1622, -0.1015,  0.0889,  ..., -0.0870,  0.0573, -0.0422],
         [-0.1445, -0.1074,  0.0502,  ..., -0.0585,  0.0648, -0.0502],
         [-0.1760, -0.1166,  0.1122,  ..., -0.0488,  0.0623, -0.0487],
         ...,
         [-0.1120, -0.0916,  0.0743,  ..., -0.0733,  0.0644, -0.0471],
         [-0.1116, -0.0606,  0.0635,  ..., -0.0384,  0.0473, -0.0808],
         [-0.1102, -0.0878,  0.0236,  ..., -0.0136,  0.0609, -0.0803]]],
       grad_fn=<ViewBackward0>)

- **Layer Normalization Block**

In [32]:
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions, with a learnable scale and shift."""

    def __init__(self, parameters_shape, eps=1e-5):
        """
        :param parameters_shape: shape of the trailing dimensions to normalize over;
                                 gamma and beta are created with this shape
        :param eps: small constant added to the variance for numerical stability
        """
        super(LayerNormalization, self).__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(parameters_shape))   # learnable shift

    def forward(self, inputs):
        """Normalize `inputs` to zero mean / unit variance over the trailing dims, then scale and shift."""
        # One negative axis index for each entry of parameters_shape.
        reduce_dims = list(range(-len(self.parameters_shape), 0))
        mean = inputs.mean(dim=reduce_dims, keepdim=True)
        centered = inputs - mean
        variance = (centered ** 2).mean(dim=reduce_dims, keepdim=True)
        normalized = centered / (variance + self.eps).sqrt()
        return self.gamma * normalized + self.beta

# Or using nn.LayerNorm(d_model)

In [33]:
# For Example
# parameters_shape has three entries, so normalization runs over the last three dimensions.
ln = LayerNormalization((1,2,3))
ln(torch.randn(1,2,3))

tensor([[[ 1.6767, -0.5315, -1.1357],
         [ 0.7328,  0.2627, -1.0050]]], grad_fn=<AddBackward0>)

- **Positionwise Feed Forward Block**

In [34]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        :param d_model: size of the input and output features
        :param hidden: size of the intermediate layer
        :param drop_prob: dropout probability applied after the activation
        """
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=hidden)
        self.linear2 = nn.Linear(in_features=hidden, out_features=d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Expand to `hidden` features, apply ReLU and dropout, then project back to `d_model`."""
        activated = self.relu(self.linear1(x))
        return self.linear2(self.dropout(activated))

# feed_forward = nn.Sequential(
#     nn.Linear(d_model, expansion_factor * d_model),  # e.g: 512x(4*512) -> (512, 2048)
#     nn.ReLU(),  # ReLU activation function
#     nn.Linear(d_model * expansion_factor, d_model),  # e.g: 4*512)x512 -> (2048, 512)
# )

In [35]:
# For Example
# d_model=512 with a hidden width of 300; the output keeps the input shape.
ff = PositionwiseFeedForward(512, 300)
ff(torch.randn(1,5,512)).shape

torch.Size([1, 5, 512])

- **Copy Block Function**: we could use nn.Sequential, but I think we don't need to, because we don't change any of the module's parameters

In [36]:
def replicate(block, N=6) -> nn.ModuleList:
    """
    Build a stack of N independent deep copies of a block.

    :param block: an nn.Module instance, typically one encoder or decoder block
    :param N: how many copies to create (the original paper stacks 6)
    :return: an nn.ModuleList holding the N copies
    """
    block_stack = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every layer its own parameters instead of sharing them
        block_stack.append(copy.deepcopy(block))
    return block_stack

## With those small blocks and functions, let's build these important blocks!

- **Preprocessing** for Input Pre-processing and Output Pre-processing

In [37]:
class Preprocessing(nn.Module):
    """Turn a batch of raw sentences into embedded, position-encoded tensors."""

    def __init__(self, max_length_seq, d_model, language_to_index, start_token, end_token, pad_token, dropout=0.1):
        """
        :param max_length_seq: fixed length every tokenized sentence is padded to
        :param d_model: the embedding dimension
        :param language_to_index: mapping from token to integer index (the vocabulary)
        :param start_token: token marking the start of a sentence (a key of language_to_index)
        :param end_token: token marking the end of a sentence (a key of language_to_index)
        :param pad_token: token used to pad up to max_length_seq (a key of language_to_index)
        :param dropout: dropout probability
        """
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.language_to_index = language_to_index
        self.max_length_seq = max_length_seq
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token

        # Layer
        self.token_embedding = TokenEmbedding(self.vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_length_seq, dropout)
        self.dropout = nn.Dropout(dropout)

    def tokenize(self, sentence, start_token:bool, end_token:bool):
        """
        Convert one sentence into a padded 1-D tensor of token indices.

        Each element of `sentence` (each character, for a string) is looked up in
        language_to_index, so an unknown element raises KeyError.

        :param sentence: the sentence to encode
        :param start_token: whether to prepend the start token
        :param end_token: whether to append the end token
        :return: tensor of length max_length_seq
        :raises ValueError: if the encoded sentence is longer than max_length_seq
        """
        encode_char = [self.language_to_index[token] for token in sentence]
        if start_token:
            encode_char.insert(0, self.language_to_index[self.start_token])
        if end_token:
            encode_char.append(self.language_to_index[self.end_token])

        missing = self.max_length_seq - len(encode_char)
        if missing < 0:
            # An over-long sequence cannot be stacked with the others or added
            # to the positional encoding; report it here instead of letting an
            # obscure shape error surface later.
            raise ValueError(
                f"Tokenized sentence has {len(encode_char)} tokens, "
                f"which exceeds max_length_seq={self.max_length_seq}"
            )
        if missing:
            encode_char.extend([self.language_to_index[self.pad_token]] * missing)
        return torch.tensor(encode_char)

    def batch_tokens(self, batch, start_token:bool, end_token:bool):
        """Tokenize every sentence in `batch` and stack them into one tensor on get_device()."""
        tokens = torch.stack(
            [self.tokenize(sentence, start_token, end_token) for sentence in batch]
        )
        return tokens.to(get_device())

    def forward(self, x, start_token:bool, end_token:bool):
        """
        :param x: batch (sequence) of raw sentences
        :param start_token: whether to prepend the start token to each sentence
        :param end_token: whether to append the end token to each sentence
        :return: token embeddings plus positional encoding, with dropout applied
        """
        x = self.batch_tokens(x, start_token, end_token)
        x = self.token_embedding(x)
        pos = self.positional_encoding().to(get_device())
        x = self.dropout(x + pos)
        return x

- **Transformer Block** includes: **Multi-Head Attention**, **Add & Norm**, **Feed & Forward** and **Dropout**

2 options: 'Encoder' and 'Decoder'

In [38]:
class TransformerBlock(nn.Module):
    """One encoder or decoder layer: attention sub-layers plus feed-forward, each followed by Add & Norm."""

    def __init__(self,
                 d_model=512,
                 num_heads=8,
                 ff_hidden=300,
                 dropout=0.1,
                 options='encoder'
                ):
        """
        The Transformer Block used in the encoder and decoder as well

        :param d_model: the embedding dimension
        :param num_heads: the number of heads
        :param ff_hidden: the hidden (inner) dimension of the feed forward layer
        :param dropout: probability dropout (between 0 and 1)
        :param options: The choice between 'encoder' and 'decoder'
        :raises ValueError: if options is neither 'encoder' nor 'decoder'
        """
        super(TransformerBlock, self).__init__()

        # Validate before building any layer so a typo fails fast and cheaply.
        # ValueError is a subclass of Exception, so callers catching Exception still work.
        if options not in ('encoder', 'decoder'):
            raise ValueError(f"Unknown option {options}")
        self.options = options

        # For both 2 options: encoder and decoder
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm_for_attention = LayerNormalization(parameters_shape=[d_model])
        self.dropout_attention = nn.Dropout(dropout)

        self.ff = PositionwiseFeedForward(d_model=d_model, hidden=ff_hidden, drop_prob=dropout)
        self.norm_for_ff = LayerNormalization(parameters_shape=[d_model])
        self.dropout_for_ff = nn.Dropout(dropout)

        # For decoder
        if self.options == 'decoder':
            self.cross_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, cross=True)
            self.norm_for_cross_attention = LayerNormalization(parameters_shape=[d_model])
            self.dropout2 = nn.Dropout(dropout)

    @staticmethod
    def _add_and_norm(sublayer_output, residual, dropout, norm):
        """Apply dropout to the sub-layer output, add the residual and normalize."""
        return norm(dropout(sublayer_output) + residual)

    def forward(self, x, mask=None):
        """
        :param x: encoder: a tensor; decoder: a dict with 'encoder_output' and 'w'
                  (the decoder stream)
        :param mask: encoder: an optional mask tensor; decoder: an optional dict with
                     'self_attention_mask' and 'cross_attention_mask' (None or a missing
                     key means no mask)
        :return: tensor with the same shape as the input stream
        """
        # No clone() is needed for the residuals: no sub-layer modifies its
        # input in place, so the original tensor stays intact.
        if self.options == 'decoder':
            masks = mask if mask is not None else {}
            encoder_output = x['encoder_output']
            w = x['w']

            w = self._add_and_norm(
                self.attention(w, masks.get('self_attention_mask')),
                w, self.dropout_attention, self.norm_for_attention)

            w = self._add_and_norm(
                self.cross_attention({'encoder_output': encoder_output, 'w': w},
                                     masks.get('cross_attention_mask')),
                w, self.dropout2, self.norm_for_cross_attention)
        else:
            # For encoder
            w = self._add_and_norm(
                self.attention(x, mask),
                x, self.dropout_attention, self.norm_for_attention)

        return self._add_and_norm(self.ff(w), w, self.dropout_for_ff, self.norm_for_ff)

In [39]:
# Test
# Default arguments build an encoder block with d_model=512, matching the input's last dimension.
trans_block = TransformerBlock()
trans_block(torch.randn(1,10,512))

tensor([[[-6.7121e-01, -7.4082e-01, -7.5190e-01,  ...,  2.9882e-01,
           5.3573e-01, -8.3785e-01],
         [-1.4255e-01, -1.3426e+00,  4.7916e-01,  ...,  8.8798e-01,
          -1.3014e+00,  2.9261e-03],
         [-1.4443e+00, -9.3250e-01,  1.2280e+00,  ..., -1.1358e+00,
          -2.5559e-01, -1.8362e+00],
         ...,
         [ 1.1481e+00,  8.5070e-02, -5.9660e-01,  ...,  1.5729e+00,
          -7.7696e-02,  8.8113e-01],
         [ 9.1666e-01,  5.7564e-01, -2.0591e+00,  ...,  1.4346e+00,
           4.7820e-01, -8.5344e-01],
         [-4.2563e-01,  3.1921e+00, -1.1351e+00,  ...,  8.1564e-01,
          -1.2791e+00,  6.5261e-01]]], grad_fn=<AddBackward0>)

In [40]:
# A decoder block takes two dicts: the inputs ('encoder_output' and 'w')
# and the masks ('self_attention_mask' and 'cross_attention_mask').
trans_block = TransformerBlock(options='decoder')
trans_block({'encoder_output':torch.randn(1,10,d_model),'w':torch.randn(1,10,d_model)},
    {'self_attention_mask': None, 'cross_attention_mask': None}
)

tensor([[[-0.6512,  0.5878,  0.1309,  ...,  0.6954,  0.0358, -0.2004],
         [ 1.1554, -1.0452, -0.0026,  ...,  0.1563, -0.9811, -0.5007],
         [ 0.4417,  1.1563, -1.4752,  ..., -1.5348,  0.6136, -0.1436],
         ...,
         [-0.9515, -0.9044,  0.1165,  ..., -0.1490,  0.5223,  0.7547],
         [-0.4491, -0.6069,  1.8417,  ...,  0.5312, -1.4932, -0.9565],
         [ 0.4803,  1.2127, -0.4004,  ...,  0.2136,  0.6329, -0.1688]]],
       grad_fn=<AddBackward0>)

- **Encoder** includes Input Pre-processing (**Token Embedding** & **Positional Encoding**) and N **Transformer Blocks** (the Encoder block in the picture at the top)

In [41]:
class Encoder(nn.Module):
    """Encoder stack: input pre-processing followed by `num_blocks` encoder blocks."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        The Encoder part of the Transformer architecture

        :param d_model: the embedding dimension
        :param ff_hidden: hidden dimension of the feed forward layers
        :param num_heads: number of attention heads
        :param dropout: dropout probability
        :param num_blocks: number of stacked transformer blocks
        :param max_length_seq: length every input sentence is padded to
        :param language_to_index: mapping from token to index for the input language
        :param start_token: start-of-sentence token
        :param end_token: end-of-sentence token
        :param pad_token: padding token
        """
        super(Encoder, self).__init__()

        # Tokenization + token embedding + positional encoding
        self.input_preprocessing = Preprocessing(
            max_length_seq=max_length_seq,
            d_model=d_model,
            language_to_index=language_to_index,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token,
            dropout=dropout,
        )

        # Build one block, then stack independent copies of it
        encoder_block = TransformerBlock(
            d_model=d_model,
            num_heads=num_heads,
            ff_hidden=ff_hidden,
            dropout=dropout,
            options="encoder",
        )
        self.transformer_blocks = replicate(encoder_block, N=num_blocks)

    def forward(self, x, self_attention_mask, start_token:bool, end_token:bool):
        """
        :param x: batch of raw input sentences
        :param self_attention_mask: mask handed to every block's self-attention (or None)
        :param start_token: whether to prepend the start token to each sentence
        :param end_token: whether to append the end token to each sentence
        :return: output of the last transformer block
        """
        hidden = self.input_preprocessing(x, start_token, end_token)
        for layer in self.transformer_blocks:
            hidden = layer(hidden, self_attention_mask)
        return hidden

In [44]:
# Test
# Character-level vocabulary: lowercase letters and space, followed by the three special tokens.
special_tokens = ["<start>", "<end>", "<pad>"]
alphabet = list("abcdefghijklmnopqrstuvwxyz ") + special_tokens
language_to_index = {word: idx for idx, word in enumerate(alphabet)}
encoder = Encoder(d_model=100, ff_hidden=50, num_heads=2, dropout=0.1 ,num_blocks=1, max_length_seq=100, language_to_index=language_to_index, start_token="<start>", end_token="<end>", pad_token="<pad>")
# Preprocessing puts the token tensors on get_device(), so the weights must be moved there as well.
encoder.to(get_device())
batch = ['hello','goodbye']
# No attention mask and no start/end tokens; each sentence is padded to max_length_seq.
encoder(batch, None, False, False)

tensor([[[-0.6574,  0.9927,  0.3610,  ...,  0.8072, -0.1423,  0.5075],
         [-0.3589, -0.4645, -0.4618,  ..., -1.4731, -0.8579, -0.5986],
         [-0.5117, -0.9965,  0.6796,  ..., -0.4416, -2.3177,  0.0577],
         ...,
         [ 0.3102, -2.5761,  0.1540,  ...,  0.3245,  0.1516,  1.6019],
         [-0.6325, -2.8122,  0.6494,  ...,  0.9408,  0.3387, -0.1487],
         [-0.5644, -1.7319,  1.4319,  ...,  1.2308,  0.4343,  0.3827]],

        [[-0.9519,  0.0186, -0.8765,  ...,  0.9950, -0.4711, -0.3555],
         [-0.1444, -0.9346,  0.5741,  ...,  0.5381, -0.7798, -0.9178],
         [ 0.6244, -1.1338,  0.9847,  ...,  1.0216, -0.6965, -0.0603],
         ...,
         [ 0.0494, -2.3624, -0.3773,  ..., -0.0252,  0.5539,  1.3958],
         [-0.9273, -2.5355,  0.8928,  ...,  0.7336,  0.5160,  1.4160],
         [-0.9643, -1.7177,  1.7249,  ...,  0.7898,  0.3357,  0.3674]]],
       device='cuda:0', grad_fn=<AddBackward0>)

- **Decoder** includes **Output Pre-processing** (**Token Embedding** & **Positional Encoding**), N **Transformer Block**

In [45]:
class Decoder(nn.Module):
    """Decoder stack: output pre-processing followed by `num_blocks` decoder blocks."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        The Decoder part of the Transformer architecture

        :param d_model: the embedding dimension
        :param ff_hidden: hidden dimension of the feed forward layers
        :param num_heads: number of attention heads
        :param dropout: dropout probability
        :param num_blocks: number of stacked transformer blocks
        :param max_length_seq: length every sentence is padded to
        :param language_to_index: mapping from token to index for the decoder's language
        :param start_token: start-of-sentence token
        :param end_token: end-of-sentence token
        :param pad_token: padding token
        """
        super(Decoder, self).__init__()

        # Tokenization + token embedding + positional encoding
        self.output_preprocessing = Preprocessing(
            max_length_seq=max_length_seq,
            d_model=d_model,
            language_to_index=language_to_index,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token,
            dropout=dropout,
        )

        # Build one block, then stack independent copies of it
        decoder_block = TransformerBlock(
            d_model=d_model,
            num_heads=num_heads,
            ff_hidden=ff_hidden,
            dropout=dropout,
            options="decoder",
        )
        self.transformer_blocks = replicate(decoder_block, N=num_blocks)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token:bool, end_token:bool):
        """
        :param x: batch of raw sentences fed to the decoder
        :param y: the encoder's output
        :param self_attention_mask: mask for the decoder self-attention (or None)
        :param cross_attention_mask: mask for the cross-attention (or None)
        :param start_token: whether to prepend the start token to each sentence
        :param end_token: whether to append the end token to each sentence
        :return: output of the last transformer block
        """
        decoded = self.output_preprocessing(x, start_token, end_token)

        masks = {
            'self_attention_mask': self_attention_mask,
            'cross_attention_mask': cross_attention_mask,
        }
        inputs = {'encoder_output': y, 'w': decoded}
        for layer in self.transformer_blocks:
            # Each block sees the same encoder output and the previous block's result.
            inputs['w'] = decoded
            decoded = layer(inputs, masks)
        return decoded

## Finally, The Transformer Architecture is complete!

In [21]:
class Transformer(nn.Module):
    """Full encoder-decoder Transformer that maps a batch of source sentences to target-vocabulary logits."""

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 target_language_to_index,
                 start_token,
                 end_token,
                 pad_token
                ):
        """
        :param d_model: the embedding dimension
        :param ff_hidden: hidden dimension of the feed forward layers
        :param num_heads: number of attention heads
        :param dropout: dropout probability
        :param num_blocks: number of blocks in the encoder and in the decoder
        :param max_length_seq: length every sentence is padded to
        :param language_to_index: token-to-index mapping of the source language
        :param target_language_to_index: token-to-index mapping of the target language
        :param start_token: start-of-sentence token
        :param end_token: end-of-sentence token
        :param pad_token: padding token
        """
        super().__init__()

        # Device
        # NOTE: the model is not moved here. Preprocessing places token tensors on
        # get_device(), so callers should move the model there too
        # (e.g. model.to(get_device())).
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        # Encoder
        self.encoder = Encoder(
            d_model=d_model,
            ff_hidden=ff_hidden,
            num_heads=num_heads,
            dropout=dropout,
            num_blocks=num_blocks,
            max_length_seq=max_length_seq,
            language_to_index=language_to_index,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token
        )

        # Decoder
        self.decoder = Decoder(
            d_model=d_model,
            ff_hidden=ff_hidden,
            num_heads=num_heads,
            dropout=dropout,
            num_blocks=num_blocks,
            max_length_seq=max_length_seq,
            language_to_index=target_language_to_index,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token
        )

        # Linear Layer: projects decoder features onto the target vocabulary.
        # (The class is nn.Linear; nn.linear does not exist.)
        self.linear = nn.Linear(d_model, len(target_language_to_index))

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False,
                dec_end_token=False):
        """
        Run the encoder on the source batch, the decoder on the target batch, and
        project the result onto the target vocabulary.

        :param x: batch of source sentences
        :param y: batch of target sentences
        :param encoder_self_attention_mask: mask for the encoder self-attention
        :param decoder_self_attention_mask: mask for the decoder self-attention
        :param decoder_cross_attention_mask: mask for the decoder cross-attention
        :param enc_start_token: prepend the start token to the source sentences
        :param enc_end_token: append the end token to the source sentences
        :param dec_start_token: prepend the start token to the target sentences
        :param dec_end_token: append the end token to the target sentences
        :return: unnormalized scores (logits) with one entry per target-vocabulary
                 token in the last dimension. No softmax is applied here.
        """
        encoder_output = self.encoder(
            x, encoder_self_attention_mask, enc_start_token, enc_end_token
        )
        # Decoder.forward takes the target batch first and the encoder output second.
        decoded = self.decoder(
            y,
            encoder_output,
            decoder_self_attention_mask,
            decoder_cross_attention_mask,
            dec_start_token,
            dec_end_token,
        )
        return self.linear(decoded)

In [22]:
# Test
# NOTE(review): this cell looks stale. The keyword arguments passed below
# (vocab_size, target_vocab_size, expansion_factor) are not parameters of the
# Transformer class defined in this notebook, which instead expects ff_hidden,
# dropout, language_to_index, target_language_to_index and the special tokens.
# src/target are also index tensors, while Preprocessing.tokenize looks up the
# elements of each sentence in language_to_index. Update before re-running.
src_vocab_size = 11
target_vocab_size = 11
num_blocks = 6
seq_len = 12

# let 0 be sos token and 1 be eos token
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1],
                    [0, 2, 8, 7, 3, 4, 5, 6, 7, 2, 10, 1]])
target = torch.tensor([[0, 1, 7, 4, 3, 5, 9, 2, 8, 10, 9, 1],
                       [0, 1, 5, 6, 2, 4, 7, 6, 2, 8, 10, 1]])

print(src.shape, target.shape)
model = Transformer(d_model=512,
                    vocab_size=src_vocab_size,
                    target_vocab_size=target_vocab_size,
                    max_length_seq=seq_len,
                    num_blocks=num_blocks,
                    expansion_factor=4,
                    num_heads=8
                   )

print(model)
out = model(src, target)
print(f"Output Shape: {out.shape}")

torch.Size([2, 12]) torch.Size([2, 12])
Transformer(
  (encoder): Encoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (token_emb): TokenEmbedding(
      (embedding_layer): Embedding(11, 512)
    )
    (pos_encode): PositionalEncoding(
      (dropout): Dropout(p=0, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0-5): 6 x TransformerBlock(
        (multihead_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=False)
          (key): Linear(in_features=64, out_features=64, bias=False)
          (value): Linear(in_features=64, out_features=64, bias=False)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        