In [3]:
# In this notebook, you learn:
#
# 1) How all the building blocks of the transformer fit together to make a machine translation model?
# 2) Verfiy that the number of parameters within the built model is the same as number of parameters
#    expected within out model architecture ie.,
#           --> assert parameter_count (built_model) == parameter_count (count_manually). 
#
# A LOT OF THE CODE IN THIS NOTEBOOK IS COPIED FROM THE PREVIOUS NOTEBOOKS. I WILL NOT BE EXPLAINING 
# THOSE PARTS IN THIS NOTEBOOK AGAIN.

In [None]:
# Resources to help understand this notebook:
#
# 1) https://nlp.seas.harvard.edu/annotated-transformer/
#       --  The very best resource to understand the transformer code.
# 2) http://jalammar.github.io/illustrated-transformer/
#       -- Another great resource to understand the architecture of the transformer model.
# 3) https://www.youtube.com/watch?v=8krd5qKVw-Q
#       -- Gives an intuitive explanation of Xavier Initialization.
# 4) https://www.deeplearning.ai/ai-notes/initialization/index.html
#       -- Gives an in-depth mathematical explanation of Xavier Initialization.

In [4]:
from torch import nn, Tensor
from typing import Optional, Tuple

import copy
import math
import torch

<img src="../../Data/Images/TranslationTransformerModel.png" alt="Translation Transformer Model" width="600" height="500">

In [None]:
# credits: The above image is taken from this blog post: https://jalammar.github.io/illustrated-transformer/

In [5]:
# The transformer model is a stack of N encoder-decoder layers.
# In our implementation, each of the ENCODER boxes shown in the above image is referred to as an 
# EncoderLayer and each of the DECODER boxes is referred to as a DecoderLayer. The stack of 6 ENCODER 
# boxes (EncoderLayers) is referred to as an Encoder and the stack of 6 DECODER boxes (DecoderLayers) 
# is referred to as a Decoder. The entire model is referred to as a Transformer.
#
# The bottom EncoderLayer receives the tokenized src sentence embeddings and the bottom DecoderLayer 
# receives the tokenized tgt sentence embeddings. The sentences are tokenized in the Data Preparation
# phase which is not part of the Transformer model below. The tokens are converted into embeddings, 
# aggregated with the positional encodings and passed to the bottom EncoderLayer and bottom DecoderLayer 
# as inputs.
#
# The output of the ENCODER (the last EncoderLayer) is passed to each of the DecoderLayers for src 
# attention calculation. The output of the DECODER (the last DecoderLayer) is passed to a linear 
# layer which projects the Decoder output to the vocab space (target vocab space). This output is then 
# passed to the softmax layer to get the token output probabilities which is the output of the 
# Transformer model as a whole.

<img src="../../Data/Images/EncoderDecoder.png" alt="EncoderDecoder" width="550" height="450">

In [None]:
# credits: The above image is taken from this blog post: https://jalammar.github.io/illustrated-transformer/

In [6]:
# Constants to be used in the model.
# Size of the embedding vectors in the model. This is 512 in the transformer paper.
d_model = 8
# Number of layers in the Encoder in the Encoder and Decoder. The transformer paper also uses 6. The 
# number of layers could be different in Encoder and Decoder but we use the same value for both.
num_layers = 6
# Probability with which to drop data in the transformer model. We will use the same dropout_prob
# throughout the model.
dropout_prob = 0.1
# Number of attention heads in each of the multi-head attention layers in the model. The transformer
# paper uses 8.
num_heads = 8
# Number of neurons in the hidden layer (that expands the input) in the feed forward neural network.
# The transformer paper uses 2048.
d_feed_forward = 16
# Number of sentences in each batch of data.
batch_size = 2
# Number of tokens in the src vocab. This is the number of unique words in the src language (English).
src_vocab_size = 6
# Number of tokens in the tgt vocab. This is the number of unique words in the tgt language (Telugu).
tgt_vocab_size = 6
# Number of tokens in each of the sentences in the batch. We will pad the sentences to make them all
# of the same length. This is the length of the longest sentence in the batch.
seq_len = 4
# Maximum number of tokens in the sentences among all the batches in the entire dataset.
max_seq_len = 10

In [7]:
# EVERYTHING IN THIS CELL HAS BEEN EXPLAINED IN DETAIL IN THE PREVIOUS NOTEBOOKS. PLEASE REFER TO THE EARLIER
# NOTEBOOKS TO UNDERSTAND THE CODE IN THIS CELL. YOU CAN SKIP (JUST RUN IT BLINDLY) THIS CELL AND MOVE TO THE 
# NEXT CELL DIRECTLY. 

# Refer to 'step_8_word_embeddings.ipynb' notebook to learn more about the Embeddings class.
class Embeddings(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        """Creates the embedding layer that serves as a look-up table for the tokens in the transformer model.

        Args:
            vocab_size (int): Size of the vocabulary i.e., number of distinct tokens in the vocabulary.
            embedding_dim (int): The size of the embedding vector to be generated for each token.
        """
        super(Embeddings, self).__init__()
        self.look_up_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

    # The input is be a '2D' tensor where each '1D' tensor within the '2D' tensor is the list
    # of indices corresponding to the tokens in the vocab.
    # [[0, 123, 3455, 4556, 7, 1, 2, 2], [0, 56, 98, 6234, 909, 56, 1, 2]]
    # 0 - <SOS>, 1 - <eos>, 2 - <pad>
    def forward(self, input: Tensor) -> Tensor:
        """Converts the input tensor of token indices to their corresponding embedding vectors.

        Args:
            input (Tensor): The input tensor of token indices.
                            shape: [batch_size, seq_len]

        Returns:
            Tensor: The tensor of embedding vectors for the corresponding input tokens.
                    shape: [batch_size, seq_len, embedding_dim]
        """
        # There is no reasoning as to why the original 'attention_is_all_you_need' paper scaled the
        # embeddings using 'math.sqrt(embedding_dim)'. A few blogs attempted to explain this reasoning,
        # but I haven't found anything with solid reasoning.
        return self.look_up_table(input) * math.sqrt(self.embedding_dim)


# Refer to 'step_10_positional_encoding.ipynb' notebook to learn more about the PositionalEncoding class.
class PositionalEncoding(nn.Module):
    # d_model is the same as encoding_size.
    def __init__(self, encoding_size: int, dropout_prob: float, max_len: int = 5000):
        """Creates the positional encodings.

        Args:
            encoding_size (int): Size of the positional encoding vector that represents the position of the token.
            dropout_prob (float): Probability of an element to be zeroed or dropped.
            max_len (int, optional): Largest position for which the positional encoding vector is generated. Defaults to 5000.
                                     By default, it generates positional encodings for the first 5000 positions.
        """
        super().__init__()
        # Refer to step_8_drop_out.ipynb notebook (link to the notebook) to understand more about dropout.
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)
        # Compute the positional encodings in log space.
        positional_encoding = torch.zeros(size=(max_len, encoding_size), dtype=torch.float)
        positional_encoding_numerators = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        numerators_in_exponent = torch.arange(0, encoding_size, 2, dtype=torch.float)
        positional_encoding_denominators = torch.exp(numerators_in_exponent * (-math.log(10000.0) / encoding_size))
        positional_encoding[:, 0::2] = torch.sin(positional_encoding_numerators * positional_encoding_denominators)
        positional_encoding[:, 1::2] = torch.cos(positional_encoding_numerators * positional_encoding_denominators)
        # Refer to understanding_tensor_manipulations_part_1.ipynb notebook (link to the notebook) to
        # understand more about unsqueeze operation in pytorch.
        # In transformer model, we receive 3D tensors as input to this module. Each 1D tensor
        # in the last dimension is an embedding for the token. Each 2D tensor is a sentence.
        # The entire 3D tensor is a batch of sentences. To work with 3D tensors in the forward
        # method, we convert the positional encoding to a 3D tensor.
        positional_encoding = positional_encoding.unsqueeze(0)
        # Refer to using_modules.ipynb (link to the notebook) to understand more about buffers in pytorch.
        # This tells the module to not update the positional encoding tensor during the training. It is 
        # not a trainable parameter but it is still part of the state of the model.
        self.register_buffer('positional_encoding', positional_encoding)
    
    def forward(self, input: Tensor) -> Tensor:
        """Adds the positional encodings to the input tensor.
        Args:
            input (Tensor): The input tensor containing the embeddings of the tokens.
                            shape: [batch_size, sentence_length, d_model]

        Returns:
            Tensor: Input with the positional encodings added to it.
                    shape: [batch_size, sentence_length, d_model]
                    d_model is the same as encoding_size.
        """
        # Refer to understanding_tensor_manipulations_part_5.ipynb notebook (link to the notebook) to 
        # understand more about broadcasting in python.
        # The input tensor is a 3D tensor of shape (batch_size, sentence_length, encoding_size).
        # We add (uses broadcasting) the positional encoding to the input tensor to get the final tensor.
        # positional_encoding: (1, max_len, encoding_size) --> (1, sentence_length, encoding_size) 
        #       -- Extracts the positional encodings for the sentence_length from the positional_encoding 
        #          tensor.
        # (batch_size, sentence_length, encoding_size) --> input
        # (batch_size, sentence_length, encoding_size) --> Resultant tensor shape after broadcasting.
        # requires_grad_(False) is not needed since the positional encoding is already registered
        # as a Buffer and not a trainable parameter. It is just included for clarity.
        input = input + self.positional_encoding[:, :input.size(1)].requires_grad_(False)
        return self.dropout(input)
    

# Creates a copy (deepcopy) of the module and returns ModuleList containing the copies.
def clone_module(module: nn.Module, num_clones: int) -> nn.ModuleList:
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num_clones)])


# Refer to 'step_11_multi_headed_attention.ipynb' notebook to understand how this function works.
def construct_attention_heads(queries: Tensor, keys: Tensor, values: Tensor, mask: Optional[Tensor]=None, dropout_layer: Optional[nn.Module]=None) -> Tuple[Tensor, Tensor]:
    """Calculates the attention scores for each token in the sequence with every other token in the sequence.
       Applues the mask if provided and then normalizes the scores using softmax. It then calculates the 
       attention heads for each token in the sequence.

    Args:
        queries (Tensor): [batch_size, num_heads, seq_len, d_k]
        keys (Tensor): [batch_size, num_heads, seq_len, d_k]
        values (Tensor): [batch_size, num_heads, seq_len, d_k]
        mask (Optional[Tensor], optional): [batch_size, 1, seq_len, seq_len]. Defaults to None.
        dropout_layer (Optional[nn.Module], optional): probability with which the values are dropped on dropout layer. Defaults to None.

    Returns:
        Tuple[Tensor, Tensor]: Returns the attention heads and the attention scores.
                               attention_heads: [batch_size, num_heads, seq_len, d_k]
                               attention_scores: [batch_size, num_heads, seq_len, seq_len]
    """
    # Size of the vectors for each token for each head in the sequence.
    d_k = queries.shape[-1]
    # Calculate the attention scores for each token in the sequence with every other token in the sequence.
    attention_scores = torch.matmul(queries, keys.transpose(dim0=2, dim1=3)) / math.sqrt(d_k)
    # Mask the attention scores if a mask is provided. Mask is used in two different ways:
    # 1) To prevent the model from attending to the padding tokens --> This applies for both src and tgt sentences.
    # 2) To prevent the model from attending to the future tokens in the sequence --> This applies only for tgt sentences.
    if mask is not None:
        # Please do not set the masked values to float('-inf') as it sometimes (not in everycase) causes softmax to return nan.
        attention_scores = attention_scores.masked_fill(mask == False, float('-1e9'))
    # Normalize the attention scores using softmax.
    attention_scores = attention_scores.softmax(dim=-1)
    # Apply dropout regularization to prevent overfitting problems.
    if dropout_layer is not None:
        dropout_layer(attention_scores)
    # Calculate the attention heads for each token in the sequence. The head for each token is calculated by
    # taking the weighted average (averaged by attention scores) of the values for all the tokens in the 
    # sequence for the token of interest.
    attention_heads = torch.matmul(attention_scores, values)
    return attention_heads, attention_scores


# Refer to 'step_11_multi_headed_attention.ipynb' notebook to understand how this class works.
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads: int, d_model: int, dropout_prob: float = 0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads."
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        # We use dropout to prevent overfitting.
        self.dropout_layer = nn.Dropout(p=dropout_prob)
        # Creating the linear layers that generate queries, keys and values for each token in the sequence.
        # Also, creating an additional linear layer to generate the output of the Multi-Headed Attention from concatenated attention heads.
        self.linear_layers = clone_module(module=nn.Linear(in_features=d_model, out_features=d_model), num_clones=4)


    def forward(self, query_input: Tensor, key_input: Tensor, value_input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Multi-Headed Attention layer. 

        Args:
            query (Tensor): Input to be used for query creation.
                            query_input: [batch_size, seq_len, d_model]
            key (Tensor): Input to be used for key creation.
                          key_input  : [batch_size, seq_len, d_model]
            value (Tensor): Input to be used for value creation.
                            value_input: [batch_size, seq_len, d_model]
            mask (Tensor): Mask to be applied to the attention scores. Default is None. Same mask will 
                           be applied to all the heads in the Multi-Headed Attention layer.
                           mask: [batch_size, 1, seq_len, seq_len]

        Returns:
            Mutli-Headed Attention Output: Output of the Multi-Headed Attention layer. Generates one output vector 
                                           for each token in the sequence. Does this for each sequence in the batch.
                                           output: [batch_size, seq_len, d_model]
        """
        # Generates the queries, keys and values for each token in the sequence.
        # shape of queries, keys, values: [batch_size, seq_len, d_model]
        queries, keys, values = [linear_layer(input) for linear_layer, input in zip(self.linear_layers, (query_input, key_input, value_input))]
        batch_size = query_input.shape[0]
        seq_len = query_input.shape[1]
        # Separating the queries, keys and values for each head into a separate vector. The vectors for each token in all the heads
        # are concatenated when they are created using the linear_layers above.
        # Shape for queries, keys, values after view: [batch_size, seq_len, num_heads, d_k]
        # Shape for queries, key, values after transpose: [batch_size, num_heads, seq_len, d_k]
        queries, keys, values = [data.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(dim0=1, dim1=2) for data in (queries, keys, values)]
        # Calculate the attention heads for each token in the sequence.
        # attention_heads: [batch_size, num_heads, seq_len, d_k]
        attention_heads, attention_scores = construct_attention_heads(queries=queries, keys=keys, values=values, mask=mask, dropout_layer=self.dropout_layer)
        # Concatenate the attention heads for each token from all the heads.
        # attention_heads: [batch_size, seq_len, d_model]
        attention_heads = attention_heads.transpose(dim0=1, dim1=2).reshape(batch_size, seq_len, self.d_model)
        # Generate the output of the Multi-Headed Attention layer.
        return self.linear_layers[-1](attention_heads)
    

# Refer to 'step_12_feed_forward_neural_network.ipynb' notebook to understand how this class works.
class FeedForwardNN(nn.Module):
    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float = 0.1):
        super().__init__()
        self.linear_layer_1 = nn.Linear(in_features=d_model, out_features=d_feed_forward)
        self.linear_layer_2 = nn.Linear(in_features=d_feed_forward, out_features=d_model)
        self.dropout_layer = nn.Dropout(p=dropout_prob)

    def forward(self, input: Tensor) -> Tensor:
        """Passes the input through the Feed Forward Neural Network and returns the output 
           of the neural network.

        Args:
            input (Tensor): The output of the Multi-Headed Attention layer.
                            shape: [batch_size, seq_len, d_model]

        Returns:
            Tensor: The output of the Feed Forward Neural Network.
                    shape: [batch_size, seq_len, d_model]
        """
        # We first expand the input to higher dimension. We apply the ReLU activation function in this layer.
        intermediate_output = self.linear_layer_1(input).relu()
        # Dropout layer to prevent overfitting
        intermediate_output = self.dropout_layer(intermediate_output)
        # We then compress the input back to its original dimension. There is no specific intuitive explanation 
        # as to why this is done. It is just shown to be working practically in neural networks in general and 
        # in this paper in particular.
        return self.linear_layer_2(intermediate_output)
    

# Refer to 'step_14_encoder.ipynb' to understand how this class works.
class SubLayerWrapper(nn.Module):
    def __init__(self, d_model: int, dropout_prob: float):
        """This class is a wrapper around the MultiHeadedAttention and PositionwiseFeedForward classes.

        Args:
            d_model (int): Dimension of the vectors used in the Attention model.
            dropout_prob (float): probability with which nodes can be dropped.
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input: Tensor, sublayer: nn.Module) -> Tensor:
        """It applies the operation on the input, applies dropout, adds the input back to the transformed 
           input, does normalization and returns the output.

        Args:
            input (Tensor): Input to be transformer by the sublayer.
                            input: [batch_size, seq_len, d_model]
            sublayer (nn.Module): sublayer could be either MultiHeadedAttention or PositionwiseFeedForward.
            
        Returns:
            Tensor: Output of the sublayer transformation.
                    output: [batch_size, seq_len, d_model]
        """
        return self.layer_norm(input + self.dropout(sublayer(input)))


# Refer to 'step_14_encoder.ipynb' to understand how this class works.
class EncoderLayer(nn.Module):
    def __init__(self, self_attention: MultiHeadedAttention, feed_forward: FeedForwardNN, d_model: int, dropout_prob: float):
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # These modules are now the child modules of the EncoderLayer and will be registered as parameters of the EncoderLayer.
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        self.sublayer_wrappers = clone_module(module=SubLayerWrapper(d_model=self.d_model, dropout_prob=self.dropout_prob), num_clones=2)

    def forward(self, input: Tensor, mask: Tensor) -> Tensor:
        """This method is the forward pass of the EncoderLayer class.

        Args:
            input (Tensor): Source sentence provided as input to the EncoderLayer. These are the embeddings of the source 
                            sentence for the first EncoderLayer.
                            SHAPE: [batch_size, seq_len, d_model]
            mask (Tensor): Boolean mask to be applied to the input during attention scores calculation.
                           SHAPE: [batch_size, 1, seq_len, seq_len]
        Returns:
            Tensor: Output of the EncoderLayer.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # We are just saving the function call to the self_attention method in a variable and passing the
        # lambda function (contained within the variable) to the sublayer_wrappers[0] to execute it when 
        # needed.
        output = self.sublayer_wrappers[0](input, lambda input: self.self_attention(query_input=input, key_input=input, value_input=input, mask=mask))
        return self.sublayer_wrappers[1](output, self.feed_forward)


# Refer to 'step_14_encoder.ipynb' to understand how this class works.
class Encoder(nn.Module):
    def __init__(self, encoder_layer: EncoderLayer, num_layers: int):
        super().__init__()
        self.encoder_layers = clone_module(module=encoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(encoder_layer.d_model)

    def forward(self, input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """This method is the forward pass of the Encoder class. The output of the current EncoderLayer is
           passed as input to the next EncoderLayer. We have 6 identical EncoderLayers stacked on top of 
           each other. The output of the last EncoderLayer is passed through a Layer Normalization layer
           and returned as the final output of the Encoder

        Args:
            input (Tensor): Input to the Encoder i.e., embeddings of the tokenized src sequences.
                            input: [batch_size, seq_len, d_model]
            mask (Optional[Tensor], optional): Boolean mask to be applied during attention scores calculation.
                                               mask: [batch_size, 1, seq_len, seq_len]. Defaults to None.
                            
        Returns:
            Tensor: Output of the Encoder i.e., encoded src sentences.
                    output: [batch_size, seq_len, d_model]
        """
        output = input
        for encoder_layer in self.encoder_layers:
            # Pass the output of the previous EncoderLayer to the current EncoderLayer.
            output = encoder_layer(input=output, mask=mask)
        return self.layer_norm(output)


# Refer to 'step_15_decoder.ipynb' to understand how this class works.
class DecoderLayer(nn.Module):
    def __init__(self, self_attention: MultiHeadedAttention, src_attention: MultiHeadedAttention, feed_forward: FeedForwardNN, d_model: int, dropout_prob: float):
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # These modules are now the child modules of the DecoderLayer and will be registered as parameters of the DecoderLayer.
        self.self_attention = self_attention
        self.src_attention = src_attention
        self.feed_forward = feed_forward
        self.sublayer_wrappers = clone_module(module=SubLayerWrapper(d_model=d_model, dropout_prob=dropout_prob), num_clones=3)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """This method is the forward pass of the DecoderLayer class.

        Args:
            input (Tensor): Target sentence provided as input to the DecoderLayer. These are the embeddings of the target 
                            sentence for the first DecoderLayer.
                            SHAPE: [batch_size, seq_len, d_model]
            encoded_src (Tensor): Encoded source sentence. This is the output of the Encoder. This is used to calculate the
                                  source attention scores for the target sentence. 
                                  SHAPE: [batch_size, seq_len, d_model] 
            tgt_mask (Tensor): Mask to prevent the future tokens in the target sentence to attend to the previous tokens and
                               also to prevent padding tokens from attending to any other token except other padding tokens.
                               SHAPE: [batch_size, 1, seq_len, seq_len]
            src_mask (Optional[Tensor], optional): Mask to prevent the the padding tokens to attend to the tokens in the tgt sentence. 
                                                   Defaults to None.
                                                   SHAPE: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Returns the output of the DecoderLayer. This is the output of the Positionwise FeedForward Neural Network.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # First sublayer: Self-Attention on the target sentence. Hence, it uses the tgt_mask.
        self_attention_output = self.sublayer_wrappers[0](input=input, sublayer=lambda input: self.self_attention(query_input=input, key_input=input, value_input=input, mask=tgt_mask)) 
        # To give intuition about src_attention, I have a query for a token in the target sentence. I want to know whether 
        # some token in the source sentence is important for me to predict the output for the token in the target sentence. 
        # So, I go to the source sentence and get the values for all the tokens in the source sentence. I then calculate 
        # the attention scores between the query (in tgt) and the keys (in src). I then calculate the attention heads for 
        # the token in the target sentence using the attention scores. This is what is done in the below line. Note that 
        # referring to statement 'the keys and values are from the source' doesn't mean that you get keys and values 
        # explicitly. It means we use the encoded data from the source sentence to calculate the queries and keys for 
        # this transformation.
        # Second sublayer: Attention on the source sentence. Hence, it uses the src_mask.
        src_attention_output = self.sublayer_wrappers[1](input=self_attention_output, sublayer=lambda self_attention_output: self.src_attention(query_input=self_attention_output, key_input=encoded_src, value_input=encoded_src, mask=src_mask))
        # Third sublayer: Positionwise FeedForward Neural Network
        return self.sublayer_wrappers[2](input=src_attention_output, sublayer=self.feed_forward)


# Refer to 'step_15_decoder.ipynb' to understand how this class works.
class Decoder(nn.Module):
    def __init__(self, decoder_layer: DecoderLayer, num_layers: int):
        super().__init__()
        self.decoder_layers = clone_module(module=decoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(decoder_layer.d_model)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """This method is the forward pass of the Decoder class. The output of the current DecoderLayer is
           passed as input to the next DecoderLayer. We have 6 identical DecoderLayers stacked on top of 
           each other. The output of the Encoder (last EncoderLayer) is also passed as input to the 
           first DecoderLayer. The output of the last DecoderLayer is passed through a Layer Normalization 
           layer and returned as the final output of the Decoder.

        Args:
            input (Tensor): Input to the Decoder i.e., embeddings of the tokenized tgt sequences.
                            input: [batch_size, seq_len, d_model]
            encoded_src (Tensor): output of the encoder i.e., encoded src sequences.
            tgt_mask (Tensor): Boolean mask to be applied during self attention scores calculation.
                               tgt_mask: [batch_size, 1, seq_len, seq_len].
            src_mask (Optional[Tensor], optional): Boolean mask to be applied during src attention scores calculation.
                                                   tgt_mask: [batch_size, 1, seq_len, seq_len]. Defaults to None.

        Returns:
            Tensor: Output of the Decoder.
                    output: [batch_size, seq_len, d_model]
        """
        output = input
        for decoder_layer in self.decoder_layers:
            # Pass the output of the previous DecoderLayer to the current DecoderLayer.
            output = decoder_layer(input=output, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)
        return self.layer_norm(output)


# Refer to 'step_14_token_predictor.ipynb' to understand how this class works.
class TokenPredictor(nn.Module):
    def __init__(self, d_model: int, tgt_vocab_size: int):
        super(TokenPredictor, self).__init__()
        self.d_model = d_model
        self.vocab_size = tgt_vocab_size
        self.linear = nn.Linear(in_features=d_model, out_features=tgt_vocab_size)
        # The non-module variables are not added to the list of parameters of the model.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, decoder_output: Tensor) -> Tensor:
        """The forward pass of the output generator. Calculates the probability distribution over the 
           vocabulary. Each token vector has a corresponding probability distribution over the 
           vocabulary since we predict one token per output.

        Args:
            decoder_output (Tensor): Output of the Decoder.
                                     shape: [batch_size, seq_len, d_model]

        Returns:
            Tensor: Log probability distribution over the vocabulary. 
                    shape: [batch_size, seq_len, vocab_size]
        """
        # Project the decoder output to the vocab_size dimensional space.
        logits = self.linear(decoder_output)
        # Convert the logits to a probability distribution over the vocabulary. All the entires in the
        # output tensor are negative since we are using log softmax. The log softmax is used to make
        # the training more numerically stable. However, the maximum value is still the same as the 
        # maximum value of the original softmax output.
        return self.log_softmax(logits)

## The Transformer model which is the main learning part of this notebook.

In [8]:
# We will now create the Transformer model by logically combining all the components we have created so far.
class MachineTranslationModel(nn.Module):
    """Model that combines the Encoder, Decoder and the TokenPredictor to create a machine translation Transformer model."""

    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float, num_heads: int, src_vocab_size: int, tgt_vocab_size: int, num_layers: int, max_seq_len: int):
        """Initializes the Transformer model.

        Args:
            d_model (int): size of the embedding vectors in the model.
            d_feed_forward (int): Number of neurons in the hidden layer of the feed forward neural network.
            dropout_prob (float): probability with which to drop data in the transformer model.
            num_heads (int): number of attention heads in each of the multi-head attention layers in the model.
            src_vocab_size (int): size of the source vocabulary.
            tgt_vocab_size (int): size of the target vocabulary.
            num_layers (int): number of layers in the Encoder and Decoder.
            max_seq_len (int): Maximum length of the sequence that is ever input to the model.
        """
        super(MachineTranslationModel, self).__init__()
        self.src_embedding = Embeddings(vocab_size=src_vocab_size, embedding_dim=d_model)
        self.tgt_embedding = Embeddings(vocab_size=tgt_vocab_size, embedding_dim=d_model)
        self.positional_encoding = PositionalEncoding(encoding_size=d_model, dropout_prob=dropout_prob, max_len=max_seq_len)
        multi_headed_attention = MultiHeadedAttention(num_heads=num_heads, d_model=d_model, dropout_prob=dropout_prob)
        feed_forward_nn = FeedForwardNN(d_model=d_model, d_feed_forward=d_feed_forward, dropout_prob=dropout_prob)
        encoder_layer = EncoderLayer(self_attention=copy.deepcopy(multi_headed_attention), 
                                     feed_forward=copy.deepcopy(feed_forward_nn), 
                                     d_model=d_model, 
                                     dropout_prob=dropout_prob)
        decoder_layer = DecoderLayer(self_attention=copy.deepcopy(multi_headed_attention), 
                                     src_attention=copy.deepcopy(multi_headed_attention),
                                     feed_forward=copy.deepcopy(feed_forward_nn), 
                                     d_model=d_model, 
                                     dropout_prob=dropout_prob)
        self.encoder = Encoder(encoder_layer=encoder_layer, num_layers=num_layers)
        self.decoder = Decoder(decoder_layer=decoder_layer, num_layers=num_layers)
        self.token_predictor = TokenPredictor(d_model=d_model, tgt_vocab_size=tgt_vocab_size)
        self.initialize_model_parameters()

    def initialize_model_parameters(self):
        """Initializes the parameters of the model using the Xavier Uniform initialization."""
        for params in self.parameters():
            # This is to ensure the only the weights are initialized and not the biases. biases usually have only
            # one dimension and the weights have more than one dimension.
            if params.dim() > 1:
                nn.init.xavier_uniform_(params)

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor, tgt_mask: Tensor) -> Tensor:
        """The forward pass of the Transformer model. The source sentences are passed through the Encoder and the target
           sentences are passed through the Decoder. The output of the Decoder is passed through the token predictor to
           get the probability distribution over the target vocabulary.

        Args:
            src (Tensor): Source sentences (English) containing the token ids corresponding to the indices in the src vocabulary. 
                          Example input looks like [[0, 4, 55, 67, 1, 2, 2], [0, 42, 585, 967, 19, 26, 1]]
                          SHAPE: [batch_size, seq_len]
            tgt (Tensor): Target sentences (Telugu) containing the token ids corresponding to the indices in the tgt vocabulary. 
                          Example input looks like [[0, 3, 5, 677, 81, 1, 2], [0, 7, 67, 190, 3245, 1]]
                          SHAPE: [batch_size, seq_len - 1]
            src_mask (Tensor): Mask to be applied to the source sentences in each of the attention heads.
                               src_mask: [batch_size, 1, seq_len, seq_len]
            tgt_mask (Tensor): Mask to be applied to the target sentences in each of the attention heads.
                               tgt_mask: [batch_size, 1, seq_len - 1, seq_len - 1]

        Returns:
            Tensor: Log probability distribution over the tokens in the target vocabulary (Telugu vocabulary).
                    SHAPE: [batch_size, seq_len - 1, tgt_vocab_size]
        """
        # Pass the source sentences through the encoder to get the encoded source token vectors.
        encoded_src = self.encode(src=src, src_mask=src_mask)
        # Pass the target sentence through the decoder to get the encoded target token vectors.
        decoded_tgt = self.decode(tgt=tgt, tgt_mask=tgt_mask, encoded_src=encoded_src, src_mask=src_mask)
        return self.generate_tgt_token_prob_distributions(decoded_tgt=decoded_tgt)

    def encode(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """Encodes the source sentences (English).

        Args:
            src (Tensor): A batch of source sentences containing the token ids corresponding to the indices in the src vocabulary.
                          SHAPE: [batch_size, seq_len]
            src_mask (Tensor): Mask to be applied to the source sentences in each of the attention heads. Same mask will be 
                               applied to the sentence in all the attention heads.
                               SHAPE: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Encoded source sentences. Each token in the source sentence is represented by a vector that encodes
                    all the information about the token and its relationship with other tokens in the sentence.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # Get the embeddings for the source sentences.
        src_embeddings = self.src_embedding(src)
        # Add the positional encodings to the embeddings.
        src_embeddings = self.positional_encoding(src_embeddings)
        # Pass the source sentence through the encoder.
        encoded_src = self.encoder(input=src_embeddings, mask=src_mask)
        return encoded_src

    def decode(self, tgt: Tensor, tgt_mask: Tensor, encoded_src: Tensor, src_mask: Tensor) -> Tensor:
        """Encodes the target sentences (Telugu).

        Args:
            tgt (Tensor): A batch of target sentences containing the token ids corresponding to the indices in the tgt vocabulary.
                          SHAPE: [batch_size, seq_len - 1]
            tgt_mask (Tensor): Mask to be applied to the target sentences in each of the attention heads. Same mask will be 
                               applied to the sentence in all the attention heads.
                               SHAPE: [batch_size, 1, seq_len - 1, seq_len - 1]
            encoded_src (Tensor): The encoded token representations of the source sentences. This is used to calculate the
                                  source attention scores for the target sentence.
                                  SHAPE: [batch_size, seq_len, d_model]
            src_mask (Tensor): Mask to be applied to the source sentences in each of the attention heads. Same mask will be 
                               applied to the sentence in all the attention heads.
                               SHAPE: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Encoded (or Decoded if that makes more sense to you) target sentences. Each token in the target 
                    sentence is represented by a vector that encodes all the information about the token and its 
                    relationship with other tokens in the target sentence and the corresponding source sentence.
        """
        # Get the embeddings for the target sentences.
        tgt_embeddings = self.tgt_embedding(tgt)
        # Add the positional encodings to the embeddings.
        tgt_embeddings = self.positional_encoding(tgt_embeddings)
        # Pass the target sentence through the decoder.
        decoded_tgt = self.decoder(input=tgt_embeddings, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)
        return decoded_tgt

    def generate_tgt_token_prob_distributions(self, decoded_tgt: Tensor) -> Tensor:
        # Convert the output of the decoder to the probability distribution over the target vocabulary. This will be
        # used to calculate the loss in the training phase.
        return self.token_predictor(decoded_tgt)

In [9]:
# THIS CELL CONTAINS SOME HELPER FUNCTIONS TO GENERATE RANDOM DATA FOR TESTING THE MODEL.

# Generates the (src, tgt) data and prints them for visibility.
def PrintTensor(input: Tensor, name: str):
    print("name: ", name)
    print("shape: ", input.shape)
    print(input)
    print("-" * 150)

# The true data will have a specific format that is expected by the model. That part is not considered
# in this random generation. The random generation is just to test the model and not to train it.
def generate_batch_of_input_data(batch_size: int, seq_len: int, vocab_size: int) -> Tensor:
    return torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

# The true data will have masks that are corelated with the data. That part is not considered in this
# random generation. The random generation of masks is just to test the model and not to train it.
def construct_random_mask(batch_size: int, seq_len: int) -> Tensor:
    # If some index is set to False, then it will be masked out.
    mask = torch.randn(size=(batch_size, 1, seq_len, seq_len)) > 0.5
    return mask.bool()

src = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, vocab_size=src_vocab_size)
PrintTensor(input=src, name="src")
src_mask = construct_random_mask(batch_size=batch_size, seq_len=seq_len)
PrintTensor(input=src_mask, name="src_mask")
tgt = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, vocab_size=tgt_vocab_size)
PrintTensor(input=tgt, name="tgt")
tgt_mask = construct_random_mask(batch_size=batch_size, seq_len=seq_len)
PrintTensor(input=tgt_mask, name="tgt_mask")

name:  src
shape:  torch.Size([2, 4])
tensor([[5, 1, 4, 0],
        [0, 0, 0, 0]])
----------------------------------------------------------------------------------------
name:  src_mask
shape:  torch.Size([2, 1, 4, 4])
tensor([[[[False, False, False, False],
          [False, False, False,  True],
          [False,  True,  True, False],
          [False,  True,  True,  True]]],


        [[[False, False, False, False],
          [False,  True,  True,  True],
          [ True, False, False,  True],
          [False, False, False, False]]]])
----------------------------------------------------------------------------------------
name:  tgt
shape:  torch.Size([2, 4])
tensor([[3, 5, 0, 1],
        [0, 5, 4, 0]])
----------------------------------------------------------------------------------------
name:  tgt_mask
shape:  torch.Size([2, 1, 4, 4])
tensor([[[[ True, False, False, False],
          [False, False, False,  True],
          [ True, False, False, False],
          [ True,  Tru

In [10]:
# Create an instance of the MachineTranslationTransformer model.
machine_translation_transformer = MachineTranslationModel(d_model=d_model, 
                                                          d_feed_forward=d_feed_forward,
                                                          dropout_prob=dropout_prob, 
                                                          num_heads=num_heads, 
                                                          src_vocab_size=src_vocab_size, 
                                                          tgt_vocab_size=tgt_vocab_size, 
                                                          num_layers=num_layers,
                                                          max_seq_len=max_seq_len)
print(machine_translation_transformer)

MachineTranslationTransformer(
  (src_embedding): Embeddings(
    (look_up_table): Embedding(6, 8)
  )
  (tgt_embedding): Embeddings(
    (look_up_table): Embedding(6, 8)
  )
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attention): MultiHeadedAttention(
          (dropout_layer): Dropout(p=0.1, inplace=False)
          (linear_layers): ModuleList(
            (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
          )
        )
        (feed_forward): FeedForwardNN(
          (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
          (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
          (dropout_layer): Dropout(p=0.1, inplace=False)
        )
        (sublayer_wrappers): ModuleList(
          (0-1): 2 x SubLayerWrapper(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [11]:
# Pass the input data through the model to get the output.
decoded_tgt = machine_translation_transformer(src=src, tgt=tgt, src_mask=src_mask, tgt_mask=tgt_mask)
PrintTensor(input=decoded_tgt, name="decoded_tgt")

name:  decoded_tgt
shape:  torch.Size([2, 4, 6])
tensor([[[-1.9138, -0.8584, -2.6599, -2.8548, -1.2727, -3.8601],
         [-1.4200, -1.2505, -4.1406, -4.0404, -0.8449, -4.7314],
         [-1.0043, -1.0392, -2.4634, -3.2804, -2.0178, -3.7191],
         [-1.1923, -0.9592, -2.4992, -3.4596, -1.8626, -3.1144]],

        [[-2.7122, -0.6663, -1.8501, -2.1413, -2.4697, -2.8027],
         [-3.3400, -1.3180, -3.3497, -2.2045, -0.6202, -4.2917],
         [-3.4366, -1.2331, -3.0217, -1.9720, -0.7687, -3.6909],
         [-2.7717, -1.1688, -3.5477, -2.7665, -0.6677, -3.8104]]],
       grad_fn=<LogSoftmaxBackward0>)
----------------------------------------------------------------------------------------


## Lets verify if the number of parameters in the model is the same as what we expect.

In [12]:
# Finding out the number of parameters in the build model.
total_params = sum(params.numel() for params in machine_translation_transformer.parameters())
print("total_params: ", total_params)
total_params_with_grad = sum(params.numel() for params in machine_translation_transformer.parameters() if params.requires_grad)
print("total_params_with_grad: ", total_params_with_grad)
total_params_without_grad = sum(params.numel() for params in machine_translation_transformer.parameters() if not params.requires_grad)
print("total_params_without_grad: ", total_params_without_grad)
assert total_params == total_params_with_grad + total_params_without_grad

total_params:  9206
total_params_with_grad:  9206
total_params_without_grad:  0


In [13]:
# Prints out all the layers and the number of parameters in each layer.
for name, params in machine_translation_transformer.named_parameters():
    print(name, " ", params.numel())

src_embedding.look_up_table.weight   48
tgt_embedding.look_up_table.weight   48
encoder.encoder_layers.0.self_attention.linear_layers.0.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.0.bias   8
encoder.encoder_layers.0.self_attention.linear_layers.1.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.1.bias   8
encoder.encoder_layers.0.self_attention.linear_layers.2.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.2.bias   8
encoder.encoder_layers.0.self_attention.linear_layers.3.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.3.bias   8
encoder.encoder_layers.0.feed_forward.linear_layer_1.weight   128
encoder.encoder_layers.0.feed_forward.linear_layer_1.bias   16
encoder.encoder_layers.0.feed_forward.linear_layer_2.weight   128
encoder.encoder_layers.0.feed_forward.linear_layer_2.bias   8
encoder.encoder_layers.0.sublayer_wrappers.0.layer_norm.weight   8
encoder.encoder_layers.0.sublayer_wrappers.0.layer_norm.bias   8

### Counting the number of trainable parameters in the transformer model manually

In [14]:
# Lets try to count the number of parameters in the model manually by going through each component and counting 
# parameters. 
# The number of parameters associated with the Embeddings class for src sentences. We have 1 embedding vector 
# per token in the source vocabulary. We have 6 tokens and each token is represented by an 8-dimensional vector. 
# So, the total number of parameters associated with the Embeddings class for src sentences is 6 * 8 = 48.
num_src_embedding_params = src_vocab_size * d_model
# The number of parameters associated with the Embeddings class for tgt sentences. We have 1 embedding vector
# per token in the target vocabulary. We have 6 tokens and each token is represented by an 8-dimensional vector.
# So, the total number of parameters associated with the Embeddings class for tgt sentences is 6 * 8 = 48.
num_tgt_embedding_params = tgt_vocab_size * d_model
# There are no parameters associated with the PositionalEncoding class. These are calculated based on a predefined
# formula and are not learned during the training process.
num_positional_encoding_params = 0
# Now, lets calculate the number of parameters associated with the Encoder class. The Encoder class has 6 
# identical EncoderLayers stacked on top of each other. Lets calculate the number of parameters associated with 
# each EncoderLayer class. The EncoderLayer class has MultiHeadedAttention and FeedForwardNN classes as its child 
# classes. Each MultiHeadedAttention class has 4 linear layers (query, key, value and output). Note that a single 
# linear layer is used to calculate the queries, keys, values and outputs for all the heads. So, we don't need to 
# do this calculation for each head separately. Lets take the linear layer associated with the query calculation. 
# The input to this linear layer is a 8-dimensional vector (d_model) and the output is also an 8-dimensional vector 
# (d_model=8). So, the number of parameters in this linear layer associated with the weight matrix is 8 * 8 = 64. 
# We also have bias terms (d_model=8) associated with this linear layer. So, the total number of parameters 
# associated with the query linear layer is 64 + 8 = 72.
num_encoder_query_params = d_model * d_model + d_model
num_encoder_key_params = d_model * d_model + d_model
num_encoder_value_params = d_model * d_model + d_model
num_encoder_attention_output_params = d_model * d_model + d_model
# Now lets calculate the number of parameters associated with FeedForward neural network class in the EncoderLayer. 
# The first linear layer in the feed forward expands the input to a higher dimension (d_model to d_feed_forward). 
# The input to this linear layer is a 8-dimensional vector (d_model) and the output is a 16-dimensional vector 
# (d_feed_forward). So, the number of parameters in this linear layer associated with the weight matrix is 
# 8 * 16 = 128. We also have bias terms (d_feed_forward=16) associated with this linear layer. So, the total number 
# of parameters associated with the first linear layer in the feed forward neural network is 128 + 16 = 144. The 
# second linear layer in the feed forward neural network compresses the input back to its original dimension 
# (d_feed_forward to d_model). The input to this linear layer is a 16-dimensional vector (d_feed_forward) and the 
# output is an 8-dimensional vector (d_model). So, the number of parameters in this linear layer associated with 
# the weight matrix is 16 * 8 = 128. We also have bias terms (d_model=8) associated with this linear layer. So, 
# the total number of parameters associated with the second linear layer in the feed forward neural network is 
# 128 + 8 = 136.
num_encoder_feed_forward_linear_layer_1_params = d_model * d_feed_forward + d_feed_forward
num_encoder_feed_forward_linear_layer_2_params = d_feed_forward * d_model + d_model
# The output of MultiHeadedAttention and FeedForward neural network is normalized using Layer Normalization. Layer
# Normalization is applied along the last dimension of the input tensor (input to Layer Normalization). Each of the
# features is scaled independently with the learned paramaters. So, the number of parameters is the number of 
# features in the last dimension multiplied by 2 (1 parameter for gamma and 1 parameter for beta per feature). Both 
# the output of MultiHeadedAttention and FeedForward neural network have the same size in the last dimension which 
# is 8 (d_model). So, the number of parameters associated with Layer Normalization layer that is applied after 
# MultiHeadedAttention is 8 (gamma) + 8 (beta) = 16. Similarly, the number of parameters associated with Layer
# Normalization layer that is applied after FeedForward neural network is 8 (gamma) + 8 (beta) = 16.
num_encoder_attention_layer_norm_params = d_model + d_model
num_encoder_feed_forward_layer_norm_params = d_model + d_model
# The total number of parameters associated with a single EncoderLayer is sum of the above 8 variables = 600.
num_encoder_layer_params = num_encoder_query_params + num_encoder_key_params + num_encoder_value_params + num_encoder_attention_output_params + num_encoder_feed_forward_linear_layer_1_params + num_encoder_feed_forward_linear_layer_2_params + num_encoder_attention_layer_norm_params + num_encoder_feed_forward_layer_norm_params
# We also apply Layer Normalization to the output of the last EncoderLayer and pass this as the output of the Encoder.
# This layer again has same number of parameters associated with it as other Layer Normalization layers i.e.,
# 8 (gamma) + 8 (beta) = 16
num_encoder_layer_layer_norm_params = d_model + d_model
# The transformer model has 6 EncoderLayers stacked on top of each other. So, the total number of parameters 
# associated with Encoder is (6 * 600) + 16 = 3616
num_total_encoder_params = (num_layers * num_encoder_layer_params) + num_encoder_layer_layer_norm_params
# Now, lets calculate the number of parameters associated with the DecoderLayer and Decoder.
# The method to calculate number of parameters in the DecoderLayer is very similar to how it was done for the 
# EncoderLayer. DecoderLayer just contains 1 additional MultiHeadedAttention Layer (for source attention) and 1
# additional Layer Normalization layer associated with this source attention layer.
# Same as in EncoderLayer ==> Linear Layer parameters ==> 64 (weights) + 8 (bias) = 72
num_decoder_self_attention_query_params = d_model * d_model + d_model
num_decoder_self_attention_key_params = d_model * d_model + d_model
num_decoder_self_attention_value_params = d_model * d_model + d_model
num_decoder_self_attention_output_params = d_model * d_model + d_model
# The next 4 variables correspond to the 1 additional MultiHeadedAttention layer (src attention) present in the 
# DecoderLayer.
# Same as in EncoderLayer ==> Linear Layer parameters ==> 64 (weights) + 8 (bias) = 72
num_decoder_src_attention_query_params = d_model * d_model + d_model
num_decoder_src_attention_key_params = d_model * d_model + d_model
num_decoder_src_attention_value_params = d_model * d_model + d_model
num_decoder_src_attention_output_params = d_model * d_model + d_model
# The FeedForward neural network is exactly the same as in EncoderLayer.
# Same as in EncoderLayer ==> 8 * 16 + 16 = 144
num_decoder_feed_forward_linear_layer_1_params = d_model * d_feed_forward + d_feed_forward
# Same as in EncoderLayer ==> 8 * 16 + 8 = 136
num_decoder_feed_forward_linear_layer_2_params = d_model * d_feed_forward + d_model
# We have 1 additional LayerNormalization layer associated with the source attention. However, its architecture and
# the parameters are the same as in EncoderLayer.
# Same as in EncoderLayer ==> 8 (gamma) + 8 (beta) = 16
num_decoder_self_attention_layer_norm_params = d_model + d_model
num_decoder_src_attention_layer_norm_params = d_model + d_model
num_decoder_feed_forward_layer_norm_params = d_model + d_model
# The total number of parameters associated with a single DecoderLayer is the sum of the above 13 variables = 
num_decoder_layer_params = num_decoder_self_attention_query_params + num_decoder_self_attention_key_params + num_decoder_self_attention_value_params + num_decoder_self_attention_output_params + num_decoder_src_attention_query_params + num_decoder_src_attention_key_params + num_decoder_src_attention_value_params + num_decoder_src_attention_output_params + num_decoder_feed_forward_linear_layer_1_params + num_decoder_feed_forward_linear_layer_2_params + num_decoder_self_attention_layer_norm_params + num_decoder_src_attention_layer_norm_params + num_decoder_feed_forward_layer_norm_params
# We also apply Layer Normalization to the output of the last DecoderLayer and pass this as the output of the Decoder. 
# This layer again has same number of parameters associated with it as other Layer Normalization layers i.e.,
# 8 (gamma) + 8 (beta) = 16
num_decoder_layer_layer_norm_params = d_model + d_model
# The transformer model has 6 EncoderLayers stacked on top of each other. So, the total number of parameters 
# associated with Encoder is (6 * 904) + 16 = 5440
num_total_decoder_params = (num_layers * num_decoder_layer_params) + num_decoder_layer_layer_norm_params
# The output of the decoder is passed to a linear layer that projects the output to the target vocabulary space.
# These parameters are associated with the TokenPredictor layer in the transformer above. The input to the linear
# layer are 8-dimensional vectors (d_model) and output of the linear layers are 6-dimensional vectors (tgt_vocab_size).
# So, the number of parameters associated with the TokenPredictor layer is 8 * 6 (weights) + 6 (bias) = 54
num_vocab_projection_params = d_model * tgt_vocab_size + tgt_vocab_size
# Finally, the total number of parameters in the model is the number of parameters associated with the Embeddings plus
# the number of parameters in the Encoder plus the number of parameters in the Decoder plus the number of parameters
# in the TokenPredictor.
num_total_model_params = num_src_embedding_params + num_tgt_embedding_params + num_total_encoder_params + num_total_decoder_params + num_vocab_projection_params
print("Total Number of parameters associated with the model: ", num_total_model_params)


Total Number of parameters associated with the model:  9206


In [15]:
# So, the number of parameters in the model we built is the same as the number of parameters expected within the model.
# Usually, counting the number of parameters could make several bugs in the code visible and a good check to implement
# for any machine learning model.
assert total_params_with_grad == num_total_model_params