In [1]:
# In this notebook, you learn:
#
# 1) How all the building blocks of the transformer fit together to build a machine translation model.
# 2) Verify that the number of parameters within the built model is the same as the number of parameters
#    expected within our model architecture i.e.,
#           --> assert parameter_count (built_model) == parameter_count (count_manually). 
#
# ------------------------------------------------------------------------------------------------------------
# A LOT OF THE CODE IN THIS NOTEBOOK IS COPIED FROM THE PREVIOUS NOTEBOOKS. I WILL NOT BE EXPLAINING 
# THOSE PARTS IN THIS NOTEBOOK AGAIN.
# ------------------------------------------------------------------------------------------------------------

In [2]:
# Resources to help understand this notebook:
#
# 1) https://nlp.seas.harvard.edu/annotated-transformer/
#       --  The very best resource to understand the transformer implementation.
# 2) http://jalammar.github.io/illustrated-transformer/
#       -- Another great resource to understand the architecture of the transformer model.
# 3) https://www.youtube.com/watch?v=8krd5qKVw-Q
#       -- Gives an intuitive explanation of Xavier Initialization.
# 4) https://www.deeplearning.ai/ai-notes/initialization/index.html
#       -- Gives an in-depth mathematical explanation of Xavier Initialization.

In [3]:
import copy
import math
import torch

from torch import nn, Tensor
from typing import Callable, Optional

<img src="../../Data/Images/TranslationTransformerModel.png" alt="Translation Transformer Model" width="600" height="500">

credits: The above image is taken from this blog post: [https://jalammar.github.io/illustrated-transformer/](https://jalammar.github.io/illustrated-transformer/)

In [4]:
# The transformer model is a stack of N encoder-decoder layers.
# In our implementation, each of the ENCODER boxes shown in the above image is referred to as an 'EncoderLayer' and 
# each of the DECODER boxes is referred to as a 'DecoderLayer'. The stack of 6 ENCODER boxes (EncoderLayers) is 
# referred to as an Encoder and the stack of 6 DECODER boxes (DecoderLayers) is referred to as a Decoder. The entire 
# model is referred to as a Transformer.
#
# The bottom most EncoderLayer receives the tokenized src sentence embeddings and the bottom most DecoderLayer 
# receives the tokenized tgt sentence embeddings. The sentences are tokenized in the Data Preparation phase which is 
# not part of the Transformer model below. The tokens are converted into embeddings, aggregated with the positional 
# encodings and passed to the bottom most EncoderLayer and bottom most DecoderLayer as inputs.
#
# The output of the ENCODER (the last EncoderLayer) is passed to each of the DecoderLayers for src attention 
# calculation. The output of the DECODER (the last DecoderLayer) is passed to a linear layer which projects the 
# Decoder output to the vocab space (target vocab space). This output is then passed to a log-softmax layer to get 
# the token output log-probabilities, which is the output of the Transformer model as a whole.

<img src="../../Data/Images/EncoderDecoder.png" alt="EncoderDecoder" width="550" height="450">

credits: The above image is taken from this blog post: [https://jalammar.github.io/illustrated-transformer/](https://jalammar.github.io/illustrated-transformer/)

In [5]:
# Constants to be used in the model. The values are deliberately tiny so that the model built in this
# notebook stays small and easy to inspect.
#
# Size of the embedding vectors in the model. This is 512 in the transformer paper.
# NOTE: MultiHeadedAttention computes the per-head size as d_model // num_heads, so with the values 
# used here (8 and 8) every attention head works on vectors of size 1.
d_model = 8
# Number of layers in each of the Encoder and the Decoder. The transformer paper also uses 6. The 
# number of layers could be different in Encoder and Decoder but we use the same value for both.
num_layers = 6
# Probability with which to drop data in the transformer model. We will use the same dropout_prob
# throughout the model.
# NOTE: This value is also used as the default 'dropout_prob' argument of MultiHeadedAttention and 
# FeedForwardNN. Python captures default argument values when the class is defined, so the cell that 
# defines those classes has to be re-run for a change made here to reach those defaults.
dropout_prob = 0.1
# Number of attention heads in each of the multi-head attention layers in the model. The transformer
# paper uses 8. d_model must be divisible by num_heads.
num_heads = 8
# Number of neurons in the hidden layer (that expands the input) in the feed forward neural network.
# The transformer paper uses 2048.
d_feed_forward = 16
# Number of sentences in each batch of data.
batch_size = 2
# Number of tokens in the src vocab. This is the number of unique words in the src language (English).
src_vocab_size = 6
# Number of tokens in the tgt vocab. This is the number of unique words in the tgt language (Telugu).
tgt_vocab_size = 6
# Number of tokens in each of the sentences in the batch. We will pad the sentences to make them all
# of the same length. This is the length of the longest sentence in the batch.
seq_len = 4
# Maximum number of tokens in the sentences among all the batches in the entire dataset.
# NOTE: This value is also used as the default 'max_len' argument of PositionalEncoding (captured when 
# the class is defined).
max_seq_len = 10

In [6]:
# EVERYTHING IN THIS CELL HAS BEEN EXPLAINED IN DETAIL IN THE PREVIOUS NOTEBOOKS. PLEASE REFER TO THE EARLIER
# NOTEBOOKS TO UNDERSTAND THE CODE IN THIS CELL. YOU CAN SKIP (JUST RUN IT BLINDLY) THIS CELL AND MOVE TO THE 
# NEXT CELL DIRECTLY.
# 
# -------------------------------------------------------------------------------------------------------------------
# JUST RUN THIS CELL BLINDLY | JUST RUN THIS CELL BLINDLY | JUST RUN THIS CELL BLINDLY | JUST RUN THIS CELL BLINDLY 
# -------------------------------------------------------------------------------------------------------------------

# Refer to 'step_6_token_embeddings.ipynb' notebook to learn more about the Embeddings class.
class Embeddings(nn.Module):
    """Look-up table that maps token indices to (scaled) embedding vectors."""

    def __init__(self, vocab_size: int, embedding_dim: int):
        """Builds the trainable token look-up table.

        Args:
            vocab_size (int): Number of distinct tokens in the vocabulary, i.e., number of rows in the table.
            embedding_dim (int): Length of the vector stored for each token.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # One trainable row of size 'embedding_dim' per token in the vocabulary.
        self.look_up_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # Example input (a batch of two sentences, each a list of token indices into the vocab):
    # [[0, 123, 3455, 4556, 7, 1, 2, 2], [0, 56, 98, 6234, 909, 56, 1, 2]]
    # 0 - <sos>, 1 - <eos>, 2 - <pad>
    def forward(self, input: Tensor) -> Tensor:
        """Looks up the embedding of every token index and scales it by sqrt(embedding_dim).

        Args:
            input (Tensor): Token indices.
                            shape: [batch_size, seq_len]

        Returns:
            Tensor: Scaled embedding vectors for the given token indices.
                    shape: [batch_size, seq_len, embedding_dim]
        """
        # The 'attention_is_all_you_need' paper multiplies the embeddings by sqrt(d_model) without 
        # giving a justification for it; we simply follow the paper here.
        scale = math.sqrt(self.embedding_dim)
        token_vectors = self.look_up_table(input)
        return token_vectors * scale


# Refer to 'step_8_positional_encoding.ipynb' notebook to learn more about the PositionalEncoding class.
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to token embeddings and applies dropout."""

    def __init__(self, encoding_size: int, dropout_prob: float, max_len: int=max_seq_len):
        """Pre-computes the positional encodings for the first 'max_len' positions.

        Args:
            encoding_size (int): Size of the positional encoding vector that represents the position of the 
                                 token. Both even and odd sizes are supported.
            dropout_prob (float): Probability of an element to be zeroed or dropped.
            max_len (int): Largest position for which the positional encoding vector is generated. Defaults to 
                           max_seq_len. The sequences passed to 'forward' must not be longer than this.
        """
        super().__init__()
        # Refer to step_7_drop_out.ipynb notebook to understand more about dropout.
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)
        # Table with one row per position and one column per encoding dimension.
        positional_encoding = torch.zeros(size=(max_len, encoding_size), dtype=torch.float)
        # Column vector of positions: [max_len, 1].
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / encoding_size) computed in log space for every even index 2i.
        # This has ceil(encoding_size / 2) entries.
        even_indices = torch.arange(0, encoding_size, 2, dtype=torch.float)
        inverse_frequencies = torch.exp(even_indices * (-math.log(10000.0) / encoding_size))
        # Broadcasts to [max_len, ceil(encoding_size / 2)].
        angles = positions * inverse_frequencies
        # Even columns hold the sine terms.
        positional_encoding[:, 0::2] = torch.sin(angles)
        # Odd columns hold the cosine terms. There are only floor(encoding_size / 2) odd columns, which is 
        # one less than the number of even columns when encoding_size is odd. Without truncating 'angles' 
        # the assignment below fails with a shape mismatch for odd encoding sizes. For even encoding sizes
        # the slice selects every column, so the result is unchanged.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : encoding_size // 2])
        # In transformer model, we receive 3D tensors as input to this module. Each 1D tensor
        # in the last dimension is an embedding for the token. Each 2D tensor is a sentence.
        # The entire 3D tensor is a batch of sentences. To work with 3D tensors in the forward
        # method, we convert the positional encoding to a 3D tensor: [1, max_len, encoding_size].
        positional_encoding = positional_encoding.unsqueeze(0)
        # Registering the tensor as a buffer keeps it as part of the module state without making it a 
        # trainable parameter, so it is not updated during the training.
        self.register_buffer('positional_encoding', positional_encoding)
    
    def forward(self, input: Tensor) -> Tensor:
        """Adds the positional encodings to the input tensor and applies dropout.

        Args:
            input (Tensor): The input tensor containing the embeddings of the tokens.
                            shape: [batch_size, sequence_length, d_model]

        Returns:
            Tensor: Input with the positional encodings added to it.
                    shape: [batch_size, sequence_length, d_model]
        """
        # positional_encoding: (1, max_len, encoding_size) --> (1, sequence_length, encoding_size) 
        #       -- Keeps only the encodings for the positions present in the input.
        # The addition broadcasts the encodings over the batch dimension.
        # requires_grad_(False) is not needed since the positional encoding is already registered
        # as a Buffer and not a trainable parameter. It is just included for clarity.
        input = input + self.positional_encoding[:, :input.size(1)].requires_grad_(False)
        return self.dropout(input)
    

# Creates a copy (deepcopy) of the module and returns ModuleList containing the copies.
def clone_module(module: nn.Module, num_clones: int) -> nn.ModuleList:
    """Returns a ModuleList holding 'num_clones' independent deep copies of 'module'.

    Every copy starts with the same parameter values as 'module' but owns its own tensors, so the 
    copies do not share parameters with each other or with the original module.
    """
    clones = nn.ModuleList()
    for _ in range(num_clones):
        clones.append(copy.deepcopy(module))
    return clones


# Refer to 'step_9_multi_headed_attention.ipynb' notebook to understand how this function works.
def construct_attention_heads(queries: Tensor, 
                              keys: Tensor, 
                              values: Tensor, 
                              mask: Optional[Tensor]=None, 
                              dropout_layer: Optional[nn.Module]=None) -> Tensor:
    """Scaled dot-product attention.

    Computes the scaled dot product between every query and every key, optionally suppresses the 
    masked positions, turns the scores into weights with softmax, optionally applies dropout to the 
    weights and finally returns the weighted sum of the values.

    Args:
        queries (Tensor): [batch_size, num_heads, seq_len, d_k]
        keys (Tensor): [batch_size, num_heads, seq_len, d_k]
        values (Tensor): [batch_size, num_heads, seq_len, d_k]
        mask (Optional[Tensor]): Positions where the mask is False (zero) are suppressed.
                                 [batch_size, 1, 1, src_seq_len] if the mask is for the source sequences.
                                 [batch_size, 1, tgt_seq_len, tgt_seq_len] if the mask is for the target sequences.
                                 Defaults to None (nothing is masked).
        dropout_layer (Optional[nn.Module], optional): Module applied to the attention weights after the 
                                                       softmax. Defaults to None (no dropout).

    Returns:
        Tensor: The attention heads.
                SHAPE: [batch_size, num_heads, seq_len, d_k]
    """
    # The scores are divided by sqrt(d_k), where d_k is the size of the last dimension of the queries.
    head_dim = queries.shape[-1]
    scaling_factor = math.sqrt(head_dim)
    # Swap the last two dimensions of the keys so that query @ key^T yields one score per (query, key) pair.
    transposed_keys = keys.transpose(dim0=2, dim1=3)
    scores = (queries @ transposed_keys) / scaling_factor
    if mask is not None:
        # Masking serves two purposes:
        # 1) Stop attending to the padding tokens --> applies to both src and tgt sequences.
        # 2) Stop attending to the future tokens --> applies only to tgt sequences.
        # A large negative number is used instead of float('-inf') because '-inf' sometimes (not in 
        # every case) causes softmax to return nan.
        blocked_positions = torch.logical_not(mask)
        scores = scores.masked_fill(blocked_positions, float('-1e9'))
    # Normalize the scores over the keys so that the weights of every query sum up to 1.
    weights = torch.softmax(scores, dim=-1)
    if dropout_layer is not None:
        # Regularization to reduce overfitting.
        weights = dropout_layer(weights)
    # Each output vector is the weighted average of the value vectors.
    return weights @ values


# Refer to 'step_9_multi_headed_attention.ipynb' notebook to understand how this function works.
class MultiHeadedAttention(nn.Module):
    """Multi-headed attention: projects the inputs, attends per head and merges the heads back."""

    def __init__(self, num_heads: int, d_model: int, dropout_prob: float=dropout_prob):
        """Creates the projection layers and the dropout layer used by the attention.

        Args:
            num_heads (int): Number of attention heads. Must divide d_model.
            d_model (int): Size of the token vectors entering and leaving this layer.
            dropout_prob (float): Dropout probability applied to the attention weights. Defaults to 
                                  the module-level 'dropout_prob'.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads."
        self.num_heads = num_heads
        self.d_model = d_model
        # Size of the slice of the token vector that every head works on.
        self.d_k = d_model // num_heads
        # We use dropout to prevent overfitting.
        self.dropout_layer = nn.Dropout(p=dropout_prob, inplace=False)
        # Four linear layers of the same shape: the first three create the queries, keys and values and 
        # the last one creates the output of the layer from the concatenated attention heads. They are 
        # deep copies of one template layer.
        projection_template = nn.Linear(in_features=d_model, out_features=d_model)
        self.linear_layers = clone_module(module=projection_template, num_clones=4)


    def forward(self, query_input: Tensor, key_input: Tensor, value_input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Multi-Headed Attention layer.

        Args:
            query_input (Tensor): Input to be used for query creation.
                                  SHAPE: [batch_size, seq_len, d_model]
            key_input (Tensor): Input to be used for key creation.
                                SHAPE: [batch_size, seq_len, d_model]
            value_input (Tensor): Input to be used for value creation.
                                  SHAPE: [batch_size, seq_len, d_model]
            mask (Tensor): Mask to be applied to the attention scores. Default is None. The same mask is 
                           applied to all the heads.
                           mask: [batch_size, 1, 1, src_seq_len] if the mask is for the source sequences.
                           mask: [batch_size, 1, tgt_seq_len, tgt_seq_len] if the mask is for the target sequences.
                           Note that src_seq_len and tgt_seq_len are the number of tokens in the source and 
                           target sequences respectively and they are likely different.

        Returns:
            Tensor: Output of the Multi-Headed Attention layer. One output vector for each token of 
                    'query_input', for each sequence in the batch.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        batch_size = query_input.shape[0]
        query_projection, key_projection, value_projection, output_projection = self.linear_layers

        def split_into_heads(projected: Tensor) -> Tensor:
            # [batch_size, seq_len, d_model] --> [batch_size, seq_len, num_heads, d_k] 
            #                                --> [batch_size, num_heads, seq_len, d_k]
            # The sequence dimension is inferred with '-1' instead of being taken from 'query_input' 
            # because in source attention the keys and values come from the Encoder output (src 
            # sequences) while the queries come from the Decoder (tgt sequences), and the two sequence 
            # lengths are likely different.
            per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
            return per_head.transpose(dim0=1, dim1=2)

        # Project the inputs (each projection is [batch_size, seq_len, d_model]) and give every head 
        # its own slice of the projected vectors.
        queries = split_into_heads(query_projection(query_input))
        keys = split_into_heads(key_projection(key_input))
        values = split_into_heads(value_projection(value_input))
        # attention_heads: [batch_size, num_heads, seq_len, d_k]
        attention_heads = construct_attention_heads(queries=queries, 
                                                    keys=keys, 
                                                    values=values, 
                                                    mask=mask, 
                                                    dropout_layer=self.dropout_layer)
        # Put the heads of every token next to each other again.
        # merged_heads: [batch_size, seq_len, d_model]
        merged_heads = attention_heads.transpose(dim0=1, dim1=2).reshape(batch_size, -1, self.d_model)
        # The last linear layer mixes the information of the different heads.
        return output_projection(merged_heads)
    

# Refer to 'step_10_feed_forward_neural_network.ipynb' notebook to understand how this class works.
class FeedForwardNN(nn.Module):
    """Two-layer feed forward network: expand, ReLU, dropout, compress."""

    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float = dropout_prob):
        """Creates the two linear layers and the dropout layer.

        Args:
            d_model (int): Size of the input and output vectors.
            d_feed_forward (int): Size of the hidden (expanded) representation.
            dropout_prob (float): Dropout probability applied to the hidden representation. Defaults 
                                  to the module-level 'dropout_prob'.
        """
        super().__init__()
        # d_model --> d_feed_forward
        self.linear_layer_1 = nn.Linear(d_model, d_feed_forward)
        # d_feed_forward --> d_model
        self.linear_layer_2 = nn.Linear(d_feed_forward, d_model)
        self.dropout_layer = nn.Dropout(p=dropout_prob)

    def forward(self, input: Tensor) -> Tensor:
        """Runs the input through the feed forward network.

        Args:
            input (Tensor): The output of the Multi-Headed Attention layer.
                            shape: [batch_size, seq_len, d_model]

        Returns:
            Tensor: The output of the Feed Forward Neural Network.
                    shape: [batch_size, seq_len, d_model]
        """
        # Expand to the hidden size and apply the ReLU non-linearity.
        expanded = torch.relu(self.linear_layer_1(input))
        # Dropout to reduce overfitting.
        regularized = self.dropout_layer(expanded)
        # Compress back to the original size so that the output can be consumed by the next layer.
        return self.linear_layer_2(regularized)
    

# Refer to 'step_12_encoder.ipynb' to understand how this class works.
class SubLayerWrapper(nn.Module):
    """Residual connection around a sublayer, with layer normalization applied BEFORE the sublayer."""

    def __init__(self, d_model: int, dropout_prob: float):
        """Creates the dropout and layer normalization used around a sublayer.

        Args:
            d_model (int): Dimension of the vectors used in the Attention model.
            dropout_prob (float): probability with which nodes can be dropped.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        self.layer_norm = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, input: Tensor, sublayer: Callable[[Tensor], Tensor]) -> Tensor:
        """Computes input + dropout(sublayer(layer_norm(input))).

        Note the order of operations: the input is normalized first, the sublayer is applied to the 
        normalized input, dropout is applied to the sublayer output and the ORIGINAL (un-normalized) 
        input is added back at the end.

        Args:
            input (Tensor): Input to be transformed by the sublayer.
                            shape: [batch_size, seq_len, d_model]
            sublayer (Callable): A callable that takes a tensor and returns a tensor of the same shape. 
                                 It could be a function that calls an attention layer or a module such 
                                 as the feed forward network.

        Returns:
            Tensor: Input plus the (regularized) sublayer transformation.
                    shape: [batch_size, seq_len, d_model]
        """
        normalized = self.layer_norm(input)
        transformed = sublayer(normalized)
        residual_update = self.dropout(transformed)
        return input + residual_update


# Refer to 'step_12_encoder.ipynb' to understand how this class works.
class EncoderLayer(nn.Module):
    """One layer of the Encoder: self attention followed by a feed forward network."""

    def __init__(self, 
                 self_attention: MultiHeadedAttention, 
                 feed_forward: FeedForwardNN, 
                 d_model: int, 
                 dropout_prob: float):
        """Stores the sublayers and creates one SubLayerWrapper for each of them.

        Args:
            self_attention (MultiHeadedAttention): Attention module used for the self attention.
            feed_forward (FeedForwardNN): Feed forward network applied after the attention.
            d_model (int): Size of the token vectors.
            dropout_prob (float): Dropout probability used inside the sublayer wrappers.
        """
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # Assigning the modules as attributes registers them as child modules of the EncoderLayer, so 
        # their parameters become part of the parameters of the EncoderLayer.
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        # Two wrappers: index 0 wraps the self attention and index 1 wraps the feed forward network.
        wrapper_template = SubLayerWrapper(d_model=d_model, dropout_prob=dropout_prob)
        self.sublayer_wrappers = clone_module(module=wrapper_template, num_clones=2)

    def forward(self, input: Tensor, mask: Tensor) -> Tensor:
        """Forward pass of the EncoderLayer.

        Args:
            input (Tensor): Source sequences provided as input to the EncoderLayer. These are the embeddings 
                            of the source sequences for the first EncoderLayer.
                            SHAPE: [batch_size, src_seq_len, d_model]
            mask (Tensor): Boolean mask to be applied to the input during attention scores calculation.
                           SHAPE: [batch_size, 1, 1, src_seq_len]

        Returns:
            Tensor: Output of the EncoderLayer.
                    SHAPE: [batch_size, src_seq_len, d_model]
        """
        attention_wrapper, feed_forward_wrapper = self.sublayer_wrappers

        def attend_to_self(wrapped_input: Tensor) -> Tensor:
            # The wrapper decides what tensor is handed to the attention. In self attention the 
            # queries, keys and values are all created from that same tensor.
            return self.self_attention(query_input=wrapped_input, 
                                       key_input=wrapped_input, 
                                       value_input=wrapped_input, 
                                       mask=mask)

        attended = attention_wrapper(input, attend_to_self)
        return feed_forward_wrapper(attended, self.feed_forward)
    

# Refer to 'step_12_encoder.ipynb' to understand how this class works.
class Encoder(nn.Module):
    """Stack of 'num_layers' EncoderLayers followed by a final layer normalization."""

    def __init__(self, encoder_layer: EncoderLayer, num_layers: int):
        """Creates the stack of layers from deep copies of 'encoder_layer'.

        Args:
            encoder_layer (EncoderLayer): Template layer. It is deep copied 'num_layers' times.
            num_layers (int): Number of EncoderLayers in the stack.
        """
        super().__init__()
        self.encoder_layers = clone_module(module=encoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(normalized_shape=encoder_layer.d_model)

    def forward(self, input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Encoder. Every EncoderLayer consumes the output of the EncoderLayer 
           below it. The output of the last EncoderLayer is passed through a Layer Normalization layer 
           and returned as the final output of the Encoder.

        Args:
            input (Tensor): Input to the Encoder i.e., embeddings of the tokenized src sequences.
                            input: [batch_size, src_seq_len, d_model]
            mask (Optional[Tensor], optional): Boolean mask to be applied during attention scores calculation.
                                               mask: [batch_size, 1, 1, src_seq_len]. Defaults to None.

        Returns:
            Tensor: Output of the Encoder i.e., encoded src sequences.
                    output: [batch_size, src_seq_len, d_model]
        """
        hidden_states = input
        for layer in self.encoder_layers:
            # The same mask is used by every layer in the stack.
            hidden_states = layer(input=hidden_states, mask=mask)
        encoded_src = self.layer_norm(hidden_states)
        return encoded_src


# Refer to 'step_13_decoder.ipynb' to understand how this class works.
class DecoderLayer(nn.Module):
    """One layer of the Decoder: self attention, source attention and a feed forward network."""

    def __init__(self, 
                 self_attention: MultiHeadedAttention, 
                 src_attention: MultiHeadedAttention, 
                 feed_forward: FeedForwardNN, 
                 d_model: int, 
                 dropout_prob: float):
        """Stores the sublayers and creates one SubLayerWrapper for each of them.

        Args:
            self_attention (MultiHeadedAttention): Attention module used on the target sequences.
            src_attention (MultiHeadedAttention): Attention module used between the target sequences 
                                                  and the encoded source sequences.
            feed_forward (FeedForwardNN): Feed forward network applied after the two attentions.
            d_model (int): Size of the token vectors.
            dropout_prob (float): Dropout probability used inside the sublayer wrappers.
        """
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # Assigning the modules as attributes registers them as child modules of the DecoderLayer, so 
        # their parameters become part of the parameters of the DecoderLayer.
        self.self_attention = self_attention
        self.src_attention = src_attention
        self.feed_forward = feed_forward
        # Three wrappers: index 0 wraps the self attention, index 1 wraps the source attention and 
        # index 2 wraps the feed forward network.
        wrapper_template = SubLayerWrapper(d_model=d_model, dropout_prob=dropout_prob)
        self.sublayer_wrappers = clone_module(module=wrapper_template, num_clones=3)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the DecoderLayer.

        Args:
            input (Tensor): Target sequences provided as input to the DecoderLayer. These are the embeddings 
                            of the target sequences for the first DecoderLayer.
                            SHAPE: [batch_size, tgt_seq_len, d_model]
            encoded_src (Tensor): Encoded source sequences. This is the output of the Encoder. The keys and 
                                  values of the source attention are created from it.
                                  SHAPE: [batch_size, src_seq_len, d_model]
            tgt_mask (Tensor): Mask used in the self attention on the target sequences (hides the future 
                               tokens and the padding tokens).
                               SHAPE: [batch_size, 1, tgt_seq_len, tgt_seq_len]
            src_mask (Tensor, optional): Mask used in the source attention (hides the padding tokens of the 
                                         source sequences). Defaults to None.
                                         SHAPE: [batch_size, 1, 1, src_seq_len]

        Returns:
            Tensor: Output of the DecoderLayer i.e., output of the wrapped feed forward network.
                    SHAPE: [batch_size, tgt_seq_len, d_model]
        """
        self_attention_wrapper, src_attention_wrapper, feed_forward_wrapper = self.sublayer_wrappers

        def attend_to_target(wrapped_input: Tensor) -> Tensor:
            # Self attention: queries, keys and values are all created from the target side.
            return self.self_attention(query_input=wrapped_input, 
                                       key_input=wrapped_input, 
                                       value_input=wrapped_input, 
                                       mask=tgt_mask)

        def attend_to_source(wrapped_input: Tensor) -> Tensor:
            # Source attention: the queries are created from the target side whereas the keys and 
            # values are created from the encoded source sequences. Intuitively, every target token 
            # asks which of the source tokens matter for predicting its output.
            return self.src_attention(query_input=wrapped_input, 
                                      key_input=encoded_src, 
                                      value_input=encoded_src, 
                                      mask=src_mask)

        # First sublayer: self attention on the target sequences, hence the tgt_mask.
        tgt_context = self_attention_wrapper(input=input, sublayer=attend_to_target)
        # Second sublayer: attention on the source sequences, hence the src_mask.
        src_context = src_attention_wrapper(input=tgt_context, sublayer=attend_to_source)
        # Third sublayer: feed forward network.
        return feed_forward_wrapper(input=src_context, sublayer=self.feed_forward)
    

# Refer to 'step_13_decoder.ipynb' to understand how this class works.
class Decoder(nn.Module):
    """Stack of 'num_layers' DecoderLayers followed by a final layer normalization."""

    def __init__(self, decoder_layer: DecoderLayer, num_layers: int):
        """Creates the stack of layers from deep copies of 'decoder_layer'.

        Args:
            decoder_layer (DecoderLayer): Template layer. It is deep copied 'num_layers' times.
            num_layers (int): Number of DecoderLayers in the stack.
        """
        super().__init__()
        self.decoder_layers = clone_module(module=decoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(normalized_shape=decoder_layer.d_model)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Decoder. Every DecoderLayer consumes the output of the DecoderLayer 
           below it. The output of the Encoder and both masks are handed unchanged to every DecoderLayer 
           in the stack. The output of the last DecoderLayer is passed through a Layer Normalization 
           layer and returned as the final output of the Decoder.

        Args:
            input (Tensor): Input to the Decoder i.e., embeddings of the tokenized tgt sequences.
                            SHAPE: [batch_size, tgt_seq_len, d_model]
            encoded_src (Tensor): output of the encoder i.e., encoded src sequences.
                                  SHAPE: [batch_size, src_seq_len, d_model]
            tgt_mask (Tensor): Boolean mask to be applied during self attention scores calculation.
                               SHAPE: [batch_size, 1, tgt_seq_len, tgt_seq_len].
            src_mask (Tensor, optional): Boolean mask to be applied during src attention scores calculation.
                                         SHAPE: [batch_size, 1, 1, src_seq_len]. Defaults to None.

        Returns:
            Tensor: Output of the Decoder.
                    SHAPE: [batch_size, tgt_seq_len, d_model]
        """
        hidden_states = input
        for layer in self.decoder_layers:
            hidden_states = layer(input=hidden_states, 
                                  encoded_src=encoded_src, 
                                  tgt_mask=tgt_mask, 
                                  src_mask=src_mask)
        decoded_tgt = self.layer_norm(hidden_states)
        return decoded_tgt


# Refer to 'step_14_token_predictor.ipynb' to understand how this class works.
class TokenPredictor(nn.Module):
    """Turns the Decoder output vectors into log probability distributions over the target vocabulary."""

    def __init__(self, d_model: int, tgt_vocab_size: int):
        super().__init__()
        # Plain (non-module) attributes. They are not added to the list of parameters of the model.
        self.d_model = d_model
        self.vocab_size = tgt_vocab_size
        # Learnable projection from the d_model dimensional space to one logit per target vocabulary entry.
        self.linear = nn.Linear(d_model, tgt_vocab_size)
        # Normalizes the logits along the last (vocabulary) dimension.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, decoder_output: Tensor) -> Tensor:
        """Computes, for every token vector, the log probability of each entry of the vocabulary.

        Args:
            decoder_output (Tensor): Output of the Decoder.
                                     SHAPE: [batch_size, tgt_seq_len - 1, d_model]

        Returns:
            Tensor: Log probability distribution over the vocabulary (one distribution per token vector).
                    SHAPE: [batch_size, tgt_seq_len - 1, vocab_size]
        """
        # Project to vocab_size logits and apply log softmax in one go. Log probabilities are never
        # positive; log softmax is used for numerical stability during training. The most likely token
        # is unchanged because log is monotonic, so the argmax matches that of the plain softmax.
        return self.log_softmax(self.linear(decoder_output))
    
# -------------------------------------------------------------------------------------------------------------------
# CELL CONTAINING THE COPIED CODE FROM PREVIOUS NOTEBOOKS ENDS HERE.
# CELL CONTAINING THE COPIED CODE FROM PREVIOUS NOTEBOOKS ENDS HERE.
# CELL CONTAINING THE COPIED CODE FROM PREVIOUS NOTEBOOKS ENDS HERE.
# -------------------------------------------------------------------------------------------------------------------

## The Transformer model i.e., learning material specific to this notebook starts from here.

The input to the Machine Translation model is source tensors, source masks, target tensors, and target masks. <br>
The output of the Machine Translation model is a bunch of tensors where each tensor corresponds to the <br>
probability distribution over the target vocabulary. This bunch of tensors will be used to predict the exact <br>
tokens in later steps and output a readable sentence in the target language. <br>

To build a Machine Translation model, we need to instantiate all of the modules we have created in earlier <br>
notebooks. This instantiation should be done in the initialization function of the machine translation module. <br>
The inputs need to be passed through each of these modules appropriately, and the output of the last module is <br>
the output of the machine translation model. This is done in the forward function of the module.

In [7]:
# Let's go step by step and understand which modules are necessary.
# NOTE: This cell uses the hyperparameters d_model, src_vocab_size, tgt_vocab_size, dropout_prob, max_seq_len,
#       num_heads and d_feed_forward. They are not defined in this cell and must already exist in the kernel.
# Step 1:
#   -- We need embedding vectors for the source and target vocabulary. We made a choice to keep the tokens
#      separate for both languages, hence the two separate Embeddings modules.
src_embedding = Embeddings(vocab_size=src_vocab_size, embedding_dim=d_model)
tgt_embedding = Embeddings(vocab_size=tgt_vocab_size, embedding_dim=d_model)

# Step 2:
#   -- We need the positional encoding module to find the tensors corresponding to each token position and add
#      them to the embedding vectors. One instance is created for the source side and one for the target side.
src_positional_encoding = PositionalEncoding(encoding_size=d_model, dropout_prob=dropout_prob, max_len=max_seq_len)
tgt_positional_encoding = PositionalEncoding(encoding_size=d_model, dropout_prob=dropout_prob, max_len=max_seq_len)

# Step 3:
#   -- We need the Multi Headed Attention module to pass the token vectors through the attention layer in the
#      Encoder and the Decoder separately. We create 1 multi headed attention module and copy it (deepcopy) as
#      many times as necessary and pass the copies to the Encoder and Decoder layers accordingly.
multi_headed_attention = MultiHeadedAttention(num_heads=num_heads, d_model=d_model, dropout_prob=dropout_prob)

# Step 4:
#   -- We need the Feed Forward module to pass the token vectors through the Feedforward layer that expands and 
#      compresses the vector sizes. We again create 1 feed forward module and copy it (deepcopy) as many times
#      as necessary and pass the copies to the Encoder and Decoder layers accordingly.
feed_forward_nn = FeedForwardNN(d_model=d_model, d_feed_forward=d_feed_forward, dropout_prob=dropout_prob)

# Step 5:
#   -- We need the Encoder to encode the source input tokens and use it for decoding in the Decoder.
#   -- For this step-by-step walkthrough only a single EncoderLayer is built here. The full stack of layers
#      (Encoder) is assembled later inside the MachineTranslationModel class.
encoder_layer = EncoderLayer(self_attention=copy.deepcopy(multi_headed_attention), 
                             feed_forward=copy.deepcopy(feed_forward_nn), 
                             d_model=d_model, 
                             dropout_prob=dropout_prob)

# Step 6:
#   -- We need the Decoder that behaves differently during training and inference time.
#   -- Training time: Encode the target tokens and predict the next token for every current token in the input.
#   -- Testing time: Predict the target tokens using the already predicted tokens.
#   -- As in Step 5, only a single DecoderLayer is built here. It gets its own copies of the attention module
#      (one for self attention, one for source attention) and of the feed forward module.
decoder_layer = DecoderLayer(self_attention=copy.deepcopy(multi_headed_attention), 
                             src_attention=copy.deepcopy(multi_headed_attention),
                             feed_forward=copy.deepcopy(feed_forward_nn), 
                             d_model=d_model, 
                             dropout_prob=dropout_prob)

# Step 7:
#   -- We need to convert the output token vectors from the last Decoder to the probability distributions over
#      the target vocabulary. We need the token predictor module to be able to do this.
token_predictor = TokenPredictor(d_model=d_model, tgt_vocab_size=tgt_vocab_size)

#### Xavier Initialization

We instantiated all the modules necessary for the machine translation model. When the modules are initialized, all <br>
the learnable parameters are initialized to random values. This random initialization can make the training slower and <br>
make it harder to converge to a local minimum. Initializing the parameters in a specific way helps us <br>
avoid these issues. We use Xavier initialization in this model. To do this we iterate over all the learnable <br>
parameters of a module and initialize the ones with more than one dimension (i.e., the weights, not the biases) using Xavier initialization.

Let's generate some random source and target data to experiment with.

In [8]:
# THIS CELL CONTAINS SOME HELPER FUNCTIONS TO GENERATE RANDOM DATA FOR TESTING THE MODEL.

# Prints a tensor together with its name and shape so that the generated data is visible in the notebook.
def PrintTensor(input: Tensor, name: str):
    """Prints the name, the shape and the contents of a tensor, followed by a separator line.

    Args:
        input (Tensor): The tensor to be displayed.
        name (str): Label printed before the tensor so that the output can be identified.
    """
    header_fields = (("name: ", name), ("shape: ", input.shape))
    for label, value in header_fields:
        print(label, value)
    print(input)
    # Separator so that consecutive tensors are easy to tell apart in the cell output.
    print("-" * 150)

# Real data follows a specific format that the model expects. That format is ignored here: the random
# token ids are only meant for testing the model, not for training it.
def generate_batch_of_input_data(batch_size: int, seq_len: int, vocab_size: int) -> Tensor:
    """Creates a batch of random token ids drawn uniformly from [0, vocab_size).

    Args:
        batch_size (int): Number of sequences in the batch.
        seq_len (int): Number of tokens in every sequence.
        vocab_size (int): Exclusive upper bound for the generated token ids.

    Returns:
        Tensor: Random integer token ids.
                SHAPE: [batch_size, seq_len]
    """
    batch_shape = (batch_size, seq_len)
    return torch.randint(0, vocab_size, batch_shape)

# Real masks are correlated with the data they belong to. That correlation is ignored here: the random
# masks are only meant for testing the model, not for training it.
def construct_target_mask(batch_size: int, seq_len: int) -> Tensor:
    """Creates a random boolean mask with the shape of a target (self attention) mask.

    Args:
        batch_size (int): Number of sequences in the batch.
        seq_len (int): Number of tokens in every target sequence.

    Returns:
        Tensor: Boolean mask. Positions set to False will be masked out.
                SHAPE: [batch_size, 1, seq_len, seq_len]
    """
    noise = torch.randn((batch_size, 1, seq_len, seq_len))
    # The comparison already produces a boolean tensor: True wherever the sampled value exceeds 0.5.
    return noise.gt(0.5)

# Real masks are correlated with the data they belong to. That correlation is ignored here: the random
# masks are only meant for testing the model, not for training it.
def construct_source_mask(batch_size: int, seq_len: int) -> Tensor:
    """Creates a random boolean mask with the shape of a source attention mask.

    Args:
        batch_size (int): Number of sequences in the batch.
        seq_len (int): Number of tokens in every source sequence.

    Returns:
        Tensor: Boolean mask. Positions set to False will be masked out.
                SHAPE: [batch_size, 1, 1, seq_len]
    """
    noise = torch.randn((batch_size, 1, 1, seq_len))
    # The comparison already produces a boolean tensor: True wherever the sampled value exceeds 0.5.
    return noise.gt(0.5)

# Please note that these are randomly generated data and do not consider the restrictions that are present
# in the actual data. This is just for experimentation.
# NOTE: batch_size, seq_len, src_vocab_size and tgt_vocab_size are not defined in this cell and must already
#       exist in the kernel. The same seq_len is used for both the source and the target sequences here.
src = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, vocab_size=src_vocab_size)
PrintTensor(input=src, name="src")
src_mask = construct_source_mask(batch_size=batch_size, seq_len=seq_len)
PrintTensor(input=src_mask, name="src_mask")
tgt = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, vocab_size=tgt_vocab_size)
PrintTensor(input=tgt, name="tgt")
tgt_mask = construct_target_mask(batch_size=batch_size, seq_len=seq_len)
PrintTensor(input=tgt_mask, name="tgt_mask")

name:  src
shape:  torch.Size([2, 4])
tensor([[2, 0, 0, 0],
        [2, 1, 2, 0]])
------------------------------------------------------------------------------------------------------------------------------------------------------
name:  src_mask
shape:  torch.Size([2, 1, 1, 4])
tensor([[[[False, False,  True, False]]],


        [[[False,  True, False, False]]]])
------------------------------------------------------------------------------------------------------------------------------------------------------
name:  tgt
shape:  torch.Size([2, 4])
tensor([[0, 5, 3, 4],
        [1, 5, 0, 4]])
------------------------------------------------------------------------------------------------------------------------------------------------------
name:  tgt_mask
shape:  torch.Size([2, 1, 4, 4])
tensor([[[[ True, False,  True,  True],
          [False, False, False, False],
          [ True, False, False,  True],
          [False, False,  True,  True]]],


        [[[ True, False, False, 

### Forward function logic of the Machine Translation model.

#### Encoder Logic

In [9]:
# Retrieve the embedding vectors for the tokens in the source tensors. Every token id in `src` is
# replaced by its embedding vector, which adds a trailing embedding dimension to the tensor.
src_embedding_vectors = src_embedding(src)
print(f"shape of src_embedding_vectors: {src_embedding_vectors.shape}")
print(f"src_embedding_vectors: {src_embedding_vectors}")

shape of src_embedding_vectors: torch.Size([2, 4, 8])
src_embedding_vectors: tensor([[[-1.0407, -2.4498, -2.0050, -0.4954,  0.8531, -1.1112, -3.4147,
          -1.7112],
         [-4.3891, -1.1112,  1.3253,  0.2707,  3.3873, -5.7342,  1.6317,
          -1.5066],
         [-4.3891, -1.1112,  1.3253,  0.2707,  3.3873, -5.7342,  1.6317,
          -1.5066],
         [-4.3891, -1.1112,  1.3253,  0.2707,  3.3873, -5.7342,  1.6317,
          -1.5066]],

        [[-1.0407, -2.4498, -2.0050, -0.4954,  0.8531, -1.1112, -3.4147,
          -1.7112],
         [-2.7075, -1.1179,  0.4477,  0.0810,  4.1930, -3.6844, -0.5976,
          -2.0886],
         [-1.0407, -2.4498, -2.0050, -0.4954,  0.8531, -1.1112, -3.4147,
          -1.7112],
         [-4.3891, -1.1112,  1.3253,  0.2707,  3.3873, -5.7342,  1.6317,
          -1.5066]]], grad_fn=<MulBackward0>)


In [10]:
# Add position information to the source embedding vectors. The shape stays the same. The
# PositionalEncoding module also contains a Dropout layer, which is why some of the entries in the
# output can be exactly zero.
src_embedding_vectors_with_position = src_positional_encoding(src_embedding_vectors)
print(f"shape of src_embedding_vectors_with_position: {src_embedding_vectors_with_position.shape}")
print("src_embedding_vectors_with_position: ", src_embedding_vectors_with_position)

shape of src_embedding_vectors_with_position: torch.Size([2, 4, 8])
src_embedding_vectors_with_position:  tensor([[[-1.1563, -1.6109, -0.0000,  0.5607,  0.9479, -0.1235, -3.7941,
          -0.7902],
         [-3.9418, -0.6343,  0.0000,  1.4063,  3.7748, -5.2603,  1.8141,
          -0.0000],
         [-3.8665, -1.6970,  1.6933,  1.3897,  3.7859, -5.2605,  1.8152,
          -0.5628],
         [-4.7200, -2.3346,  1.8009,  1.3622,  3.7970, -5.2608,  1.8163,
          -0.5628]],

        [[-1.1563, -1.6109, -2.2278,  0.5607,  0.0000, -0.1235, -3.7941,
          -0.7902],
         [-2.0733, -0.6418,  0.6084,  1.1956,  4.6700, -2.9827, -0.6628,
          -1.2096],
         [-0.1460, -3.1844, -2.0070,  0.5385,  0.9701, -0.1238, -3.7919,
          -0.7902],
         [-4.7200, -2.3346,  1.8009,  1.3622,  3.7970, -0.0000,  1.8163,
          -0.0000]]], grad_fn=<MulBackward0>)


In [11]:
# Now, pass the position-aware source embedding vectors through `encoder_layer` to get the encoded source
# sequences. Note that `encoder_layer` is a single EncoderLayer and not the full Encoder stack; it is
# enough to demonstrate the flow of the data.
encoded_src = encoder_layer(input=src_embedding_vectors_with_position, mask=src_mask)
print(f"shape of encoded_src: {encoded_src.shape}")
print(f"encoded_src: {encoded_src}")

shape of encoded_src: torch.Size([2, 4, 8])
encoded_src: tensor([[[-1.0788e+00, -2.4987e+00,  2.4476e-01,  2.3449e-01,  1.7933e+00,
           5.0035e-03, -3.9426e+00, -1.3105e+00],
         [-4.2757e+00, -1.3240e+00,  3.2981e-01,  1.3144e+00,  4.4227e+00,
          -5.3738e+00,  1.4156e+00, -5.6603e-01],
         [-4.0959e+00, -2.2578e+00,  2.1010e+00,  1.3444e+00,  4.3172e+00,
          -5.3181e+00,  1.6782e+00, -1.2229e+00],
         [-5.0831e+00, -3.0675e+00,  2.2088e+00,  1.2976e+00,  4.4506e+00,
          -5.2016e+00,  1.4037e+00, -1.1183e+00]],

        [[-1.0684e+00, -2.2803e+00, -2.5831e+00,  2.8138e-01,  3.1579e-01,
           2.1795e-01, -4.6701e+00, -1.4581e+00],
         [-1.9891e+00, -1.4342e+00,  4.1396e-01,  1.0769e+00,  4.9344e+00,
          -2.5030e+00, -1.0236e+00, -1.5172e+00],
         [-6.0431e-03, -3.9125e+00, -1.9845e+00,  5.1564e-01,  1.0958e+00,
          -7.5007e-02, -3.9750e+00, -1.5933e+00],
         [-4.7928e+00, -3.0232e+00,  1.8152e+00,  1.3983e+00,  4.1

#### Decoder Logic

In [12]:
# Retrieve the embedding vectors for the target tokens. Every token id in `tgt` is replaced by its
# embedding vector, which adds a trailing embedding dimension to the tensor.
tgt_embedding_vectors = tgt_embedding(tgt)
print(f"shape of tgt_embedding_vectors: {tgt_embedding_vectors.shape}")
print(f"tgt_embedding_vectors: {tgt_embedding_vectors}")

shape of tgt_embedding_vectors: torch.Size([2, 4, 8])
tgt_embedding_vectors: tensor([[[-4.8951, -0.2630,  3.4337,  0.1546, -1.7134,  6.4791,  7.0275,
          -1.8557],
         [ 4.0292, -0.3557, -3.2353,  4.4224, -1.0516,  2.3604,  0.1306,
          -0.3515],
         [-1.0533, -2.1132, -5.1767,  5.0605, -0.5604, -4.0133, -0.7780,
           4.1544],
         [ 1.8254,  4.6595,  1.5401,  3.3508,  1.2806, -2.0570,  1.5998,
           2.9175]],

        [[-2.2948, -3.6065,  0.2519, -1.1297, -2.9740,  3.1283,  2.7748,
          -3.7646],
         [ 4.0292, -0.3557, -3.2353,  4.4224, -1.0516,  2.3604,  0.1306,
          -0.3515],
         [-4.8951, -0.2630,  3.4337,  0.1546, -1.7134,  6.4791,  7.0275,
          -1.8557],
         [ 1.8254,  4.6595,  1.5401,  3.3508,  1.2806, -2.0570,  1.5998,
           2.9175]]], grad_fn=<MulBackward0>)


In [13]:
# Add position information to the target embedding vectors. The shape stays the same. The target side
# uses its own PositionalEncoding instance (`tgt_positional_encoding`).
tgt_embedding_vectors_with_position = tgt_positional_encoding(tgt_embedding_vectors)
print(f"shape of tgt_embedding_vectors_with_position: {tgt_embedding_vectors_with_position.shape}")
print(f"tgt_embedding_vectors_with_position: {tgt_embedding_vectors_with_position}")

shape of tgt_embedding_vectors_with_position: torch.Size([2, 4, 8])
tgt_embedding_vectors_with_position: tensor([[[-5.4390,  0.0000,  3.8153,  1.2829, -1.9037,  8.3101,  7.8083,
          -0.9507],
         [ 5.4119,  0.2052, -3.4839,  6.0193, -0.0000,  3.7337,  0.1463,
           0.7205],
         [-0.1600, -2.8104, -5.5311,  6.7118, -0.6004, -3.3483, -0.8622,
           5.7271],
         [ 0.0000,  4.0772,  2.0396,  4.7845,  1.4562, -1.1750,  1.7809,
           4.3527]],

        [[-2.5498, -2.8962,  0.2799, -0.1441, -3.3044,  4.5870,  3.0831,
          -3.0718],
         [ 5.4119,  0.2052, -3.4839,  6.0193, -1.1573,  3.7337,  0.1463,
           0.7205],
         [-4.4286, -0.7546,  4.0360,  1.2608, -1.8815,  0.0000,  7.8105,
          -0.9507],
         [ 2.1850,  4.0772,  0.0000,  4.7845,  1.4562, -1.1750,  1.7809,
           4.3527]]], grad_fn=<MulBackward0>)


In [14]:
# Pass the position-aware target embedding vectors through `decoder_layer` to get the decoder output.
# Note that `decoder_layer` is a single DecoderLayer and not the full Decoder stack. It also receives
# the encoded source sequences and both masks.
decoder_output = decoder_layer(input=tgt_embedding_vectors_with_position, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)
print(f"shape of decoder_output: {decoder_output.shape}")
print(f"decoder_output: {decoder_output}")

shape of decoder_output: torch.Size([2, 4, 8])
decoder_output: tensor([[[-6.8903, -1.3032,  3.6175,  0.8016, -0.4512,  7.4971,  7.2742,
          -1.8167],
         [ 4.3756, -1.7061, -3.8398,  5.0032,  1.0308,  3.3971,  0.0600,
          -0.0650],
         [-0.8664, -5.0414, -5.2303,  5.9490,  0.1336, -4.6500, -2.4436,
           5.2300],
         [-0.8941,  1.9210,  1.9445,  4.3427,  1.8343, -2.5898, -0.1082,
           3.7365]],

        [[-2.9524, -3.6451, -0.0892, -0.2469, -3.8061,  3.0214,  1.5248,
          -3.0172],
         [ 4.3414, -0.2938, -4.1295,  5.2066, -1.5429,  2.8994, -0.8386,
           0.2133],
         [-4.8313, -2.0372,  3.7531,  1.7165, -2.4956, -1.3718,  6.2177,
          -1.5337],
         [ 2.2659,  3.4722, -0.6654,  4.2460,  2.0903, -1.9015,  1.5401,
           3.8531]]], grad_fn=<AddBackward0>)


#### Probability Distribution Prediction Logic

In [15]:
# Convert every decoder output vector into a log probability distribution over the target vocabulary.
# The last dimension of the result has one entry per token in the target vocabulary. All the values are
# negative because the TokenPredictor returns log probabilities (log softmax).
target_probability_distributions = token_predictor(decoder_output)
print(f"shape of target_probability_distributions: {target_probability_distributions.shape}")
print(f"target_probability_distributions: {target_probability_distributions}")

shape of target_probability_distributions: torch.Size([2, 4, 6])
target_probability_distributions: tensor([[[-10.7180,  -4.4458,  -5.5887, -10.0208,  -3.8439,  -0.0376],
         [ -2.4640,  -1.3492,  -2.1765,  -0.6614,  -4.6119,  -4.1387],
         [ -1.5124,  -6.6308,  -2.7857,  -0.3349,  -7.4957,  -7.3191],
         [ -3.7088,  -2.0425,  -1.4831,  -0.6633,  -2.7351,  -3.2484]],

        [[ -3.0387,  -3.7377,  -2.1507,  -4.2689,  -2.2149,  -0.3729],
         [ -3.4303,  -2.7862,  -2.3919,  -0.2166,  -5.6044,  -5.1828],
         [ -5.1930,  -4.8207,  -2.5420,  -6.9290,  -1.0418,  -0.5908],
         [ -3.6386,  -1.8577,  -2.1418,  -0.4184,  -3.3989,  -4.7402]]],
       grad_fn=<LogSoftmaxBackward0>)


### Now let's combine all the code from above into a class.

In [16]:
# We will now create the Transformer model by logically combining all the components we have created so far.
class MachineTranslationModel(nn.Module):
    """Machine translation Transformer: token embeddings and positional encodings feed an Encoder and a Decoder,
    and a TokenPredictor converts the Decoder output into log probabilities over the target vocabulary."""

    def __init__(self,
                 d_model: int,
                 d_feed_forward: int,
                 dropout_prob: float,
                 num_heads: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 num_layers: int,
                 max_seq_len: int):
        """Builds all the child modules of the Transformer model and initializes their parameters.

        Args:
            d_model (int): size of the embedding vectors in the model.
            d_feed_forward (int): Number of neurons in the hidden layer of the feed forward neural network.
            dropout_prob (float): probability with which to drop data for regularization in the transformer model.
            num_heads (int): number of attention heads in each of the multi-head attention layers in the model.
            src_vocab_size (int): size of the source vocabulary.
            tgt_vocab_size (int): size of the target vocabulary.
            num_layers (int): number of layers in the Encoder and in the Decoder.
            max_seq_len (int): Maximum length of the sequence that is ever input to the model.
        """
        super().__init__()
        # The source and the target language have separate vocabularies, hence separate lookup tables.
        self.src_embedding = Embeddings(vocab_size=src_vocab_size, embedding_dim=d_model)
        self.tgt_embedding = Embeddings(vocab_size=tgt_vocab_size, embedding_dim=d_model)
        # Two PositionalEncoding instances are needed: the module contains a Dropout layer that is applied
        # independently on the source side and on the target side.
        self.src_positional_encoding = PositionalEncoding(encoding_size=d_model, dropout_prob=dropout_prob, max_len=max_seq_len)
        self.tgt_positional_encoding = PositionalEncoding(encoding_size=d_model, dropout_prob=dropout_prob, max_len=max_seq_len)
        # Everything held in a local variable below is only a template. The templates are never assigned to
        # `self`, so they are not child modules of the MachineTranslationModel; they are only used to create
        # the child modules.
        attention_template = MultiHeadedAttention(num_heads=num_heads, d_model=d_model, dropout_prob=dropout_prob)
        feed_forward_template = FeedForwardNN(d_model=d_model, d_feed_forward=d_feed_forward, dropout_prob=dropout_prob)
        # Settings shared by the EncoderLayer and the DecoderLayer.
        layer_settings = dict(d_model=d_model, dropout_prob=dropout_prob)
        encoder_layer_template = EncoderLayer(self_attention=copy.deepcopy(attention_template),
                                              feed_forward=copy.deepcopy(feed_forward_template),
                                              **layer_settings)
        decoder_layer_template = DecoderLayer(self_attention=copy.deepcopy(attention_template),
                                              src_attention=copy.deepcopy(attention_template),
                                              feed_forward=copy.deepcopy(feed_forward_template),
                                              **layer_settings)
        # encoder, decoder and token_predictor are child modules of the MachineTranslationModel.
        self.encoder = Encoder(encoder_layer=encoder_layer_template, num_layers=num_layers)
        self.decoder = Decoder(decoder_layer=decoder_layer_template, num_layers=num_layers)
        self.token_predictor = TokenPredictor(d_model=d_model, tgt_vocab_size=tgt_vocab_size)
        # Must run last so that every registered parameter is visited.
        self.initialize_model_parameters()

    def initialize_model_parameters(self):
        """Re-initializes, in place, every parameter that has more than one dimension with Xavier Uniform."""
        # Only the weights (more than one dimension) are touched. One dimensional parameters such as the
        # biases are skipped and keep whatever values their module gave them at construction time.
        multi_dim_params = (param for param in self.parameters() if param.dim() > 1)
        for weight in multi_dim_params:
            nn.init.xavier_uniform_(weight)

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor, tgt_mask: Tensor) -> Tensor:
        """The forward pass of the Transformer model: encode the source sequences, decode the target sequences
           against the encoded source, and convert the Decoder output into log probability distributions over
           the target vocabulary.

        NOTE(review): the decoder-side length is written as 'tgt_seq_len - 1' below, presumably because the
        last token of every target sequence is dropped before it is fed to the model during training --
        confirm against the training notebook. In every case the length of `tgt`, the last two dimensions of
        `tgt_mask` and the second dimension of the output refer to the same number of tokens.

        Args:
            src (Tensor): Source sequences (English) containing the token ids corresponding to the indices in the src vocabulary.
                          Example input looks like [[0, 4, 55, 67, 1, 2, 2], [0, 42, 585, 967, 19, 26, 1]]
                          SHAPE: [batch_size, src_seq_len]
            tgt (Tensor): Target sequences (Telugu) containing the token ids corresponding to the indices in the tgt vocabulary.
                          Example input looks like [[0, 3, 5, 677, 81, 1, 2], [0, 7, 67, 190, 3245, 9, 1]]
                          SHAPE: [batch_size, tgt_seq_len - 1]
            src_mask (Tensor): Mask to be applied to the source sequences in each of the attention heads.
                               SHAPE: [batch_size, 1, 1, src_seq_len]
            tgt_mask (Tensor): Mask to be applied to the target sequences in each of the attention heads.
                               SHAPE: [batch_size, 1, tgt_seq_len - 1, tgt_seq_len - 1]

        Returns:
            Tensor: Log probability distribution over the tokens in the target vocabulary (Telugu vocabulary).
                    SHAPE: [batch_size, tgt_seq_len - 1, tgt_vocab_size]
        """
        # Encode the source sequences once; the result is consumed by the Decoder.
        source_memory = self.encode(src=src, src_mask=src_mask)
        # Decode the target sequences and convert the resulting vectors into log probability distributions.
        return self.generate_tgt_token_prob_distributions(
            decoded_tgt=self.decode(tgt=tgt, tgt_mask=tgt_mask, encoded_src=source_memory, src_mask=src_mask)
        )

    def encode(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """Encodes the source sentences (English).

        Args:
            src (Tensor): A batch of source sequences containing the token ids corresponding to the indices in the src vocabulary.
                          SHAPE: [batch_size, src_seq_len]
            src_mask (Tensor): Mask to be applied to the source sequences in each of the attention heads. Same mask will be
                               applied to the sequence in all the attention heads.
                               SHAPE: [batch_size, 1, 1, src_seq_len]

        Returns:
            Tensor: Encoded source sequences. Each token in the source sequence is represented by a vector that encodes
                    all the information about the token and its relationship with other tokens in the sequence.
                    SHAPE: [batch_size, src_seq_len, d_model]
        """
        # Token ids -> embedding vectors -> embedding vectors with position information.
        positioned_src = self.src_positional_encoding(self.src_embedding(src))
        # Pass the position-aware vectors through the encoder.
        return self.encoder(input=positioned_src, mask=src_mask)

    def decode(self, tgt: Tensor, encoded_src: Tensor, src_mask: Tensor, tgt_mask: Tensor) -> Tensor:
        """Encodes each token in the target sequence using the information from the source sequences and the
        associations between tokens in the target sequences.

        Args:
            tgt (Tensor): A batch of target sequences containing the token ids corresponding to the indices in the tgt vocabulary.
                          SHAPE: [batch_size, tgt_seq_len - 1]
            encoded_src (Tensor): The encoded token representations of the source sequences. This is used to calculate the
                                  source attention scores for the target sentence.
                                  SHAPE: [batch_size, src_seq_len, d_model]
            src_mask (Tensor): Mask to be applied to the source sequences in each of the attention heads. Same mask will be
                               applied to the sequence in all the attention heads.
                               SHAPE: [batch_size, 1, 1, src_seq_len]
            tgt_mask (Tensor): Mask to be applied to the target sequences in each of the attention heads. Same mask will be
                               applied to the sequence in all the attention heads.
                               SHAPE: [batch_size, 1, tgt_seq_len - 1, tgt_seq_len - 1]

        Returns:
            Tensor: Encoded (or Decoded if that makes more sense to you) target sequences. Each token in the target
                    sequence is represented by a vector that encodes all the information about the token and its
                    relationship with other tokens in the target sequence and the corresponding source sequences.
                    SHAPE: [batch_size, tgt_seq_len - 1, d_model]
        """
        # Token ids -> embedding vectors -> embedding vectors with position information.
        positioned_tgt = self.tgt_positional_encoding(self.tgt_embedding(tgt))
        # Pass the position-aware vectors through the decoder together with the encoded source sequences.
        return self.decoder(input=positioned_tgt, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)

    def generate_tgt_token_prob_distributions(self, decoded_tgt: Tensor) -> Tensor:
        """Takes the output of the decoder and generates the probability distribution for each token over the target vocabulary.

        Args:
            decoded_tgt (Tensor): The output of the decoder. Each token in the target sequence is represented by a vector that
                                  encodes all the information about the token and its relationship with other tokens in the
                                  target sequence and the corresponding source sequences.
                                  SHAPE: [batch_size, tgt_seq_len - 1, d_model]

        Returns:
            Tensor: Log probability distribution over the tokens in the target vocabulary (Telugu vocabulary in this case).
                    SHAPE: [batch_size, tgt_seq_len - 1, tgt_vocab_size]
        """
        # The log probabilities produced here are what the loss is calculated from in the training phase.
        return self.token_predictor(decoded_tgt)

In [17]:
# Create an instance of the MachineTranslationModel class using the hyperparameters that are already defined
# in the kernel. Printing a module shows the tree of its child modules, which is a quick way to check
# that the Encoder, the Decoder and the TokenPredictor were put together as expected.
machine_translation_transformer = MachineTranslationModel(d_model=d_model, 
                                                          d_feed_forward=d_feed_forward,
                                                          dropout_prob=dropout_prob, 
                                                          num_heads=num_heads, 
                                                          src_vocab_size=src_vocab_size, 
                                                          tgt_vocab_size=tgt_vocab_size, 
                                                          num_layers=num_layers,
                                                          max_seq_len=max_seq_len)
print(machine_translation_transformer)

MachineTranslationModel(
  (src_embedding): Embeddings(
    (look_up_table): Embedding(6, 8)
  )
  (tgt_embedding): Embeddings(
    (look_up_table): Embedding(6, 8)
  )
  (src_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (tgt_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attention): MultiHeadedAttention(
          (dropout_layer): Dropout(p=0.1, inplace=False)
          (linear_layers): ModuleList(
            (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
          )
        )
        (feed_forward): FeedForwardNN(
          (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
          (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
          (dropout_layer): Dropout(p=0.1, inplace=False)
        )
        (sublayer_wrappers): ModuleList(
       

In [18]:
# Pass the input data through the model to get the output.
# NOTE: despite its name, `decoded_tgt` does not hold the raw Decoder output here. The model's forward pass
#       ends with the TokenPredictor, so this tensor holds the log probability distributions over the
#       target vocabulary.
decoded_tgt = machine_translation_transformer(src=src, tgt=tgt, src_mask=src_mask, tgt_mask=tgt_mask)
PrintTensor(input=decoded_tgt, name="decoded_tgt")

name:  decoded_tgt
shape:  torch.Size([2, 4, 6])
tensor([[[-2.8528, -1.0213, -4.5495, -1.1803, -2.0719, -1.9770],
         [-2.2348, -1.3514, -5.0215, -0.7608, -2.2693, -2.8676],
         [-2.8250, -0.5687, -4.7799, -1.3647, -2.7904, -3.0120],
         [-2.9324, -1.1844, -5.0353, -0.6135, -3.3147, -2.8737]],

        [[-2.4639, -0.3513, -4.3406, -2.1864, -3.0747, -3.2306],
         [-2.1477, -0.6888, -4.7810, -1.4282, -2.4217, -3.1191],
         [-3.2905, -0.5929, -4.4174, -1.5028, -2.5175, -2.3557],
         [-3.1885, -0.5061, -4.4754, -1.2970, -3.2141, -3.4739]]],
       grad_fn=<LogSoftmaxBackward0>)
------------------------------------------------------------------------------------------------------------------------------------------------------


## Let's verify that the number of parameters in the model is the same as what we expect.

Counting the number of parameters manually and comparing it with the number of learnable parameters in the <br>
model helps debug lots of issues.

In [19]:
# Find out the number of parameters in the built model. numel() returns the number of elements (scalars)
# in a parameter tensor, so the sums below count individual scalar parameters.
total_params = sum(params.numel() for params in machine_translation_transformer.parameters())
print("total_params: ", total_params)
# Parameters that require gradients, i.e., the ones that get updated during training.
total_params_with_grad = sum(params.numel() for params in machine_translation_transformer.parameters() if params.requires_grad)
print("total_params_with_grad: ", total_params_with_grad)
# Parameters that do not require gradients (frozen parameters).
total_params_without_grad = sum(params.numel() for params in machine_translation_transformer.parameters() if not params.requires_grad)
print("total_params_without_grad: ", total_params_without_grad)
# Sanity check: every parameter belongs to exactly one of the two groups above.
assert total_params == total_params_with_grad + total_params_without_grad

total_params:  9206
total_params_with_grad:  9206
total_params_without_grad:  0


In [20]:
# List every parameter tensor registered in the model along with the number of values it holds. The names are the 
# dotted paths to the tensors within the model (for example 'src_embedding.look_up_table.weight'). So, this 
# listing can be compared line by line with the manual count.
for name, params in machine_translation_transformer.named_parameters():
    num_elements = params.numel()
    print(name, " ", num_elements)

src_embedding.look_up_table.weight   48
tgt_embedding.look_up_table.weight   48
encoder.encoder_layers.0.self_attention.linear_layers.0.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.0.bias   8
encoder.encoder_layers.0.self_attention.linear_layers.1.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.1.bias   8
encoder.encoder_layers.0.self_attention.linear_layers.2.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.2.bias   8
encoder.encoder_layers.0.self_attention.linear_layers.3.weight   64
encoder.encoder_layers.0.self_attention.linear_layers.3.bias   8
encoder.encoder_layers.0.feed_forward.linear_layer_1.weight   128
encoder.encoder_layers.0.feed_forward.linear_layer_1.bias   16
encoder.encoder_layers.0.feed_forward.linear_layer_2.weight   128
encoder.encoder_layers.0.feed_forward.linear_layer_2.bias   8
encoder.encoder_layers.0.sublayer_wrappers.0.layer_norm.weight   8
encoder.encoder_layers.0.sublayer_wrappers.0.layer_norm.bias   8

### Counting the number of trainable parameters in the transformer model manually

In [21]:
# Lets try to count the number of parameters in the model manually by going through each component and counting
# parameters.
#
# Only two kinds of parameterised building blocks repeat throughout the model: linear layers and Layer
# Normalization layers. The two helpers below hold the formula for each of them so that it is written (and
# explained) exactly once.
def count_linear_layer_params(in_features: int, out_features: int) -> int:
    """Returns the number of parameters in a linear layer that has a bias term.

    The weight matrix holds in_features * out_features values and there is 1 bias term per output feature.
    For example, a linear layer that maps 8-dimensional vectors to 8-dimensional vectors has
    8 * 8 (weights) + 8 (bias) = 72 parameters.

    Args:
        in_features: Size of each input vector.
        out_features: Size of each output vector.

    Returns:
        Total number of weights and biases in the linear layer.
    """
    return in_features * out_features + out_features


def count_layer_norm_params(num_features: int) -> int:
    """Returns the number of parameters in a Layer Normalization layer.

    Layer Normalization is applied along the last dimension of its input and each feature is scaled and shifted
    independently with learned parameters i.e., 1 gamma and 1 beta per feature. For example, normalizing
    8-dimensional vectors needs 8 (gamma) + 8 (beta) = 16 parameters.

    Args:
        num_features: Size of the last dimension of the input to the Layer Normalization layer.

    Returns:
        Total number of gamma and beta parameters in the Layer Normalization layer.
    """
    return num_features + num_features


# The number of parameters associated with the Embeddings class for src sentences. We have 1 embedding vector
# per token in the source vocabulary. We have 6 tokens and each token is represented by an 8-dimensional vector.
# So, the total number of parameters associated with the Embeddings class for src sentences is 6 * 8 = 48.
num_src_embedding_params = src_vocab_size * d_model
# The number of parameters associated with the Embeddings class for tgt sentences. We have 1 embedding vector
# per token in the target vocabulary. We have 6 tokens and each token is represented by an 8-dimensional vector.
# So, the total number of parameters associated with the Embeddings class for tgt sentences is 6 * 8 = 48.
num_tgt_embedding_params = tgt_vocab_size * d_model
# There are no parameters associated with the PositionalEncoding class. These are calculated based on a predefined
# formula and are not learned during the training process.
num_positional_encoding_params = 0

# ----------------------------------------------------- ENCODER -----------------------------------------------------
# Now, lets calculate the number of parameters associated with the Encoder class. The Encoder class has 6
# identical EncoderLayers stacked on top of each other. Lets calculate the number of parameters associated with
# each EncoderLayer class. The EncoderLayer class has MultiHeadedAttention and FeedForwardNN classes as its child
# classes. Each MultiHeadedAttention class has 4 linear layers (query, key, value and output). Note that a single
# linear layer is used to calculate the queries, keys, values and outputs for all the heads. So, we don't need to
# do this calculation for each head separately. Lets take the linear layer associated with the query calculation.
# The input to this linear layer is an 8-dimensional vector (d_model) and the output is also an 8-dimensional
# vector (d_model=8). So, the number of parameters in this linear layer associated with the weight matrix is
# 8 * 8 = 64. We also have bias terms (d_model=8) associated with this linear layer. So, the total number of
# parameters associated with the query linear layer is 64 + 8 = 72. The same holds for the key, value and output
# linear layers.
num_encoder_query_params = count_linear_layer_params(d_model, d_model)
num_encoder_key_params = count_linear_layer_params(d_model, d_model)
num_encoder_value_params = count_linear_layer_params(d_model, d_model)
num_encoder_attention_output_params = count_linear_layer_params(d_model, d_model)
# Now lets calculate the number of parameters associated with FeedForward neural network class in the EncoderLayer.
# The first linear layer in the feed forward expands the input to a higher dimension (d_model to d_feed_forward).
# The input to this linear layer is an 8-dimensional vector (d_model) and the output is a 16-dimensional vector
# (d_feed_forward). So, the total number of parameters associated with the first linear layer in the feed forward
# neural network is 8 * 16 (weights) + 16 (bias) = 144.
num_encoder_feed_forward_linear_layer_1_params = count_linear_layer_params(d_model, d_feed_forward)
# The second linear layer in the feed forward neural network compresses the input back to its original dimension
# (d_feed_forward to d_model). The input to this linear layer is a 16-dimensional vector (d_feed_forward) and the
# output is an 8-dimensional vector (d_model). So, the total number of parameters associated with the second linear
# layer in the feed forward neural network is 16 * 8 (weights) + 8 (bias) = 136.
num_encoder_feed_forward_linear_layer_2_params = count_linear_layer_params(d_feed_forward, d_model)
# The output of MultiHeadedAttention and FeedForward neural network is normalized using Layer Normalization. Both
# the output of MultiHeadedAttention and FeedForward neural network have the same size in the last dimension which
# is 8 (d_model). So, the number of parameters associated with the Layer Normalization layer that is applied after
# MultiHeadedAttention is 8 (gamma) + 8 (beta) = 16. Similarly, the number of parameters associated with the Layer
# Normalization layer that is applied after FeedForward neural network is 8 (gamma) + 8 (beta) = 16.
num_encoder_attention_layer_norm_params = count_layer_norm_params(d_model)
num_encoder_feed_forward_layer_norm_params = count_layer_norm_params(d_model)
# The total number of parameters associated with a single EncoderLayer is the sum of the above 8 variables
# = (4 * 72) + 144 + 136 + (2 * 16) = 600.
num_encoder_layer_params = (
    num_encoder_query_params
    + num_encoder_key_params
    + num_encoder_value_params
    + num_encoder_attention_output_params
    + num_encoder_feed_forward_linear_layer_1_params
    + num_encoder_feed_forward_linear_layer_2_params
    + num_encoder_attention_layer_norm_params
    + num_encoder_feed_forward_layer_norm_params
)
# We also apply Layer Normalization to the output of the last EncoderLayer and pass this as the output of the Encoder.
# This layer again has same number of parameters associated with it as other Layer Normalization layers i.e.,
# 8 (gamma) + 8 (beta) = 16
num_encoder_layer_layer_norm_params = count_layer_norm_params(d_model)
# The transformer model has 6 EncoderLayers stacked on top of each other. So, the total number of parameters
# associated with Encoder is (6 * 600) + 16 = 3616
num_total_encoder_params = (num_layers * num_encoder_layer_params) + num_encoder_layer_layer_norm_params

# ----------------------------------------------------- DECODER -----------------------------------------------------
# Now, lets calculate the number of parameters associated with the DecoderLayer and Decoder.
# The method to calculate number of parameters in the DecoderLayer is very similar to how it was done for the
# EncoderLayer. DecoderLayer just contains 1 additional MultiHeadedAttention Layer (for source attention) and 1
# additional Layer Normalization layer associated with this source attention layer.
# Same as in EncoderLayer ==> Linear Layer parameters ==> 64 (weights) + 8 (bias) = 72
num_decoder_self_attention_query_params = count_linear_layer_params(d_model, d_model)
num_decoder_self_attention_key_params = count_linear_layer_params(d_model, d_model)
num_decoder_self_attention_value_params = count_linear_layer_params(d_model, d_model)
num_decoder_self_attention_output_params = count_linear_layer_params(d_model, d_model)
# The next 4 variables correspond to the 1 additional MultiHeadedAttention layer (src attention) present in the
# DecoderLayer.
# Same as in EncoderLayer ==> Linear Layer parameters ==> 64 (weights) + 8 (bias) = 72
num_decoder_src_attention_query_params = count_linear_layer_params(d_model, d_model)
num_decoder_src_attention_key_params = count_linear_layer_params(d_model, d_model)
num_decoder_src_attention_value_params = count_linear_layer_params(d_model, d_model)
num_decoder_src_attention_output_params = count_linear_layer_params(d_model, d_model)
# The FeedForward neural network is exactly the same as in EncoderLayer.
# Same as in EncoderLayer ==> 8 * 16 + 16 = 144
num_decoder_feed_forward_linear_layer_1_params = count_linear_layer_params(d_model, d_feed_forward)
# Same as in EncoderLayer ==> 16 * 8 + 8 = 136
num_decoder_feed_forward_linear_layer_2_params = count_linear_layer_params(d_feed_forward, d_model)
# We have 1 additional LayerNormalization layer associated with the source attention. However, its architecture and
# the parameters are the same as in EncoderLayer.
# Same as in EncoderLayer ==> 8 (gamma) + 8 (beta) = 16
num_decoder_self_attention_layer_norm_params = count_layer_norm_params(d_model)
num_decoder_src_attention_layer_norm_params = count_layer_norm_params(d_model)
num_decoder_feed_forward_layer_norm_params = count_layer_norm_params(d_model)
# The total number of parameters associated with a single DecoderLayer is the sum of the above 13 variables
# = (8 * 72) + 144 + 136 + (3 * 16) = 904.
num_decoder_layer_params = (
    num_decoder_self_attention_query_params
    + num_decoder_self_attention_key_params
    + num_decoder_self_attention_value_params
    + num_decoder_self_attention_output_params
    + num_decoder_src_attention_query_params
    + num_decoder_src_attention_key_params
    + num_decoder_src_attention_value_params
    + num_decoder_src_attention_output_params
    + num_decoder_feed_forward_linear_layer_1_params
    + num_decoder_feed_forward_linear_layer_2_params
    + num_decoder_self_attention_layer_norm_params
    + num_decoder_src_attention_layer_norm_params
    + num_decoder_feed_forward_layer_norm_params
)
# We also apply Layer Normalization to the output of the last DecoderLayer and pass this as the output of the Decoder.
# This layer again has same number of parameters associated with it as other Layer Normalization layers i.e.,
# 8 (gamma) + 8 (beta) = 16
num_decoder_layer_layer_norm_params = count_layer_norm_params(d_model)
# The transformer model has 6 DecoderLayers stacked on top of each other. So, the total number of parameters
# associated with Decoder is (6 * 904) + 16 = 5440
num_total_decoder_params = (num_layers * num_decoder_layer_params) + num_decoder_layer_layer_norm_params

# ------------------------------------------------- TOKEN PREDICTOR -------------------------------------------------
# The output of the decoder is passed to a linear layer that projects the output to the target vocabulary space.
# These parameters are associated with the TokenPredictor layer in the transformer above. The input to the linear
# layer are 8-dimensional vectors (d_model) and output of the linear layers are 6-dimensional vectors (tgt_vocab_size).
# So, the number of parameters associated with the TokenPredictor layer is 8 * 6 (weights) + 6 (bias) = 54
num_vocab_projection_params = count_linear_layer_params(d_model, tgt_vocab_size)

# ------------------------------------------------------ TOTAL ------------------------------------------------------
# Finally, the total number of parameters in the model is the number of parameters associated with the Embeddings plus
# the number of parameters in the Encoder plus the number of parameters in the Decoder plus the number of parameters
# in the TokenPredictor i.e., 48 + 48 + 3616 + 5440 + 54 = 9206.
num_total_model_params = (
    num_src_embedding_params
    + num_tgt_embedding_params
    + num_total_encoder_params
    + num_total_decoder_params
    + num_vocab_projection_params
)
print("Total Number of parameters associated with the model: ", num_total_model_params)


Total Number of parameters associated with the model:  9206


In [22]:
# So, the number of parameters in the model we built is the same as the number of parameters expected within the model.
# Counting the number of parameters manually and comparing it with the built model can make several bugs in the code
# visible. So, it is a good check to implement for any machine learning model.
# The assertion message reports both counts so that a mismatch can be investigated straight from the error.
assert total_params_with_grad == num_total_model_params, (
    f"Parameter count mismatch: the built model has {total_params_with_grad} trainable parameters but "
    f"{num_total_model_params} parameters were expected from the manual count."
)