In [2]:
# In this notebook, you learn
# 
# 1) How to train the MachineTranslation transformer model?

In [4]:
import copy
import datasets
import math
import spacy
import torch
from abc import ABC, abstractmethod
from dataclasses import dataclass
from torch import nn, Tensor
from typing import Optional, Tuple
from torch.utils.data import Dataset, DataLoader, Sampler
from torchtext.vocab import build_vocab_from_iterator
from typing import Generator, Callable
import random

In [5]:
# In this notebook, we are going to combine the components from all the previous notebooks to train the MachineTranslation 
# transformer model. So, it's going to be a long notebook. You can skip the parts that are copied from the previous
# notebooks and directly go to the training part. Also, in this notebook, we will use dummy data and avoid all the data
# processing to keep it relatively smaller. We will use the in the model.

In [6]:
# Id of the padding token.
PAD_TOKEN_ID = 2
# Id of the end of sentence token.
EOS_ID = 1
# Probability assigned to all other tokens (excluding correct token) while applying label smoothing.
SMOOTHING_PROB = 0.1
# Number of steps for which the learning rate increases linearly during training.
NUM_WARMUP_STEPS = 400
# Size of the word embeddings and all other vectors in the model.
D_MODEL = 512
# Number of tokens in the source (English) vocabulary.
SRC_VOCAB_SIZE = 40000
# Number of tokens in the target (Telugu) vocabulary.
TGT_VOCAB_SIZE = 40000
# Number of neurons in the hidden layer within the Feed Forward neural network of the transformer model.
D_FEED_FORWARD = 2048
# Number of heads in the attention layer of the transformer model.
NUM_HEADS = 8
# Maximum length of the sequence that is ever input to the model. Model will break if a larger sequence
# is provided as input.
MAX_SEQ_LEN = 250
# Probability with which the neurons (or data) is dropped.
DROPOUT_PROB = 0.1
# Number of EncoderLayers in the encoder and number of DecoderLayers in the decoder.
NUM_LAYERS = 6
# Learning rate at the start of the model training. This is adjusted periodically during model training.
INITIAL_LEARNING_RATE = 0.01
# Hyperparameter to calculate the m1 moment in the optimizer. This roughly corresponds to averaging over the
# last 10 (1/(1-beta_1)) sets of gradients. This comes from 'Gradient Descent with Momentum' algorithm.
BETA_1 = 0.9
# Hyperparameter to calculate the m1 moment in the optimizer. This roughly corresponds to averaging over the
# last 50 (1/(1-beta_2)) sets of gradients. This comes from 'RMS prop' algorithm.
BETA_2 = 0.98
# Small value to avoid division by zero in the optimizer.
EPSILON = 1e-8
# Number of epochs to train the model on.
NUM_EPOCHS = 1
# Directory to save the model after every check point.
MODEL_CHECK_POINTS_PATH = "../../Data/trained_model_checkpoints"
# Path to the datasets.
AI4_BHARAT_DATA_PATH = "../../Data/AI4Bharat"

In [26]:
# EVERYTHING IN THIS CELL HAS BEEN EXPLAINED IN DETAIL IN THE PREVIOUS NOTEBOOKS. PLEASE REFER TO THE EARLIER
# NOTEBOOKS TO UNDERSTAND THE CODE IN THIS CELL. YOU CAN SKIP (JUST RUN IT BLINDLY) THIS CELL AND MOVE TO THE 
# NEXT CELL DIRECTLY IN THIS NOTEBOOK. 

# Refer to 'step_8_word_embeddings.ipynb' notebook to learn more about the Embeddings class.
class Embeddings(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        """Creates the embedding layer that serves as a look-up table for the tokens in the transformer model.

        Args:
            vocab_size (int): Size of the vocabulary i.e., number of distinct tokens in the vocabulary.
            embedding_dim (int): The size of the embedding vector to be generated for each token.
        """
        super(Embeddings, self).__init__()
        self.look_up_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

    # The input is be a '2D' tensor where each '1D' tensor within the '2D' tensor is the list
    # of indices corresponding to the tokens in the vocab.
    # [[0, 123, 3455, 4556, 7, 1, 2, 2], [0, 56, 98, 6234, 909, 56, 1, 2]]
    # 0 - <SOS>, 1 - <eos>, 2 - <pad>
    def forward(self, input: Tensor) -> Tensor:
        """Converts the input tensor of token indices to their corresponding embedding vectors.

        Args:
            input (Tensor): The input tensor of token indices.
                            shape: [batch_size, seq_len]

        Returns:
            Tensor: The tensor of embedding vectors for the corresponding input tokens.
                    shape: [batch_size, seq_len, embedding_dim]
        """
        # There is no reasoning as to why the original 'attention_is_all_you_need' paper scaled the
        # embeddings using 'math.sqrt(embedding_dim)'. A few blogs attempted to explain this reasoning,
        # but I haven't found anything with solid reasoning.
        return self.look_up_table(input) * math.sqrt(self.embedding_dim)


# Refer to 'step_10_positional_encoding.ipynb' notebook to learn more about the PositionalEncoding class.
class PositionalEncoding(nn.Module):
    # d_model is the same as encoding_size.
    def __init__(self, encoding_size: int, dropout_prob: float, max_len: int = 5000):
        """Creates the positional encodings.

        Args:
            encoding_size (int): Size of the positional encoding vector that represents the position of the token.
            dropout_prob (float): Probability of an element to be zeroed or dropped.
            max_len (int, optional): Largest position for which the positional encoding vector is generated. Defaults to 5000.
                                     By default, it generates positional encodings for the first 5000 positions.
        """
        super().__init__()
        # Refer to step_8_drop_out.ipynb notebook (link to the notebook) to understand more about dropout.
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)
        # Compute the positional encodings in log space.
        positional_encoding = torch.zeros(size=(max_len, encoding_size), dtype=torch.float)
        positional_encoding_numerators = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        numerators_in_exponent = torch.arange(0, encoding_size, 2, dtype=torch.float)
        positional_encoding_denominators = torch.exp(numerators_in_exponent * (-math.log(10000.0) / encoding_size))
        positional_encoding[:, 0::2] = torch.sin(positional_encoding_numerators * positional_encoding_denominators)
        positional_encoding[:, 1::2] = torch.cos(positional_encoding_numerators * positional_encoding_denominators)
        # Refer to understanding_tensor_manipulations_part_1.ipynb notebook (link to the notebook) to
        # understand more about unsqueeze operation in pytorch.
        # In transformer model, we receive 3D tensors as input to this module. Each 1D tensor
        # in the last dimension is an embedding for the token. Each 2D tensor is a sentence.
        # The entire 3D tensor is a batch of sentences. To work with 3D tensors in the forward
        # method, we convert the positional encoding to a 3D tensor.
        positional_encoding = positional_encoding.unsqueeze(0)
        # Refer to using_modules.ipynb (link to the notebook) to understand more about buffers in pytorch.
        # This tells the module to not update the positional encoding tensor during the training. It is 
        # not a trainable parameter but it is still part of the state of the model.
        self.register_buffer('positional_encoding', positional_encoding)
    
    def forward(self, input: Tensor) -> Tensor:
        """Adds the positional encodings to the input tensor.
        Args:
            input (Tensor): The input tensor containing the embeddings of the tokens.
                            shape: [batch_size, sentence_length, d_model]

        Returns:
            Tensor: Input with the positional encodings added to it.
                    shape: [batch_size, sentence_length, d_model]
                    d_model is the same as encoding_size.
        """
        # Refer to understanding_tensor_manipulations_part_5.ipynb notebook (link to the notebook) to 
        # understand more about broadcasting in python.
        # The input tensor is a 3D tensor of shape (batch_size, sentence_length, encoding_size).
        # We add (uses broadcasting) the positional encoding to the input tensor to get the final tensor.
        # positional_encoding: (1, max_len, encoding_size) --> (1, sentence_length, encoding_size) 
        #       -- Extracts the positional encodings for the sentence_length from the positional_encoding 
        #          tensor.
        # (batch_size, sentence_length, encoding_size) --> input
        # (batch_size, sentence_length, encoding_size) --> Resultant tensor shape after broadcasting.
        # requires_grad_(False) is not needed since the positional encoding is already registered
        # as a Buffer and not a trainable parameter. It is just included for clarity.
        input = input + self.positional_encoding[:, :input.size(1)].requires_grad_(False)
        return self.dropout(input)
    

# Creates a copy (deepcopy) of the module and returns ModuleList containing the copies.
def clone_module(module: nn.Module, num_clones: int) -> nn.ModuleList:
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num_clones)])


# Refer to 'step_11_multi_headed_attention.ipynb' notebook to understand how this function works.
def construct_attention_heads(queries: Tensor, keys: Tensor, values: Tensor, mask: Optional[Tensor]=None, dropout_layer: Optional[nn.Module]=None) -> Tuple[Tensor, Tensor]:
    """Calculates the attention scores for each token in the sequence with every other token in the sequence.
       Applues the mask if provided and then normalizes the scores using softmax. It then calculates the 
       attention heads for each token in the sequence.

    Args:
        queries (Tensor): [batch_size, num_heads, seq_len, d_k]
        keys (Tensor): [batch_size, num_heads, seq_len, d_k]
        values (Tensor): [batch_size, num_heads, seq_len, d_k]
        mask (Optional[Tensor], optional): [batch_size, 1, seq_len, seq_len]. Defaults to None.
        dropout_layer (Optional[nn.Module], optional): probability with which the values are dropped on dropout layer. Defaults to None.

    Returns:
        Tuple[Tensor, Tensor]: Returns the attention heads and the attention scores.
                               attention_heads: [batch_size, num_heads, seq_len, d_k]
                               attention_scores: [batch_size, num_heads, seq_len, seq_len]
    """
    # Size of the vectors for each token for each head in the sequence.
    d_k = queries.shape[-1]
    # Calculate the attention scores for each token in the sequence with every other token in the sequence.
    attention_scores = torch.matmul(queries, keys.transpose(dim0=2, dim1=3)) / math.sqrt(d_k)
    # Mask the attention scores if a mask is provided. Mask is used in two different ways:
    # 1) To prevent the model from attending to the padding tokens --> This applies for both src and tgt sentences.
    # 2) To prevent the model from attending to the future tokens in the sequence --> This applies only for tgt sentences.
    if mask is not None:
        # Please do not set the masked values to float('-inf') as it sometimes (not in everycase) causes softmax to return nan.
        attention_scores = attention_scores.masked_fill(mask == False, float('-1e9'))
    # Normalize the attention scores using softmax.
    attention_scores = attention_scores.softmax(dim=-1)
    # Apply dropout regularization to prevent overfitting problems.
    if dropout_layer is not None:
        dropout_layer(attention_scores)
    # Calculate the attention heads for each token in the sequence. The head for each token is calculated by
    # taking the weighted average (averaged by attention scores) of the values for all the tokens in the 
    # sequence for the token of interest.
    attention_heads = torch.matmul(attention_scores, values)
    return attention_heads, attention_scores


# Refer to 'step_11_multi_headed_attention.ipynb' notebook to understand how this class works.
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads: int, d_model: int, dropout_prob: float = 0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads."
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        # We use dropout to prevent overfitting.
        self.dropout_layer = nn.Dropout(p=dropout_prob)
        # Creating the linear layers that generate queries, keys and values for each token in the sequence.
        # Also, creating an additional linear layer to generate the output of the Multi-Headed Attention from concatenated attention heads.
        self.linear_layers = clone_module(module=nn.Linear(in_features=d_model, out_features=d_model), num_clones=4)


    def forward(self, query_input: Tensor, key_input: Tensor, value_input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Multi-Headed Attention layer. 

        Args:
            query (Tensor): Input to be used for query creation.
                            query_input: [batch_size, seq_len, d_model]
            key (Tensor): Input to be used for key creation.
                          key_input  : [batch_size, seq_len, d_model]
            value (Tensor): Input to be used for value creation.
                            value_input: [batch_size, seq_len, d_model]
            mask (Tensor): Mask to be applied to the attention scores. Default is None. Same mask will 
                           be applied to all the heads in the Multi-Headed Attention layer.
                           mask: [batch_size, 1, seq_len, seq_len]

        Returns:
            Mutli-Headed Attention Output: Output of the Multi-Headed Attention layer. Generates one output vector 
                                           for each token in the sequence. Does this for each sequence in the batch.
                                           output: [batch_size, seq_len, d_model]
        """
        # Generates the queries, keys and values for each token in the sequence.
        # shape of queries, keys, values: [batch_size, seq_len, d_model]
        queries, keys, values = [linear_layer(input) for linear_layer, input in zip(self.linear_layers, (query_input, key_input, value_input))]
        batch_size = query_input.shape[0]
        seq_len = query_input.shape[1]
        # Separating the queries, keys and values for each head into a separate vector. The vectors for each token in all the heads
        # are concatenated when they are created using the linear_layers above.
        # Shape for queries, keys, values after view: [batch_size, seq_len, num_heads, d_k]
        # Shape for queries, key, values after transpose: [batch_size, num_heads, seq_len, d_k]
        queries, keys, values = [data.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(dim0=1, dim1=2) for data in (queries, keys, values)]
        # Calculate the attention heads for each token in the sequence.
        # attention_heads: [batch_size, num_heads, seq_len, d_k]
        attention_heads, attention_scores = construct_attention_heads(queries=queries, keys=keys, values=values, mask=mask, dropout_layer=self.dropout_layer)
        # Concatenate the attention heads for each token from all the heads.
        # attention_heads: [batch_size, seq_len, d_model]
        attention_heads = attention_heads.transpose(dim0=1, dim1=2).reshape(batch_size, seq_len, self.d_model)
        # Generate the output of the Multi-Headed Attention layer.
        return self.linear_layers[-1](attention_heads)
    

# Refer to 'step_12_feed_forward_neural_network.ipynb' notebook to understand how this class works.
class FeedForwardNN(nn.Module):
    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float = 0.1):
        super().__init__()
        self.linear_layer_1 = nn.Linear(in_features=d_model, out_features=d_feed_forward)
        self.linear_layer_2 = nn.Linear(in_features=d_feed_forward, out_features=d_model)
        self.dropout_layer = nn.Dropout(p=dropout_prob)

    def forward(self, input: Tensor) -> Tensor:
        """Passes the input through the Feed Forward Neural Network and returns the output 
           of the neural network.

        Args:
            input (Tensor): The output of the Multi-Headed Attention layer.
                            shape: [batch_size, seq_len, d_model]

        Returns:
            Tensor: The output of the Feed Forward Neural Network.
                    shape: [batch_size, seq_len, d_model]
        """
        # We first expand the input to higher dimension. We apply the ReLU activation function in this layer.
        intermediate_output = self.linear_layer_1(input).relu()
        # Dropout layer to prevent overfitting
        intermediate_output = self.dropout_layer(intermediate_output)
        # We then compress the input back to its original dimension. There is no specific intuitive explanation 
        # as to why this is done. It is just shown to be working practically in neural networks in general and 
        # in this paper in particular.
        return self.linear_layer_2(intermediate_output)
    

# Refer to 'step_14_encoder.ipynb' to understand how this class works.
class SubLayerWrapper(nn.Module):
    def __init__(self, d_model: int, dropout_prob: float):
        """This class is a wrapper around the MultiHeadedAttention and PositionwiseFeedForward classes.

        Args:
            d_model (int): Dimension of the vectors used in the Attention model.
            dropout_prob (float): probability with which nodes can be dropped.
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input: Tensor, sublayer: nn.Module) -> Tensor:
        """It applies the operation on the input, applies dropout, adds the input back to the transformed 
           input, does normalization and returns the output.

        Args:
            input (Tensor): Input to be transformer by the sublayer.
                            input: [batch_size, seq_len, d_model]
            sublayer (nn.Module): sublayer could be either MultiHeadedAttention or PositionwiseFeedForward.
            
        Returns:
            Tensor: Output of the sublayer transformation.
                    output: [batch_size, seq_len, d_model]
        """
        return self.layer_norm(input + self.dropout(sublayer(input)))


# Refer to 'step_14_encoder.ipynb' to understand how this class works.
class EncoderLayer(nn.Module):
    def __init__(self, self_attention: MultiHeadedAttention, feed_forward: FeedForwardNN, d_model: int, dropout_prob: float):
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # These modules are now the child modules of the EncoderLayer and will be registered as parameters of the EncoderLayer.
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        self.sublayer_wrappers = clone_module(module=SubLayerWrapper(d_model=self.d_model, dropout_prob=self.dropout_prob), num_clones=2)

    def forward(self, input: Tensor, mask: Tensor) -> Tensor:
        """This method is the forward pass of the EncoderLayer class.

        Args:
            input (Tensor): Source sentence provided as input to the EncoderLayer. These are the embeddings of the source 
                            sentence for the first EncoderLayer.
                            SHAPE: [batch_size, seq_len, d_model]
            mask (Tensor): Boolean mask to be applied to the input during attention scores calculation.
                           SHAPE: [batch_size, 1, seq_len, seq_len]
        Returns:
            Tensor: Output of the EncoderLayer.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # We are just saving the function call to the self_attention method in a variable and passing the
        # lambda function (contained within the variable) to the sublayer_wrappers[0] to execute it when 
        # needed.
        output = self.sublayer_wrappers[0](input, lambda input: self.self_attention(query_input=input, key_input=input, value_input=input, mask=mask))
        return self.sublayer_wrappers[1](output, self.feed_forward)


# Refer to 'step_14_encoder.ipynb' to understand how this class works.
class Encoder(nn.Module):
    def __init__(self, encoder_layer: EncoderLayer, num_layers: int):
        super().__init__()
        self.encoder_layers = clone_module(module=encoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(encoder_layer.d_model)

    def forward(self, input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """This method is the forward pass of the Encoder class. The output of the current EncoderLayer is
           passed as input to the next EncoderLayer. We have 6 identical EncoderLayers stacked on top of 
           each other. The output of the last EncoderLayer is passed through a Layer Normalization layer
           and returned as the final output of the Encoder

        Args:
            input (Tensor): Input to the Encoder i.e., embeddings of the tokenized src sequences.
                            input: [batch_size, seq_len, d_model]
            mask (Optional[Tensor], optional): Boolean mask to be applied during attention scores calculation.
                                               mask: [batch_size, 1, seq_len, seq_len]. Defaults to None.
                            
        Returns:
            Tensor: Output of the Encoder i.e., encoded src sentences.
                    output: [batch_size, seq_len, d_model]
        """
        output = input
        for encoder_layer in self.encoder_layers:
            # Pass the output of the previous EncoderLayer to the current EncoderLayer.
            output = encoder_layer(input=output, mask=mask)
        return self.layer_norm(output)


# Refer to 'step_15_decoder.ipynb' to understand how this class works.
class DecoderLayer(nn.Module):
    def __init__(self, self_attention: MultiHeadedAttention, src_attention: MultiHeadedAttention, feed_forward: FeedForwardNN, d_model: int, dropout_prob: float):
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # These modules are now the child modules of the DecoderLayer and will be registered as parameters of the DecoderLayer.
        self.self_attention = self_attention
        self.src_attention = src_attention
        self.feed_forward = feed_forward
        self.sublayer_wrappers = clone_module(module=SubLayerWrapper(d_model=d_model, dropout_prob=dropout_prob), num_clones=3)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """This method is the forward pass of the DecoderLayer class.

        Args:
            input (Tensor): Target sentence provided as input to the DecoderLayer. These are the embeddings of the target 
                            sentence for the first DecoderLayer.
                            SHAPE: [batch_size, seq_len, d_model]
            encoded_src (Tensor): Encoded source sentence. This is the output of the Encoder. This is used to calculate the
                                  source attention scores for the target sentence. 
                                  SHAPE: [batch_size, seq_len, d_model] 
            tgt_mask (Tensor): Mask to prevent the future tokens in the target sentence to attend to the previous tokens and
                               also to prevent padding tokens from attending to any other token except other padding tokens.
                               SHAPE: [batch_size, 1, seq_len, seq_len]
            src_mask (Optional[Tensor], optional): Mask to prevent the the padding tokens to attend to the tokens in the tgt sentence. 
                                                   Defaults to None.
                                                   SHAPE: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Returns the output of the DecoderLayer. This is the output of the Positionwise FeedForward Neural Network.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # First sublayer: Self-Attention on the target sentence. Hence, it uses the tgt_mask.
        self_attention_output = self.sublayer_wrappers[0](input=input, sublayer=lambda input: self.self_attention(query_input=input, key_input=input, value_input=input, mask=tgt_mask)) 
        # To give intuition about src_attention, I have a query for a token in the target sentence. I want to know whether 
        # some token in the source sentence is important for me to predict the output for the token in the target sentence. 
        # So, I go to the source sentence and get the values for all the tokens in the source sentence. I then calculate 
        # the attention scores between the query (in tgt) and the keys (in src). I then calculate the attention heads for 
        # the token in the target sentence using the attention scores. This is what is done in the below line. Note that 
        # referring to statement 'the keys and values are from the source' doesn't mean that you get keys and values 
        # explicitly. It means we use the encoded data from the source sentence to calculate the queries and keys for 
        # this transformation.
        # Second sublayer: Attention on the source sentence. Hence, it uses the src_mask.
        src_attention_output = self.sublayer_wrappers[1](input=self_attention_output, sublayer=lambda self_attention_output: self.src_attention(query_input=self_attention_output, key_input=encoded_src, value_input=encoded_src, mask=src_mask))
        # Third sublayer: Positionwise FeedForward Neural Network
        return self.sublayer_wrappers[2](input=src_attention_output, sublayer=self.feed_forward)


# Refer to 'step_15_decoder.ipynb' to understand how this class works.
class Decoder(nn.Module):
    def __init__(self, decoder_layer: DecoderLayer, num_layers: int):
        super().__init__()
        self.decoder_layers = clone_module(module=decoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(decoder_layer.d_model)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """This method is the forward pass of the Decoder class. The output of the current DecoderLayer is
           passed as input to the next DecoderLayer. We have 6 identical DecoderLayers stacked on top of 
           each other. The output of the Encoder (last EncoderLayer) is also passed as input to the 
           first DecoderLayer. The output of the last DecoderLayer is passed through a Layer Normalization 
           layer and returned as the final output of the Decoder.

        Args:
            input (Tensor): Input to the Decoder i.e., embeddings of the tokenized tgt sequences.
                            input: [batch_size, seq_len, d_model]
            encoded_src (Tensor): output of the encoder i.e., encoded src sequences.
            tgt_mask (Tensor): Boolean mask to be applied during self attention scores calculation.
                               tgt_mask: [batch_size, 1, seq_len, seq_len].
            src_mask (Optional[Tensor], optional): Boolean mask to be applied during src attention scores calculation.
                                                   tgt_mask: [batch_size, 1, seq_len, seq_len]. Defaults to None.

        Returns:
            Tensor: Output of the Decoder.
                    output: [batch_size, seq_len, d_model]
        """
        output = input
        for decoder_layer in self.decoder_layers:
            # Pass the output of the previous DecoderLayer to the current DecoderLayer.
            output = decoder_layer(input=output, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)
        return self.layer_norm(output)


# Refer to 'step_14_token_predictor.ipynb' to understand how this class works.
class TokenPredictor(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super(TokenPredictor, self).__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)
        # The non-module variables are not added to the list of parameters of the model.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, decoder_output: Tensor) -> Tensor:
        """The forward pass of the token predictor. Calculates the probability distribution over the 
           vocabulary. Each token vector has a corresponding probability distribution over the 
           vocabulary since we predict one token per output.

        Args:
            decoder_output (Tensor): Output of the Decoder.
                                     shape: [batch_size, seq_len, d_model]

        Returns:
            Tensor: Log probability distribution over the vocabulary. 
                    shape: [batch_size, seq_len, vocab_size]
        """
        # Project the decoder output to the vocab_size dimensional space.
        logits = self.linear(decoder_output)
        # Convert the logits to a probability distribution over the vocabulary. All the entires in the
        # output tensor are negative since we are using log softmax. The log softmax is used to make
        # the training more numerically stable. However, the maximum value is still the same as the 
        # maximum value of the original softmax output.
        return self.log_softmax(logits)
    

# Refer to 'step_17_translation_transformer.ipynb' notebook to understand how this class works.
class MachineTranslationTransformer(nn.Module):
    """Model that combines the Encoder, Decoder and the TokenPredictor to create a machine translation Transformer model."""

    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float, num_heads: int, src_vocab_size: int, tgt_vocab_size: int, num_layers: int, max_seq_len: int):
        """Initializes the Transformer model.

        Args:
            d_model (int): size of the embedding vectors in the model.
            d_feed_forward (int): Number of neurons in the hidden layer of the feed forward neural network.
            dropout_prob (float): probability with which to drop data in the transformer model.
            num_heads (int): number of attention heads in each of the multi-head attention layers in the model.
            src_vocab_size (int): size of the source vocabulary.
            tgt_vocab_size (int): size of the target vocabulary.
            num_layers (int): number of layers in the Encoder and Decoder.
            max_seq_len (int): Maximum length of the sequence that is ever input to the model.
        """
        super(MachineTranslationTransformer, self).__init__()
        self.src_embedding = Embeddings(vocab_size=src_vocab_size, embedding_dim=d_model)
        self.tgt_embedding = Embeddings(vocab_size=tgt_vocab_size, embedding_dim=d_model)
        self.positional_encoding = PositionalEncoding(encoding_size=d_model, dropout_prob=dropout_prob, max_len=max_seq_len)
        multi_headed_attention = MultiHeadedAttention(num_heads=num_heads, d_model=d_model, dropout_prob=dropout_prob)
        feed_forward_nn = FeedForwardNN(d_model=d_model, d_feed_forward=d_feed_forward, dropout_prob=dropout_prob)
        encoder_layer = EncoderLayer(self_attention=copy.deepcopy(multi_headed_attention), 
                                     feed_forward=copy.deepcopy(feed_forward_nn), 
                                     d_model=d_model, dropout_prob=dropout_prob)
        decoder_layer = DecoderLayer(self_attention=copy.deepcopy(multi_headed_attention), 
                                     src_attention=copy.deepcopy(multi_headed_attention),
                                     feed_forward=copy.deepcopy(feed_forward_nn), 
                                     d_model=d_model, dropout_prob=dropout_prob)
        self.encoder = Encoder(encoder_layer=encoder_layer, num_layers=num_layers)
        self.decoder = Decoder(decoder_layer=decoder_layer, num_layers=num_layers)
        self.output_generator = TokenPredictor(d_model=d_model, vocab_size=tgt_vocab_size)
        self.initialize_model_parameters()

    def initialize_model_parameters(self):
        """Initializes the parameters of the model using the Xavier Uniform initialization."""
        for params in self.parameters():
            # This is to ensure the only the weights are initialized and not the biases. biases usually have only
            # one dimension and the weights have more than one dimension.
            if params.dim() > 1:
                nn.init.xavier_uniform_(params)

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor, tgt_mask: Tensor) -> Tensor:
        """The forward pass of the Transformer model. The source sentences are passed through the Encoder and the target
           sentences are passed through the Decoder. The output of the Decoder is passed through the token predictor to
           get the probability distribution over the target vocabulary.

        Args:
            src (Tensor): Source sentences (English) containing the token ids corresponding to the indices in the src vocabulary. 
                          Example input looks like [[0, 4, 55, 67, 1, 2, 2], [0, 42, 585, 967, 19, 26, 1]]
                          SHAPE: [batch_size, seq_len]
            tgt (Tensor): Target sentences (Telugu) containing the token ids corresponding to the indices in the src vocabulary. 
                          Example input looks like [[0, 3, 5, 677, 81, 1, 2], [0, 7, 67, 190, 3245, 1]]
                          SHAPE: [batch_size, seq_len]
            src_mask (Tensor): Mask to be applied to the source sentences in each of the attention heads.
                               src_mask: [batch_size, 1, seq_len, seq_len]
            tgt_mask (Tensor): Mask to be applied to the target sentences in each of the attention heads.
                               tgt_mask: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Log probability distribution over the tokens in the target vocabulary (Telugu vocabulary).
                    output: [batch_size, seq_len, tgt_vocab_size]
        """
        # Pass the source sentences through the encoder to get the encoded source token vectors.
        encoded_src = self.encode(src=src, src_mask=src_mask)
        # Pass the target sentence through the decoder to get the encoded target token vectors.
        decoded_tgt = self.decode(tgt=tgt, tgt_mask=tgt_mask, encoded_src=encoded_src, src_mask=src_mask)
        return self.generate_tgt_token_prob_distributions(decoded_tgt=decoded_tgt)

    def encode(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """Encodes the source sentences (English).

        Args:
            src (Tensor): A batch of source sentences containing the token ids corresponding to the indices in the src vocabulary.
                          SHAPE: [batch_size, seq_len]
            src_mask (Tensor): Mask to be applied to the source sentences in each of the attention heads. Same mask will be 
                               applied to the sentence in all the attention heads.
                               SHAPE: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Encoded source sentences. Each token in the source sentence is represented by a vector that encodes
                    all the information about the token and its relationship with other tokens in the sentence.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # Get the embeddings for the source sentences.
        src_embeddings = self.src_embedding(src)
        # Add the positional encodings to the embeddings.
        src_embeddings = self.positional_encoding(src_embeddings)
        # Pass the source sentence through the encoder.
        encoded_src = self.encoder(input=src_embeddings, mask=src_mask)
        return encoded_src

    def decode(self, tgt: Tensor, tgt_mask: Tensor, encoded_src: Tensor, src_mask: Tensor) -> Tensor:
        """Encodes the target sentences (Telugu).

        Args:
            tgt (Tensor): A batch of target sentences containing the token ids corresponding to the indices in the tgt vocabulary.
                          SHAPE: [batch_size, seq_len]
            tgt_mask (Tensor): Mask to be applied to the target sentences in each of the attention heads. Same mask will be 
                               applied to the sentence in all the attention heads.
                               SHAPE: [batch_size, 1, seq_len, seq_len]
            encoded_src (Tensor): The encoded token representations of the source sentences. This is used to calculate the
                                  source attention scores for the target sentence.
                                  SHAPE: [batch_size, seq_len, d_model]
            src_mask (Tensor): Mask to be applied to the source sentences in each of the attention heads. Same mask will be 
                               applied to the sentence in all the attention heads.
                               SHAPE: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Encoded (or Decoded if that makes more sense) target sentences. Each token in the target sentence is 
                    represented by a vector that encodes all the information about the token and its relationship with
                    other tokens in the target sentence and the corresponding source sentence.
        """
        # Get the embeddings for the target sentences.
        tgt_embeddings = self.tgt_embedding(tgt)
        # Add the positional encodings to the embeddings.
        tgt_embeddings = self.positional_encoding(tgt_embeddings)
        # Pass the target sentence through the decoder.
        decoded_tgt = self.decoder(input=tgt_embeddings, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)
        return decoded_tgt

    def generate_tgt_token_prob_distributions(self, decoded_tgt: Tensor) -> Tensor:
        # Convert the output of the decoder to the probability distribution over the target vocabulary. This will be
        # used to calculate the loss in the training phase.
        return self.output_generator(decoded_tgt)


# Refer to 'step_16_label_smoothing.ipynb' notebook to understand how this class works.
class LabelSmoothing(nn.Module):
    def __init__(self, vocab_size: int, padding_idx: int, smoothing: Optional[int]=0.1):
        super(LabelSmoothing, self).__init__()
        # Number of classes in the classification problem. It is the size of the vocabulary in transformers.
        self.vocab_size = vocab_size
        # Index of the padding token or the class label for the padding token.
        self.padding_idx = padding_idx
        # Amount of probability to be shared among the tokens excluding correct token and padding tokens.
        self.smoothing = smoothing
        # Amount of probability shared with the correct token.
        self.confidence = 1 - smoothing
    
    def forward(self, targets: Tensor) -> Tensor:
        """Calculates the smoothed probabilities for each of the target tokens within each sentence.

        Args:
            targets (Tensor): The target tensor containing the correct class labels (expected token indices from the 
                              vocab) for each token in the batch. An example target tensor for a batch of 2 sentences
                              each with 8 tokens and 6 possible classes for prediction (including the padding token)
                              would be: [[0, 3, 4, 5, 5, 1, 2, 2], [1, 5, 3, 3, 4, 0, 0, 2]]
                              shape: [batch_size, seq_len]

        Returns:
            Tensor: A smoothed probability distribution (1D tensor) for each target token in the batch.
                    shape: [batch_size, seq_len, vocab_size]                    
        """
        batch_size, seq_len = targets.shape
        # Creating a tensor that will hold the smoothed probabilities for each target token in all the sentences.
        smoothed_probs = torch.zeros(size=(batch_size, seq_len, self.vocab_size), dtype=torch.float32)
        # Filling the entire tensor with the smoothing probability. We will deal with the probabilities of the
        # correct token and padding token later.
        smoothed_probs = smoothed_probs.fill_(value=self.smoothing / (self.vocab_size - 2))
        # Bringing the targets tensor to contain the same number of dimensions as the smoothed_probs tensor to use
        # it with the scatter_ function inorder replace the probabilities in the smoothed_probs tensor in the next 
        # step.
        unsqueezed_targets = targets.unsqueeze(dim=-1)
        # Replacing the probabilities in the smoothed_probs tensor with the confidence probability at the 
        # positions that correspond to the correct class labels (expected output tokens in the target).
        smoothed_probs.scatter_(dim=-1, index=unsqueezed_targets, value=self.confidence)
        # The padding token should not be predicted at all by the model. So, the probability associated with the
        # class label that correspond to the padding token within each target token distribution should be 0. 
        smoothed_probs[:, :, self.padding_idx] = 0
        # The target tensor is appended with the padding tokens at the end. These are just dummy tokens added to bring 
        # all the sentences in the batch to the same length. We don't want the model to consider these tokens at all 
        # in the loss calculation. So, we set the probabilities of the entire rows corresponding to the padding tokens
        # to 0.
        mask = unsqueezed_targets.repeat(1, 1, self.vocab_size) == self.padding_idx
        return smoothed_probs.masked_fill(mask=mask, value=0.0)


# Refer to 'step_19_loss_computation.ipynb' notebook to understand how this class works.
class LossCompute:
    def __init__(self):
        # We use the 'sum' reduction to sum the KL Divergence over all the tokens in all the sentences in the batch. 
        # The loss is then averaged over all the tokens in the batch to find the loss per token which is used as the 
        # objective function.         
        self.kl_div_loss = nn.KLDivLoss(reduction="sum")

    # The '__call__' method allows an object of the class to be called just like a function.
    def __call__(self, log_predictions: Tensor, targets: Tensor, num_non_pad_tokens: int) -> Tensor:
        """Computes the KL Divergence loss for the model predictions and the target tensor.

        Args:
            log_predictions (Tensor): The log of the model predictions for the target tokens in the batch.
                                      Each token has a probability distribution over the vocabulary.
                                      shape: [batch_size, seq_len, vocab_size]
            targets (Tensor): The expected target for the model predictions. The target tensor is a smoothed
                              probability distribution over the vocabulary for each token in the batch. 
                              shape: [batch_size, seq_len, vocab_size]
            num_non_pad_tokens (int): The number of non-pad tokens in the target of the batch.

        Returns:
            Tensor: The KL Divergence per token in the batch which is used as the objective function for model
                    training.
        """
        # Calculates the KL Divergence loss between the model predictions and the targets.
        kl_div_loss = self.kl_div_loss(input=log_predictions, target=targets)
        # Calculate the KL Divergence loss per token in the batch.
        return kl_div_loss / num_non_pad_tokens


# Refer to 'step_20_learning_rates.ipynb' notebook to understand how this function works.
def rate(step: int, d_model: int, warmup_steps: int, factor: Optional[float] = 1.0) -> float:
    """This functions implements the above mentioned learning rate schedule. The learning rate is increased linearly
       for the first warmup_steps, and then decreased exponentially for the rest of the training steps. step 
       corresponds to 'epoch' number in the adam_optimizer functionality in pytorch.

    Args:
        step (int): current epoch number in the training loop. starts from 0.
        d_model (int): size of the vectors in the model. This is 512 in the original transformer model.
        warmup_steps (int): number of steps to increase the learning rate linearly.
        factor (Optional[float], optional): factor to scale the learning rate. Defaults to 1.0.

    Returns:
        float: returns the learning rate by which the initial learning rate should be scaled.
    """
    if step == 0:
        step = 1
    return factor * (d_model ** (-0.5) * min(step ** (-0.5), step * warmup_steps ** (-1.5)))


# Refer to 'step_7_data_batching_and_masking.ipynb' notebook to understand how this function works.
def construct_look_ahead_mask(size: int) -> Tensor:
    """Create a mask to prevent the tokens appearing after the current token 
       to attend to the current token or any token before it.

    Args:
        size (int): Size of the mask to be created i.e., the length of the sentence.

    Returns:
        Tensor: A boolean tensor of shape (size, size).
    """
    attention_mask = torch.triu(torch.ones(size, size, dtype=torch.uint8), diagonal=1)
    return attention_mask == 0


# Refer to 'step_7_data_batching_and_masking.ipynb' notebook to understand how this function works. 
def construct_padding_mask(input: Tensor, pad_token_id: int) -> Tensor:
    """Create a mask to prevent the padding tokens from attending to the tokens.

    Args:
        input (Tensor): A batch of sentences of shape (batch_size, seq_len).
        pad_token_id (int): Id of the padding token.

    Returns:
        Tensor: A boolean tensor of shape (batch_size, seq_len, seq_len).
    """
    mask = (input != pad_token_id)
    mask = mask.unsqueeze(1).repeat(1, input.size(1), 1)
    return mask


# Refer to 'step_7_data_batching_and_masking.ipynb' notebook to understand how this class works.
class Batch:
    """Object for holding a batch of data and the corresponding mask to be used for training."""

    def __init__(self, src_batch: Tensor, tgt_batch: Tensor, pad_token_id: int):
        """Initialize the Batch object. Updates the tgt_batch to the format expected by the decoder
           during training. Also, creates the mask for the source and target sentences.

        Args:
            src_batch (Tensor): Tensor containing the source sentences in the batch. 
                                Has the shape [batch_size, seq_len].
            tgt_batch (Tensor): Tensor containing the target sentences in the batch.
                                Has the shape [batch_size, seq_len].
            pad_token_id (int): Id of the pad token appended to the sentences in the batch.
        """
        self.src = src_batch
        # The source sentences only need the padding mask since the Encoder does not have to predict
        # the next token in the sentence but just encode the input to be used by the Decoder.
        # Shape of src_mask: [batch_size, 1, seq_len, seq_len]
        self.src_mask = construct_padding_mask(input=src_batch, pad_token_id=pad_token_id).unsqueeze(1)
        # Removes the last token (<eos> or <pad>) from the target sentences to create the target_decoder_input.
        # Shape of tgt_decoder_input: [batch_size, seq_len - 1]
        self.tgt_decoder_input = tgt_batch[:, :-1]
        # Removes the first token (<sos>) from the target sentences to create the target_expected_decoder_output.
        # Shape of tgt_expected_decoder_output: [batch_size, seq_len - 1]
        self.tgt_expected_decoder_output = tgt_batch[:, 1:]
        # Shape of tgt_mask: [batch_size, 1, seq_len, seq_len]
        self.tgt_mask = self.construct_target_mask(tgt=self.tgt_decoder_input, pad_token_id=pad_token_id)
        # Number of tokens in the target sentences excluding the padding tokens. This is used during model 
        # training for the loss calculation inorder to normalize the total loss and find the loss per token.
        self.non_pad_tokens = (self.tgt_expected_decoder_output != pad_token_id).sum()

    def construct_target_mask(self, tgt: Tensor, pad_token_id: int) -> Tensor:
        # The target sentences need both the padding mask and the look ahead mask. The padding mask is used
        # to prevent the padding tokens from attending to the other tokens in the target sentences. The look
        # ahead mask is used to prevent the future tokens from attending to the current token or any token.
        tgt_mask = construct_padding_mask(input=tgt, pad_token_id=pad_token_id)
        tgt_mask = tgt_mask & construct_look_ahead_mask(tgt.size(-1)).type_as(tgt_mask.data)
        return tgt_mask.unsqueeze(1)


# Refer to 'step_18_label_smoothing.ipynb' notebook to understand how this class works.
class LabelSmoothing(nn.Module):
    def __init__(self, vocab_size: int, padding_idx: int, smoothing: Optional[float]=SMOOTHING_PROB):
        super(LabelSmoothing, self).__init__()
        # Number of classes in the classification problem. It is the size of the vocabulary in transformers.
        self.vocab_size = vocab_size
        # Index of the padding token or the class label for the padding token.
        self.padding_idx = padding_idx
        # Amount of probability to be shared among the tokens excluding correct token and padding tokens.
        self.smoothing = smoothing
        # Amount of probability shared with the correct token.
        self.confidence = 1 - smoothing
    
    def forward(self, targets: Tensor) -> Tensor:
        """Calculates the smoothed probabilities for each of the target tokens within each sentence.

        Args:
            targets (Tensor): The target tensor containing the correct class labels (expected token indices from the 
                              vocab) for each token in the batch. An example target tensor for a batch of 2 sentences
                              each with 8 tokens and 6 possible classes for prediction (including the padding token)
                              would be: [[0, 3, 4, 5, 5, 1, 2, 2], [1, 5, 3, 3, 4, 0, 0, 2]]
                              shape: [batch_size, seq_len]

        Returns:
            Tensor: A smoothed probability distribution (1D tensor) for each target token in the batch.
                    shape: [batch_size, seq_len, vocab_size]                    
        """
        batch_size, seq_len = targets.shape
        # Creating a tensor that will hold the smoothed probabilities for each target token in all the sentences.
        smoothed_probs = torch.zeros(size=(batch_size, seq_len, self.vocab_size), dtype=torch.float32)
        # Filling the entire tensor with the smoothing probability. We will deal with the probabilities of the
        # correct token and padding token later.
        smoothed_probs = smoothed_probs.fill_(value=self.smoothing / (self.vocab_size - 2))
        # Bringing the targets tensor to contain the same number of dimensions as the smoothed_probs tensor to use
        # it with the scatter_ function inorder replace the probabilities in the smoothed_probs tensor in the next 
        # step.
        unsqueezed_targets = targets.unsqueeze(dim=-1)
        # Replacing the probabilities in the smoothed_probs tensor with the confidence probability at the 
        # positions that correspond to the correct class labels (expected output tokens in the target).
        smoothed_probs.scatter_(dim=-1, index=unsqueezed_targets, value=self.confidence)
        # The padding token should not be predicted at all by the model. So, the probability associated with the
        # class label that correspond to the padding token within each target token distribution should be 0. 
        smoothed_probs[:, :, self.padding_idx] = 0
        # The target tensor is appended with the padding tokens at the end. These are just dummy tokens added to bring 
        # all the sentences in the batch to the same length. We don't want the model to consider these tokens at all 
        # in the loss calculation. So, we set the probabilities of the entire rows corresponding to the padding tokens
        # to 0.
        mask = unsqueezed_targets.repeat(1, 1, self.vocab_size) == self.padding_idx
        return smoothed_probs.masked_fill(mask=mask, value=0.0)


# Refer to 'step_6_dataloader_with_transformers.ipynb' to understand how this class works.
def text_extractor(data_point: dict[str, str], language: str) -> str:
    """Extracts the appropriate text from the datapoint based on the language.

    Args:
        data_point (dict[str, str]): Datapoint containing the text in the form of a dictionary.
                                     The sources sentence is stored in the key 'src' and the target sentence 
                                     is stored in the key 'tgt'.
        language (str): Language of the text to be extracted from the data_point.

    Raises:
        ValueError: Raises an error if the language is not 'english' or 'telugu'.

    Returns:
        str: The text in the data_point.
    """
    if language == "english":
        return data_point["src"]
    elif language == "telugu":
        return data_point["tgt"]
    raise ValueError("Language should be either 'english' or 'telugu'.")


# Refer to 'step_4_dataloader_with_transformers.ipynb' to understand how this class works.
class BaseTokenizer(ABC):
    """A class created to hold different kinds of tokenizers and handle the token encoding in a common way.
       Here, we only use SpacyTokenizer and HuggingFaceTokenizer."""
    def __init__(self, language: str, tokenizer_type: str):
        self.language = language
        self.tokenizer_type = tokenizer_type

    # Abstract methods need to be overridden by the child class. It raises TypeError if not overridden.
    @abstractmethod
    def initialize_tokenizer_and_build_vocab(self, data_iterator: datasets.arrow_dataset.Dataset, field_extractor: Callable[[dict[str, str], str], str], max_vocab_size: int = 40000):
        """Initializes the tokenizers and builds the vocabulary for the given dataset.

        Args:
            data_iterator (datasets.arrow_dataset.Dataset): An iterator that gives input sentences (text) when iterated upon.
            field_extractor (Callable[[dict[str, str], str], str]): A function that extracts the appropriate text from the input 
                dataset. This parameter is added to make the tokenizer independent of the input dataset. If not provided as an 
                argument, we will have to extract the text from the dataset within the 'BaseTokenizer' class which makes it 
                dependent on the dataset format. 
            max_vocab_size (int, optional): Maximum size of the vocabulary to create from the input data corpus. Defaults to 40000.
        """
        raise NotImplementedError

    # Abstract methods need to be overridden by the child class. It raises TypeError if not overridden.
    @abstractmethod
    def tokenize(self, text: str) -> list[str]:
        """Returns the individual tokens (language text) for the given text"""
        raise NotImplementedError

    # Abstract methods need to be overridden by the child class. It raises TypeError if not overridden.
    @abstractmethod
    def encode(self, text: str) -> list[str]:
        """Returns the encoded token ids for the given text"""
        raise NotImplementedError
    
    @abstractmethod
    def decode(self, token_ids: list[int]) -> str:
        """Returns the decoded text for the given token ids"""
        raise NotImplementedError

    # Abstract methods need to be overridden by the child class. It raises TypeError if not overridden.
    @abstractmethod
    def get_token_id(self, token: str) -> int:
        """Returns the token id for the given token string"""
        raise NotImplementedError

    @abstractmethod
    def get_vocab_size(self) -> int:
        """Returns the size of the vocabulary i.e., number of tokens in the vocabulary"""
        raise NotImplementedError


# Refer 'step_2_alternate_tokenization_with_spacy.ipynb' notebook to understand this class better.
class SpacyTokenizer(BaseTokenizer):
    """Creats a tokenizer that tokenizes the text using the Spacy tokenizer models."""
    def __init__(self, language: str):
        super().__init__(language, "spacy")
        self.special_tokens = ["<sos>", "<eos>", "<pad>", "<unk>"]
    
    def initialize_tokenizer_and_build_vocab(self, data_iterator: datasets.arrow_dataset.Dataset, text_extractor: Callable[[dict[str, str], str], str], max_vocab_size: int = 40000):
        # Load spacy models for English text tokenization.
        if self.language == "english":
            self.tokenizer = spacy.load("en_core_web_sm").tokenizer          
        elif self.language == "telugu":
            # Load spacy model for Telugu text tokenization.
            self.tokenizer = spacy.blank("te").tokenizer            
        else:
            # Raise an error for unknown language
            pass
        self.max_vocab_size = max_vocab_size
        self.__build_vocab(data_iterator=data_iterator, text_extractor=text_extractor)

    def tokenize(self, text: str) -> list[str]:
        return [token.text for token in self.tokenizer(text)]

    def encode(self, text: str) -> list[int]:
        return self.vocab(self.tokenize(text))
    
    def decode(self, token_ids: list[int]) -> str:
        """Returns the decoded text for the given token ids"""
        return " ".join(self.vocab.lookup_tokens(indices=token_ids))

    def get_token_id(self, token: str) -> int:
        return self.vocab([token])[0]
    
    def get_vocab_size(self) -> int:
        return len(self.vocab)

    def __yield_tokens(self, data_iterator: datasets.arrow_dataset.Dataset, text_extractor: Callable[[dict[str, str], str], str]):
        """Returns a generator object that emits tokens for each sentence in the dataset"""
        for data_point in data_iterator:
            yield self.tokenize(text_extractor(data_point, self.language))

    def __build_vocab(self, data_iterator: datasets.arrow_dataset.Dataset, text_extractor: Callable[[dict[str, str], str], str]):
        """Builds the vocabulary for the given dataset"""
        self.vocab = build_vocab_from_iterator(iterator=self.__yield_tokens(data_iterator=data_iterator, text_extractor=text_extractor), min_freq=2, specials=self.special_tokens, special_first=True, max_tokens=self.max_vocab_size)
        self.vocab.set_default_index(self.vocab["<unknown>"])
    

# Refer to 'step_4_datasets_and_dataloaders_pytorch' to understand how this class works.
class HuggingFaceDatasetWrapper(torch.utils.data.Dataset):
    def __init__(self, hf_dataset: datasets.arrow_dataset.Dataset):
        """Initializes the HuggingFaceDatasetWrapper with the given dataset.

        Args:
            hf_dataset (datasets.arrow_dataset.Dataset): The hugging face dataset to be wrapped.
        """
        self.dataset = hf_dataset
    
    def __len__(self):
        """Extracts the length of the dataset.

        Returns:
            int: Length of the dataset.
        """
        return len(self.dataset)
    
    def __getitem__(self, index):
        """Extracts the data_point at a particular index in the dataset.

        Args:
            index (int): Index of the data_point to be extracted from the dataset.

        Returns:
            dict: Data_point at the given index in the dataset. This turns out to be a dictionary for our dataset but
                  it could be any type in general.
        """
        # Return the dataset at a particular index.
        # The index provided will always be less then length (64 in this case) returned by __len__ function.
        return self.dataset[index]


# Refer to 'step_6_dataloader_with_transformers.ipynb' notebook to understand how this class works.
class LengthAwareSampler(Sampler):
    def __init__(self, dataset: Dataset, batch_size: int):
        # dataset is the Dataset wrapper we created on top of HuggingFace dataset.
        self.dataset = dataset
        self.batch_size = batch_size
        self.sorted_indices = self.extract_lengths()
    
    # We don't want the entire dataset to be loaded into the memory at once. So, we first iterate over the entire 
    # dataset, extract the lengths of the sentences and sort the indices of the sentences according to the sentence 
    # lengths. This is to ensure that sentences of similar lengths are grouped together in a batch to minimize the 
    # overall padding necessary. When we iterate over the dataset, we only load the necessary data from the dataset 
    # into memory and not the entire dataset at once --> This loading logic could be a little different based on the 
    # hugging face implementations. Please look into the hugging face documentation for more details.
    def extract_lengths(self) -> list[int]:
        """Sorts the indices of the dataset based on the lengths of the sentences in the dataset.

        Returns:
            list[int]: Indices of the dataset sorted in ascending order (small to big) based on the lengths of 
                       the sentences in the dataset.
        """
        # Note that the lengths are calculated based on the number of characters in the sentence and not the number
        # of tokens.
        self.lengths = [len(data_point["src"].split(" ")) + len(data_point["tgt"].split(" ")) for data_point in self.dataset]
        return sorted(range(len(self.dataset)), key=lambda index: self.lengths[index])

    # The __iter__ function is called once per epoch. The returned iterator is iterated on to get the list of indices 
    # for the data points in a batch.
    def __iter__(self) -> Generator[list[int], None, None]:
        """Provides an iterator that yields the indices of the dataset in the order of the sentence lengths.

        Returns:
            Generator: A generator object that yields the indices of the dataset in the order of the sentence lengths.

        Yields:
            Generator[list[int], None, None]: A generator object that yields the indices of the dataset in the order
                                              of the sentence lengths.
        """
        # Create the batches of indices based on the sentence lengths. 
        # batches look like: [[0, 5, 90], [23, 4, 5], ...] if batch_size is 3.
        # [0, 5, 90] is a batch corresponding to the sentences at indices 0, 5 and 90 in the original dataset.
        # [23, 4, 5] is a batch corresponding to the sentences at indices 23, 4 and 5 in the original dataset.
        batches = [self.lengths[index: index + self.batch_size] for index in range(0, len(self.dataset), self.batch_size)]
        # Shuffle the batches to ensure that the order of batches is different in every epoch. We want the model to 
        # see the data in different order in every epoch. So, we shuffle the order of the batches within the dataset.
        random.shuffle(batches)
        # Flatten the list of batches to get an iterable of indices. At the end, the dataloader expects an iterable of
        # indices to get the data points from the dataset. So, we convert the list of batches back to an iterable of 
        # indices.
        return iter([index for batch in batches for index in batch])


# Refer to 'step_6_dataloader_with_transformers.ipynb' notebook to understand how this class works.
def length_aware_collate_fn(batch, english_tokenizer: BaseTokenizer, telugu_tokenizer: BaseTokenizer, sos_id: int = 0, eos_id: int = 1, pad_id: int = 2):
    """Converts the raw data in the batch into the format required by the MachineTranslationTransformer model. It encodes the
       sentences into token ids, adds start, end and padding tokens and batches the converted data back to be used by the 
       model.

    Args:
        batch (_type_): Holds the raw data points (the actual english (src) and telugu (tgt) sentences batched) from the dataset.
        english_tokenizer (BaseTokenizer): Tokenizer to tokenize and encode the english sentences into corresponding token ids.
        telugu_tokenizer (BaseTokenizer): Tokenizer to tokenize and encode the english sentences into corresponding token ids.
        sos_id (int, optional): start of sentence token id. Defaults to 0.
        eos_id (int, optional): end of sentence token id. Defaults to 1.
        pad_id (int, optional): padding token id. Defaults to 2.

    Returns:
        Tuple[Tensor, Tensor]: Returns the encoded source and target tensors in the batch which can be used by the transformer model
                               as input.
    """
    # Holds all the encoded src sentences (english sentences) from the batch. encoded sentence means sentence divided 
    # into tokens and tokens converted into their integer ids.
    # [[0, 223, 4345, 545, 1], [0, 23, 234, 67, 1]] is an example for the processed_src_sentences variable where
    # [0, 223, 4345, 545, 1] represents an ecodeded sentence from the batch and 0 at the start is <sos> and 1 at the 
    # end is <eos>. 
    processed_src_sentences = []
    # Holds all the encoded tgt sentences (telugu sentences) from the batch.
    processed_tgt_sentences = []
    for data_point in batch:
        # src is english sentence.
        src_sentence = data_point["src"]
        # tgt is telugu sentence.
        tgt_sentence = data_point["tgt"]
        # start of sentence id to append at the start of every sentence.
        sos_tensor = torch.tensor([sos_id], dtype=torch.int64)
        # end of sentence id to append at the end of every sentence.
        eos_tensor = torch.tensor([eos_id], dtype=torch.int64)
        # It is important to set the dtype to 'torch.int64' because we map token_ids to their embeddings in the transformer model.
        # '<sos>' and '<eos>' tokens are not added to the src sentences. They are only added to the target sentences.
        encoded_src_sentence = torch.tensor(english_tokenizer.encode(src_sentence), dtype=torch.int64)
        # prepares the tensor in the format 'token_id(<sos>) token_id1 token_id2 ... last_token_id token_id(<eos>)'. 
        encoded_tgt_sentence = torch.cat([sos_tensor, torch.tensor(telugu_tokenizer.encode(tgt_sentence), dtype=torch.int64), eos_tensor], dim=0)
        processed_src_sentences.append(encoded_src_sentence)
        processed_tgt_sentences.append(encoded_tgt_sentence)
    # find the maximum length (max_len) of the sentences in the batch so that sentences are padded (if needed) to get all the
    # sentences to the same length i.e., max_len.
    max_len = max(max(src_ids.size(0) for src_ids in processed_src_sentences), max(tgt_ids.size(0) for tgt_ids in processed_tgt_sentences))
    # We pad the sentences with pad token so that every sentence in the batch is of same length. Also, notice 
    # that the pad token is appended after (not before) the <eos> token is appended to every sentence.
    src_ids = [torch.nn.functional.pad(input=src_ids, pad=(0, max_len - src_ids.size(0)), mode="constant", value=pad_id) for src_ids in processed_src_sentences]
    tgt_ids = [torch.nn.functional.pad(input=tgt_ids, pad=(0, max_len - tgt_ids.size(0)), mode="constant", value=pad_id) for tgt_ids in processed_tgt_sentences]
    # stack the src tensors along dimension 0. This then becomes a 2D tensor of shape (BATCH_SIZE, MAX_SENTENCE_LENGTH).
    src = torch.stack(tensors=src_ids, dim=0)
    tgt = torch.stack(tensors=tgt_ids, dim=0)
    return (src, tgt)

# Model Training

In [8]:
@dataclass
class TrainState:
    epoch_num: int
    learning_rate: float
    training_loss: float
    num_tokens_processed: int
    model_checkpoint_path: str
    validation_loss: Optional[float] = None

class ModelState:
    def __init__(self):
        self.state = []

    def append_state(self, train_state: TrainState):
        self.state.append(train_state)
    
    def plot_loss_variation(self):
        pass

In [9]:
def PrintTrainState(train_state: TrainState):
    print("epoch_num: ", train_state.epoch_num)
    print("learning_rate: ", train_state.learning_rate)
    print("num_tokens_processed until this epoch: ", train_state.num_tokens_processed)
    print("model_checkpoint_path: ", train_state.model_checkpoint_path)
    print("training_loss: ", train_state.training_loss)
    print("validation_loss: ", train_state.validation_loss)
    print("-" * 150)

In [10]:
def create_tokenizers(train_dataset_path: str) -> Tuple[BaseTokenizer, BaseTokenizer]:
    translation_train_dataset = datasets.load_from_disk(train_dataset_path)
    telugu_tokenizer: BaseTokenizer = SpacyTokenizer(language="telugu")
    telugu_tokenizer.initialize_tokenizer_and_build_vocab(data_iterator=translation_train_dataset, text_extractor=text_extractor, max_vocab_size=SRC_VOCAB_SIZE)
    english_tokenizer: BaseTokenizer = SpacyTokenizer(language="english")
    english_tokenizer.initialize_tokenizer_and_build_vocab(data_iterator=translation_train_dataset, text_extractor=text_extractor, max_vocab_size=TGT_VOCAB_SIZE)
    return english_tokenizer, telugu_tokenizer

In [22]:
def create_dataloader(dataset_path: str, english_tokenizer: BaseTokenizer, telugu_tokenizer: BaseTokenizer):
    # Load the train dataset we have created and saved to disk in step_1_data_exploration.ipynb. 
    translation_dataset = datasets.load_from_disk(dataset_path)
    wrapped_translation_dataset = HuggingFaceDatasetWrapper(hf_dataset=translation_dataset)
    def collate_fn(batch):
        """The collate_fn is called by the DataLoader with just the batch of data points from the dataset. So, we wrap the 
        length_aware_collate_fn function with the required parameters to create a collate_fn that can be used by the 
        DataLoader.

        Args:
            batch (_type_): Batch of raw data points from the dataset.

        Returns:
            _type_: Returns the batch of data points in the format required by the MachineTranslationTransformer model.
        """
        sos_id = english_tokenizer.get_token_id(token="<sos>")
        eos_id = english_tokenizer.get_token_id(token="<eos>")
        pad_id = telugu_tokenizer.get_token_id(token="<pad>")
        return length_aware_collate_fn(batch=batch, 
                                       english_tokenizer=english_tokenizer, 
                                       telugu_tokenizer=telugu_tokenizer, 
                                       sos_id=sos_id, 
                                       eos_id=eos_id, 
                                       pad_id=pad_id)
    length_aware_sampler = LengthAwareSampler(dataset=wrapped_translation_dataset, batch_size=32)
    return DataLoader(dataset=wrapped_translation_dataset, batch_size=128, sampler=length_aware_sampler, num_workers=0, collate_fn=collate_fn)

In [12]:
def run_epoch(machine_translation_model: MachineTranslationTransformer, 
              train_dataLoader: DataLoader, 
              optimizer: torch.optim.Optimizer, 
              lr_scheduler: torch.optim.lr_scheduler._LRScheduler,
              loss_compute: LossCompute,
              label_smoothing: LabelSmoothing,
              epoch_num: int,
              pad_token_id: Optional[int] = PAD_TOKEN_ID) -> TrainState:
    # Holds the learning rate at the start of the epoch.
    cur_learning_rate = lr_scheduler.get_last_lr()
    num_tokens_processed = 0
    for src_batch, tgt_batch in train_dataLoader:
        # zero out the gradients.
        optimizer.zero_grad()
        # Create Batch from the input.
        batch = Batch(src_batch=src_batch, tgt_batch=tgt_batch, pad_token_id=pad_token_id)
        num_tokens_processed += batch.non_pad_tokens
        # Forward pass of the machine translation model. Returns the predicted probability distributions for each token
        # in the target sentences.
        predicted_tgt_log_probability_distributions = machine_translation_model(src=batch.src, 
                                                                                tgt=batch.tgt_decoder_input, 
                                                                                src_mask=batch.src_mask, 
                                                                                tgt_mask=batch.tgt_mask)
        # Applies label smoothing to the expected token ids.
        expected_tgt_probability_distributions = label_smoothing(targets=batch.tgt_expected_decoder_output)
        # Computes the KLDivergence loss between predicted and expected outputs.
        loss = loss_compute(log_predictions=predicted_tgt_log_probability_distributions, 
                            targets=expected_tgt_probability_distributions, 
                            num_non_pad_tokens=int(batch.non_pad_tokens.item()))
        # Computes the gradients wrt to the loss.
        loss.backward()
        # Updates the weights with the calculated gradients.
        optimizer.step()
        # Updates the learning rate.
        lr_scheduler.step()
    # Save the model to disk.    
    model_checkpoint_path = f"{MODEL_CHECK_POINTS_PATH}/epoch_{epoch_num}_translation_model.pt"
    torch.save(machine_translation_model.state_dict(), model_checkpoint_path)
    train_state = TrainState(epoch_num=epoch_num, 
                             learning_rate=cur_learning_rate, 
                             training_loss=loss.item(), 
                             num_tokens_processed=num_tokens_processed, 
                             model_checkpoint_path=model_checkpoint_path)
    return train_state

In [13]:
def train_model(num_epochs: int, 
                english_tokenizer: BaseTokenizer, 
                telugu_tokenizer: BaseTokenizer, 
                pad_token_id: Optional[int]=PAD_TOKEN_ID) -> Tuple[MachineTranslationTransformer, ModelState]:
    translation_model = MachineTranslationTransformer(d_model=D_MODEL, 
                                                      d_feed_forward=D_FEED_FORWARD,
                                                      dropout_prob=DROPOUT_PROB, 
                                                      num_heads=NUM_HEADS, 
                                                      src_vocab_size=english_tokenizer.get_vocab_size(), 
                                                      tgt_vocab_size=telugu_tokenizer.get_vocab_size(), 
                                                      num_layers=NUM_LAYERS, 
                                                      max_seq_len=MAX_SEQ_LEN)
    adam_optimizer = torch.optim.Adam(params=translation_model.parameters(), lr=INITIAL_LEARNING_RATE, betas=(BETA_1, BETA_2), eps=EPSILON)
    rate_lambda = lambda step: rate(step, d_model=D_MODEL, warmup_steps=NUM_WARMUP_STEPS, factor=1.0)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=adam_optimizer, lr_lambda=rate_lambda)
    label_smoothing = LabelSmoothing(vocab_size=TGT_VOCAB_SIZE, padding_idx=PAD_TOKEN_ID, smoothing=SMOOTHING_PROB)
    loss_compute = LossCompute()
    model_state = ModelState()
    train_dataloader = create_dataloader(dataset_path=f"{AI4_BHARAT_DATA_PATH}/mini_train_dataset", english_tokenizer=english_tokenizer, telugu_tokenizer=telugu_tokenizer)
    for epoch in range(num_epochs):
        train_state = run_epoch(machine_translation_model=translation_model, 
                                train_dataLoader=train_dataloader, 
                                optimizer=adam_optimizer, 
                                lr_scheduler=lr_scheduler, 
                                loss_compute=loss_compute, 
                                label_smoothing=label_smoothing, 
                                epoch_num=epoch, 
                                pad_token_id=pad_token_id)
        model_state.append_state(train_state)
        PrintTrainState(train_state=train_state)
    return translation_model, model_state

In [18]:
english_tokenizer, telugu_tokenizer = create_tokenizers(train_dataset_path=f"{AI4_BHARAT_DATA_PATH}/mini_train_dataset")
# translation_model, model_state = train_model(num_epochs=NUM_EPOCHS, english_tokenizer=english_tokenizer, telugu_tokenizer=telugu_tokenizer, pad_token_id=PAD_TOKEN_ID)

In [29]:
train_dataloader = create_dataloader(dataset_path=f"{AI4_BHARAT_DATA_PATH}/mini_train_dataset", english_tokenizer=english_tokenizer, telugu_tokenizer=telugu_tokenizer)
data_iterator = iter(train_dataloader)
# batched_data = next(data_iterator)
index = 1
for batched_data in data_iterator:
    # print("shapes: ", batched_data[0].shape, batched_data[1].shape)
    if index == 34:
        print(batched_data[0].shape)
        print(batched_data)
    index += 1

torch.Size([128, 29])
(tensor([[  32,   11,  241,  ...,    2,    2,    2],
        [  12,  101,   18,  ...,    2,    2,    2],
        [ 720, 1742,   16,  ...,    2,    2,    2],
        ...,
        [1074,  344,   27,  ...,    2,    2,    2],
        [1089, 2361,   17,  ...,    2,    2,    2],
        [  12,   87,   40,  ...,    2,    2,    2]]), tensor([[   0,    3,    4,  ...,    2,    2,    2],
        [   0,    6, 1857,  ...,    2,    2,    2],
        [   0, 6323, 1191,  ...,    2,    2,    2],
        ...,
        [   0,    6,  256,  ...,    2,    2,    2],
        [   0,  203,  542,  ...,    2,    2,    2],
        [   0, 8902, 6946,  ...,    2,    2,    2]]))
