In [1]:
import os
import math
import torch
import pandas as pd
import numpy as np
from torch import nn
from torch.nn import Dropout
from torch.optim import Optimizer
import torch.nn.functional as F
from transformers import XLMTokenizer
from collections import Counter
from typing import List, Tuple, Dict, Callable, Optional
from copy import deepcopy

In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PAD = 0
UNK = 1
MAX_VOCAB_SIZE = 50000
SPECIAL_TOKENS_OFFSET = 2
NTRAIN = None
MAX_LENGTH = 100
BATCH_SIZE = 128
NEPOCH = 100

In [3]:
data =pd.read_csv("data/translation/en2fr.csv", index_col=0)

if NTRAIN is None:
    data = data.reset_index(drop=True)
else:
    data = data.reset_index(drop=True).iloc[0:NTRAIN]

tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
num_examples=len(data)

In [4]:
print(f'Number of examples available is {num_examples}.')

Number of examples available is 47173.


In [5]:
def create_dictionary(data: pd.DataFrame, tokenizer, language: str = 'en') -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Creates a word-to-index and index-to-word dictionary from tokenized sentences in a specified language.

    Args:
        data (pd.DataFrame): A pandas DataFrame containing text data. The column with sentences should be named according to the language.
        tokenizer: A tokenizer object with a `tokenize` method to tokenize each sentence.
        language (str): The language code for the column in the DataFrame to be tokenized (default is 'en' for English).

    Returns:
        Tuple[Dict[str, int], Dict[int, str]]: 
        - word_dict: A dictionary mapping each word to its unique index.
        - idx_dict: A dictionary mapping indices back to words.
    """
    # Extract sentences from the specified language column
    sentences = data[language]
    
    # Tokenize sentences, adding 'BOS' (Beginning of Sentence) and 'EOS' (End of Sentence) tokens
    tokens = [["BOS"] + tokenizer.tokenize(sentence) + ["EOS"] for sentence in sentences]
    
    # Count the frequency of each word in the tokenized sentences
    word_count = Counter(word for sentence in tokens for word in sentence)
    
    # Get the most common words up to the maximum vocabulary size
    most_common_words = word_count.most_common(MAX_VOCAB_SIZE)
    
    # Create word-to-index dictionary, starting from the offset to reserve space for special tokens
    word_dict = {word: idx + SPECIAL_TOKENS_OFFSET for idx, (word, _) in enumerate(most_common_words)}
    
    # Add special tokens to the dictionary
    word_dict["PAD"] = PAD
    word_dict["UNK"] = UNK
    
    # Create index-to-word dictionary (inverse of word_dict)
    idx_dict = {idx: word for word, idx in word_dict.items()}
    
    return tokens, word_dict, idx_dict


def idx_to_sentence(ids: List[int], idx_dict: Dict[int, str]) -> str:
    """
    Converts a list of word indices into a human-readable sentence using a dictionary mapping indices to words.

    Args:
        ids (List[int]): A list of integer word indices.
        idx_dict (Dict[int, str]): A dictionary mapping word indices to their corresponding words.

    Returns:
        str: The reconstructed sentence from the list of indices, with appropriate spacing and punctuation.
    """
    # Convert index list to words, defaulting to "UNK" for missing indices
    tokens = [idx_dict.get(i, "UNK") for i in ids]
    
    # Join tokens into a string and replace special tokens with spaces
    sentence = "".join(tokens).replace("</w>", " ")

    # Remove spaces before punctuation
    punctuation = '''?:;.,'("-!&)%'''
    for punct in punctuation:
        sentence = sentence.replace(f" {punct}", punct)

    # Return the cleaned sentence, ensuring no trailing whitespace
    return sentence.rstrip()


def seq_padding(X, padding=0):
    """
    Pads each sequence in the input list to the length of the longest sequence.

    This function takes a list of sequences (lists or arrays) and pads each sequence
    with a specified value (default is 0) so that all sequences have the same length. 
    The length of the padded sequences will be equal to the length of the longest sequence
    found in the input list.

    Parameters:
    X (list of list/array): A list containing sequences (lists or numpy arrays).
    padding (int, optional): The value to use for padding shorter sequences. Default is 0.

    Returns:
    np.ndarray: A 2D numpy array where each row corresponds to a padded sequence.
    """
    
    # Calculate the lengths of each sequence
    lengths = [len(x) for x in X]
    
    # Determine the maximum length
    max_length = max(lengths)
    
    # Pad sequences to the maximum length
    padded_sequences = np.array([
        np.concatenate([x, [padding] * (max_length - len(x))]) if len(x) < max_length else x 
        for x in X
    ])
    
    return padded_sequences

class Batch:
    """
    A class to represent a batch of source and target sequences for processing in a neural network.

    The Batch class handles the conversion of numpy arrays to PyTorch tensors and creates masks for the
    source and target sequences. It also computes the number of tokens in the target sequences that 
    are not padding.

    Attributes:
        src (torch.Tensor): The source sequences as a tensor.
        src_mask (torch.Tensor): A mask indicating the positions of the valid tokens in the source sequences.
        trg (torch.Tensor, optional): The target sequences (excluding the last token).
        trg_y (torch.Tensor, optional): The target sequences (excluding the first token).
        trg_mask (torch.Tensor, optional): A mask indicating the positions of valid tokens in the target sequences.
        ntokens (int, optional): The number of valid tokens in the target sequences.

    Parameters:
        src (numpy.ndarray): A numpy array representing the source sequences.
        trg (numpy.ndarray, optional): A numpy array representing the target sequences. Defaults to None.
        pad (int, optional): The padding token. Defaults to 0.
    """

    def __init__(self, src, trg=None, pad=0):
        
        # Convert numpy arrays to PyTorch tensors and move to the specified device
        self.src = torch.from_numpy(src).to(DEVICE).long()
        trg = torch.from_numpy(trg).to(DEVICE).long()
        
        # Create a mask for the source sequences
        self.src_mask = (self.src != pad).unsqueeze(-2)

        if trg is not None:
            # Convert target sequences to tensor and exclude the last token for input
            self.trg = trg[:, :-1]
            
            # Exclude the first token for output
            self.trg_y = trg[:, 1:]
            
            # Create a mask for the target sequences
            self.trg_mask = make_std_mask(self.trg, pad)
            
            # Count the number of valid tokens in the target sequences
            self.ntokens = (self.trg_y != pad).sum().item()
            
            
def subsequent_mask(sequence_length):
    """
    Generates a subsequent mask for self-attention in transformer models.

    This function creates a mask that prevents attention to future tokens in the sequence. 
    The mask is used in self-attention layers to ensure that each position can only attend
    to itself and the previous positions, enforcing an autoregressive property.

    Parameters:
        sequence_length (int): The length of the input sequence for which to generate the mask.

    Returns:
        torch.Tensor: A tensor of shape (1, sequence_length, sequence_length) where positions
                      corresponding to future tokens are masked (set to False) and positions 
                      corresponding to the current and past tokens are unmasked (set to True).
    """
    
    # Create a triangular mask with ones above the main diagonal
    mask_shape = (1, sequence_length, sequence_length)
    upper_triangle_mask = np.triu(np.ones(mask_shape), k=1).astype('uint8')
    
    # Convert the numpy array to a PyTorch tensor and create a boolean mask
    boolean_mask = torch.from_numpy(upper_triangle_mask) == 0
    
    return boolean_mask

def make_std_mask(target_tensor, pad_token):
    """
    Creates a standard mask for the target sequences in transformer models.

    This function generates a mask that allows attention to valid tokens in the target 
    sequence while masking out padding tokens and future tokens. The resulting mask can 
    be used in self-attention layers to ensure proper behavior during training and inference.

    Parameters:
        target_tensor (torch.Tensor): The tensor representing the target sequences.
        pad_token (int): The token value used for padding in the target sequences.

    Returns:
        torch.Tensor: A boolean mask of shape (batch_size, 1, target_length) where valid tokens are
                      marked as True and masked tokens (padding and future tokens) are marked as False.
    """
    
    # Create a mask to identify non-padding tokens
    valid_token_mask = (target_tensor != pad_token).unsqueeze(-2)
    
    # Create a subsequent mask to prevent attention to future tokens
    future_token_mask = subsequent_mask(target_tensor.size(-1)).type_as(valid_token_mask.data)
    
    # Combine the valid token mask with the future token mask
    combined_mask = valid_token_mask & future_token_mask
    
    return combined_mask

def create_batches(english_word_dict, english_tokens, french_word_dict, french_tokens):
    """
    Creates batches of tokenized English and French sentences for training.

    This function converts lists of tokenized sentences into numerical IDs using the provided 
    word dictionaries. It then sorts the sentences by length, shuffles the indices, and 
    groups the sentences into batches of a specified size. Finally, it pads the sequences 
    within each batch to ensure uniformity.

    Parameters:
        english_word_dict (dict): A dictionary mapping English words to their corresponding IDs.
        english_tokens (list of list of str): A list of tokenized English sentences.
        french_word_dict (dict): A dictionary mapping French words to their corresponding IDs.
        french_tokens (list of list of str): A list of tokenized French sentences.

    Returns:
        list: A list of Batch objects, each containing padded English and French sequences.
    """
    
    # Convert English tokens to IDs using the provided dictionary, using 1 for unknown words
    english_ids = [[english_word_dict.get(word, 1) for word in sentence] for sentence in english_tokens]
    
    # Convert French tokens to IDs using the provided dictionary, using 1 for unknown words
    french_ids = [[french_word_dict.get(word, 1) for word in sentence] for sentence in french_tokens]
    
    # Sort the IDs by the length of the sentences
    sorted_indices = sorted(range(len(english_ids)), key=lambda index: len(english_ids[index]))
    english_ids = [english_ids[index] for index in sorted_indices]
    french_ids = [french_ids[index] for index in sorted_indices]

    # Create a list of shuffled indices for batching
    index_list = np.arange(0, len(english_tokens), BATCH_SIZE)
    np.random.shuffle(index_list)

    batch_indices = []
    for index in index_list:
        batch_indices.append(np.arange(index, min(len(english_tokens), index + BATCH_SIZE)))

    batches = []
    for indices in batch_indices:
        batch_english = [english_ids[index] for index in indices]
        batch_french = [french_ids[index] for index in indices]
        
        # Pad the sequences to the maximum length in the batch
        padded_batch_english = seq_padding(batch_english)
        padded_batch_french = seq_padding(batch_french)
        
        # Create a Batch object with the padded sequences
        batches.append(Batch(padded_batch_english, padded_batch_french))

    return batches


class Embeddings(nn.Module):
    """
    A class to create word embeddings using PyTorch's nn.Embedding.

    This class generates dense vector representations for words in a vocabulary. 
    It initializes an embedding layer and scales the output by the square root of the model dimension.

    Attributes:
        embedding_layer (nn.Embedding): The embedding layer mapping vocabulary indices to dense vectors.
        model_dimension (int): The dimensionality of the embedding space.
    
    Parameters:
        d_model (int): The size of each embedding vector.
        vocab_size (int): The number of unique tokens in the vocabulary.
    """

    def __init__(self, model_dimension, vocab_size):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, model_dimension)
        self.model_dimension = model_dimension

    def forward(self, input_indices):
        """
        Forward pass for generating embeddings.

        This method takes a tensor of input indices and retrieves their corresponding 
        embedding vectors from the embedding layer. The output is scaled by the square 
        root of the model dimension to stabilize gradients.

        Parameters:
            input_indices (torch.Tensor): A tensor containing the indices of the words to be embedded.

        Returns:
            torch.Tensor: A tensor of shape (batch_size, sequence_length, d_model) containing 
                          the scaled embeddings.
        """
        embeddings = self.embedding_layer(input_indices) * math.sqrt(self.model_dimension)
        return embeddings
    
class PositionalEncoding(nn.Module):
    """
    A class to apply positional encoding to input sequences.

    Positional encoding is used in transformer models to provide information about the position of each token
    in the sequence. This implementation generates sinusoidal positional encodings and applies dropout for regularization.

    Attributes:
        dropout_layer (nn.Dropout): A dropout layer for regularization.
        positional_encodings (torch.Tensor): Precomputed positional encodings.

    Parameters:
        model_dimension (int): The size of each embedding vector (d_model).
        dropout_prob (float): The dropout probability for regularization.
        max_length (int, optional): The maximum length of input sequences. Defaults to 5000.
    """

    def __init__(self, model_dimension, dropout_prob, max_length=5000):
        super().__init__()
        self.dropout_layer = nn.Dropout(p=dropout_prob)
        
        # Create a tensor for positional encodings
        positional_encodings = torch.zeros(max_length, model_dimension, device=DEVICE)
        
        # Compute the positional indices
        position = torch.arange(0., max_length, device=DEVICE).unsqueeze(1)

        # Calculate the divisor term for sine and cosine functions
        div_term = torch.exp(torch.arange(0., model_dimension, 2, device=DEVICE) * 
                             -(math.log(10000.0) / model_dimension))
        
        # Compute the positional encodings using sine and cosine functions
        positional_encodings_pos = torch.mul(position, div_term)
        positional_encodings[:, 0::2] = torch.sin(positional_encodings_pos)
        positional_encodings[:, 1::2] = torch.cos(positional_encodings_pos)
        
        # Add a batch dimension
        positional_encodings = positional_encodings.unsqueeze(0)
        
        # Register positional encodings as a persistent buffer
        self.register_buffer('positional_encodings', positional_encodings)

    def forward(self, input_tensor):
        """
        Forward pass to add positional encodings to input sequences.

        This method adds the precomputed positional encodings to the input tensor, 
        scales it with dropout for regularization, and returns the resulting tensor.

        Parameters:
            input_tensor (torch.Tensor): A tensor containing the input embeddings of shape 
                                          (batch_size, sequence_length, d_model).

        Returns:
            torch.Tensor: A tensor of the same shape as input_tensor, with positional encodings added.
        """
        # Add positional encodings to the input tensor
        input_tensor = input_tensor + self.positional_encodings[:, :input_tensor.size(1)].requires_grad_(False)
        
        # Apply dropout
        output_tensor = self.dropout_layer(input_tensor)
        return output_tensor

## 1: Data Processing

In [6]:
en_tokens, en_word_dict, en_idx_dict = create_dictionary(data, tokenizer, language='en')
fr_tokens, fr_word_dict, fr_idx_dict = create_dictionary(data, tokenizer, language='fr')

In [7]:
tokenized_en = tokenizer.tokenize("how are you?")
tokenized_fr = tokenizer.tokenize("comment etes-vous?")

ids_en = [en_word_dict.get(i,UNK) for i in tokenized_en] 
ids_fr = [fr_word_dict.get(i,UNK) for i in tokenized_fr] 

In [8]:
idx_to_sentence(ids_en, en_idx_dict)

'how are you?'

In [9]:
idx_to_sentence(ids_fr, fr_idx_dict)

'comment etes-vous?'

## 2: Batch Creation

In [10]:
batches = create_batches(en_word_dict, en_tokens, fr_word_dict, fr_tokens)

## 3: Word Embeddings

In [11]:
src_vocab = len(en_word_dict)
tgt_vocab = len(fr_word_dict)
print(f"There are {src_vocab} distinct En tokens.")
print(f"There are {tgt_vocab} distinct Fr tokens.")

There are 11055 distinct En tokens.
There are 11239 distinct Fr tokens.


In [12]:
example = PositionalEncoding(256, 0.1)
inputs = torch.zeros(1, 8, 256).to(DEVICE)
outputs = example.forward(inputs)
print(f"The shape of positional encoding is {outputs.shape}")

The shape of positional encoding is torch.Size([1, 8, 256])


## 4: Transformer

In [13]:
def attention(query: torch.Tensor, 
              key: torch.Tensor, 
              value: torch.Tensor, 
              mask: Optional[torch.Tensor] = None, 
              dropout_layer: Optional[Dropout] = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Computes the scaled dot-product attention.

    This function calculates the attention scores by performing a dot product between 
    the query and key tensors, applies an optional mask to the scores, and generates 
    the attention weights. It then uses these weights to compute the weighted sum of the 
    value tensor. Dropout can also be applied to the attention weights for regularization.

    Parameters:
        query (torch.Tensor): The query tensor of shape (batch_size, num_heads, seq_length, d_k).
        key (torch.Tensor): The key tensor of shape (batch_size, num_heads, seq_length, d_k).
        value (torch.Tensor): The value tensor of shape (batch_size, num_heads, seq_length, d_v).
        mask (Optional[torch.Tensor], optional): A mask tensor to prevent attention to certain positions.
                                                  Should have shape (batch_size, 1, seq_length, seq_length).
        dropout_layer (Optional[Dropout], optional): A dropout layer to apply to the attention weights.
                                                      If None, no dropout is applied.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The output tensor of shape (batch_size, num_heads, seq_length, d_v) which is the result of 
              the weighted sum of the value tensor.
            - The attention weights tensor of shape (batch_size, num_heads, seq_length, seq_length).
    """
    
    # Get the dimensionality of the keys
    depth_key = query.size(-1)
    
    # Compute the attention scores
    attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(depth_key)
    
    # Apply the mask if provided
    if mask is not None:
        attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
    
    # Calculate the attention weights
    attention_weights = F.softmax(attention_scores, dim=-1)
    
    # Apply dropout if specified
    if dropout_layer is not None:
        attention_weights = dropout_layer(attention_weights)
    
    # Compute the output as the weighted sum of the value tensor
    output = torch.matmul(attention_weights, value)
    
    return output, attention_weights


class MultiHeadedAttention(nn.Module):
    """
    Implements multi-headed attention mechanism.

    This class utilizes multiple attention heads to compute attention scores in parallel, 
    allowing the model to focus on different parts of the input sequence. The attention 
    outputs from each head are concatenated and linearly transformed.

    Attributes:
        num_heads (int): The number of attention heads.
        depth_per_head (int): The dimensionality of each attention head.
        linear_layers (nn.ModuleList): A list of linear transformation layers.
        attention_weights (torch.Tensor, optional): The attention weights from the last forward pass.
        dropout_layer (nn.Dropout): Dropout layer for regularization.

    Parameters:
        num_heads (int): The number of attention heads.
        model_dimension (int): The dimensionality of the input and output (d_model).
        dropout_prob (float): The dropout probability for regularization.
    """

    def __init__(self, num_heads: int, model_dimension: int, dropout_prob: float = 0.1):
        super().__init__()
        assert model_dimension % num_heads == 0, "Model dimension must be divisible by the number of heads."
        
        self.depth_per_head = model_dimension // num_heads
        self.num_heads = num_heads
        
        # Create linear layers for query, key, value, and output transformations
        self.linear_layers = nn.ModuleList([deepcopy(nn.Linear(model_dimension, model_dimension)) for _ in range(4)])
        self.attention_weights = None
        self.dropout_layer = nn.Dropout(p=dropout_prob)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Forward pass for multi-headed attention.

        This method computes the attention output for the provided query, key, and value tensors.
        It applies the attention mechanism, utilizes an optional mask, and combines the results
        from multiple attention heads.

        Parameters:
            query (torch.Tensor): The query tensor of shape (batch_size, seq_length, d_model).
            key (torch.Tensor): The key tensor of shape (batch_size, seq_length, d_model).
            value (torch.Tensor): The value tensor of shape (batch_size, seq_length, d_model).
            mask (Optional[torch.Tensor]): An optional mask tensor to prevent attending to certain positions.

        Returns:
            torch.Tensor: The output tensor after applying multi-headed attention of shape 
                          (batch_size, seq_length, d_model).
        """
        
        if mask is not None:
            mask = mask.unsqueeze(1)  # Add a dimension for multi-head attention

        batch_size = query.size(0)

        # Apply linear transformations and reshape for multi-head attention
        query, key, value = [
            linear_layer(x).view(batch_size, -1, self.num_heads, self.depth_per_head).transpose(1, 2)
            for linear_layer, x in zip(self.linear_layers, (query, key, value))
        ]

        # Calculate attention using the helper function
        attention_output, self.attention_weights = attention(query, key, value, mask=mask, dropout_layer=self.dropout_layer)
        
        # Reshape and concatenate the outputs from all heads
        output = attention_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.depth_per_head)
        
        # Apply the final linear transformation
        output = self.linear_layers[-1](output)
        
        return output
    
class PositionwiseFeedForward(nn.Module):
    """
    Implements a position-wise feed-forward neural network.

    This class represents a feed-forward network used in transformer models. 
    It consists of two linear transformations with a dropout layer in between. 
    The first linear transformation increases the dimensionality, while the second 
    reduces it back to the original model dimension.

    Attributes:
        linear_layer1 (nn.Linear): The first linear transformation layer.
        linear_layer2 (nn.Linear): The second linear transformation layer.
        dropout_layer (nn.Dropout): Dropout layer for regularization.

    Parameters:
        model_dimension (int): The dimensionality of the input (d_model).
        feedforward_dimension (int): The dimensionality of the feed-forward layer (d_ff).
        dropout_prob (float): The dropout probability for regularization.
    """

    def __init__(self, model_dimension: int, feedforward_dimension: int, dropout_prob: float = 0.1):
        super().__init__()
        self.linear_layer1 = nn.Linear(model_dimension, feedforward_dimension)
        self.linear_layer2 = nn.Linear(feedforward_dimension, model_dimension)
        self.dropout_layer = nn.Dropout(dropout_prob)

    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the position-wise feed-forward network.

        This method takes an input tensor and applies two linear transformations 
        with a dropout layer in between.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).

        Returns:
            torch.Tensor: The output tensor of the same shape as input_tensor, after applying 
                          the feed-forward network transformations.
        """
        
        # Apply the first linear transformation
        intermediate_output = self.linear_layer1(input_tensor)
        
        # Apply dropout to the intermediate output
        dropped_output = self.dropout_layer(intermediate_output)
        
        # Apply the second linear transformation
        output_tensor = self.linear_layer2(dropped_output)
        
        return output_tensor
    
class Encoder(nn.Module):
    """
    Implements the encoder part of a transformer model.

    This class consists of multiple layers of the transformer encoder, 
    each of which applies a self-attention mechanism followed by 
    a feed-forward neural network. A layer normalization is applied 
    at the end of the encoder.

    Attributes:
        layers (nn.ModuleList): A list containing multiple encoder layers.
        layer_norm (nn.LayerNorm): Layer normalization applied to the output.

    Parameters:
        encoder_layer (nn.Module): A single encoder layer instance.
        num_layers (int): The number of encoder layers in the encoder stack.
    """

    def __init__(self, encoder_layer: nn.Module, num_layers: int):
        super().__init__()
        self.layers = nn.ModuleList([deepcopy(encoder_layer) for _ in range(num_layers)])
        self.layer_norm = nn.LayerNorm(encoder_layer.layer_size)

    def forward(self, input_tensor: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the encoder.

        This method applies each encoder layer sequentially to the input tensor 
        and then applies layer normalization to the final output.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).
            mask (torch.Tensor): A mask tensor to prevent attending to certain positions.

        Returns:
            torch.Tensor: The normalized output tensor after passing through all encoder layers.
        """
        
        for encoder_layer in self.layers:
            input_tensor = encoder_layer(input_tensor, mask)
        
        output_tensor = self.layer_norm(input_tensor)
        return output_tensor
    
class Decoder(nn.Module):
    """
    Implements the decoder part of a transformer model.

    This class consists of multiple layers of the transformer decoder,
    each applying self-attention and encoder-decoder attention mechanisms.
    A layer normalization is applied to the output of the decoder.

    Attributes:
        layers (nn.ModuleList): A list containing multiple decoder layers.
        layer_norm (nn.LayerNorm): Layer normalization applied to the output.

    Parameters:
        decoder_layer (nn.Module): A single decoder layer instance.
        num_layers (int): The number of decoder layers in the decoder stack.
    """

    def __init__(self, decoder_layer: nn.Module, num_layers: int):
        super().__init__()
        self.layers = nn.ModuleList([deepcopy(decoder_layer) for _ in range(num_layers)])
        self.layer_norm = nn.LayerNorm(decoder_layer.layer_size)

    def forward(self, input_tensor: torch.Tensor, memory_tensor: torch.Tensor, src_mask: torch.Tensor, tgt_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the decoder.

        This method applies each decoder layer sequentially to the input tensor 
        using the provided memory tensor, source mask, and target mask. 
        Layer normalization is applied to the final output.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).
            memory_tensor (torch.Tensor): The output from the encoder to attend to.
            src_mask (torch.Tensor): A mask tensor to prevent attending to certain positions in the source.
            tgt_mask (torch.Tensor): A mask tensor to prevent attending to certain positions in the target.

        Returns:
            torch.Tensor: The normalized output tensor after passing through all decoder layers.
        """
        
        for decoder_layer in self.layers:
            input_tensor = decoder_layer(input_tensor, memory_tensor, src_mask, tgt_mask)
        
        output_tensor = self.layer_norm(input_tensor)
        return output_tensor
    

class SublayerConnection(nn.Module):
    """
    Implements a sublayer connection for transformer models.

    This class combines the output of a sublayer (like self-attention or feed-forward layers)
    with the input tensor through a residual connection followed by layer normalization and dropout.

    Attributes:
        layer_norm (nn.LayerNorm): Layer normalization applied to the input.
        dropout_layer (nn.Dropout): Dropout layer for regularization.

    Parameters:
        size (int): The dimensionality of the input tensor (d_model).
        dropout_prob (float): The dropout probability for regularization.
    """

    def __init__(self, size: int, dropout_prob: float):
        super().__init__()
        self.layer_norm = nn.LayerNorm(size)
        self.dropout_layer = nn.Dropout(dropout_prob)

    def forward(self, input_tensor: torch.Tensor, sublayer: Callable[[torch.Tensor], torch.Tensor]) -> torch.Tensor:
        """
        Forward pass through the sublayer connection.

        This method applies layer normalization to the input tensor, 
        passes it through a sublayer, applies dropout, and then 
        adds the original input tensor (residual connection) 
        to produce the output tensor.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).
            sublayer (Callable[[torch.Tensor], torch.Tensor]): A function that takes a tensor as input
                and returns a tensor as output, representing the sublayer operation.

        Returns:
            torch.Tensor: The output tensor after applying the sublayer connection.
        """
        
        normalized_input = self.layer_norm(input_tensor)
        sublayer_output = sublayer(normalized_input)
        dropped_output = self.dropout_layer(sublayer_output)
        output_tensor = input_tensor + dropped_output
        
        return output_tensor
    
class EncoderLayer(nn.Module):
    """
    Implements a single layer of the transformer encoder.

    This class contains a self-attention mechanism followed by a feed-forward network. 
    Each operation is wrapped in a sublayer connection to apply layer normalization 
    and residual connections.

    Attributes:
        self_attention (nn.Module): The self-attention mechanism for this layer.
        feed_forward_network (nn.Module): The feed-forward network for this layer.
        sublayer_connections (nn.ModuleList): A list of sublayer connections for the self-attention 
                                               and feed-forward operations.
        layer_size (int): The dimensionality of the input and output tensors.

    Parameters:
        layer_size (int): The dimensionality of the input tensor (d_model).
        self_attention (nn.Module): An instance of the self-attention mechanism.
        feed_forward_network (nn.Module): An instance of the feed-forward network.
        dropout_prob (float): The dropout probability for regularization.
    """

    def __init__(self, layer_size: int, self_attention: nn.Module, feed_forward_network: nn.Module, dropout_prob: float):
        super().__init__()
        self.self_attention = self_attention
        self.feed_forward_network = feed_forward_network
        self.sublayer_connections = nn.ModuleList([
            deepcopy(SublayerConnection(layer_size, dropout_prob)) for _ in range(2)
        ])
        self.layer_size = layer_size

    def forward(self, input_tensor: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the encoder layer.

        This method applies self-attention to the input tensor, followed by a 
        feed-forward network. Each operation is wrapped in a sublayer connection 
        to ensure proper normalization and residual connections.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).
            mask (torch.Tensor): A mask tensor to prevent attending to certain positions.

        Returns:
            torch.Tensor: The output tensor after processing through the encoder layer.
        """
        
        # Apply self-attention with residual connection
        attended_output = self.sublayer_connections[0](
            input_tensor, lambda x: self.self_attention(x, x, x, mask)
        )
        
        # Apply feed-forward network with residual connection
        output_tensor = self.sublayer_connections[1](attended_output, self.feed_forward_network)
        
        return output_tensor
    
class DecoderLayer(nn.Module):
    """
    Implements a single layer of the transformer decoder.

    This class contains mechanisms for self-attention and encoder-decoder attention,
    followed by a feed-forward network. Each operation is wrapped in a sublayer connection
    to apply layer normalization and residual connections.

    Attributes:
        layer_size (int): The dimensionality of the input and output tensors (d_model).
        self_attention (nn.Module): The self-attention mechanism for this layer.
        source_attention (nn.Module): The attention mechanism that attends to encoder outputs.
        feed_forward_network (nn.Module): The feed-forward network for this layer.
        sublayer_connections (nn.ModuleList): A list of sublayer connections for self-attention,
                                               source attention, and feed-forward operations.

    Parameters:
        layer_size (int): The dimensionality of the input tensor (d_model).
        self_attention (nn.Module): An instance of the self-attention mechanism.
        source_attention (nn.Module): An instance of the encoder-decoder attention mechanism.
        feed_forward_network (nn.Module): An instance of the feed-forward network.
        dropout_prob (float): The dropout probability for regularization.
    """

    def __init__(self, layer_size: int, self_attention: nn.Module, source_attention: nn.Module,
                 feed_forward_network: nn.Module, dropout_prob: float):
        super().__init__()
        self.layer_size = layer_size
        self.self_attention = self_attention
        self.source_attention = source_attention
        self.feed_forward_network = feed_forward_network
        self.sublayer_connections = nn.ModuleList([
            deepcopy(SublayerConnection(layer_size, dropout_prob)) for _ in range(3)
        ])

    def forward(self, input_tensor: torch.Tensor, memory_tensor: torch.Tensor, 
                src_mask: torch.Tensor, tgt_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the decoder layer.

        This method applies self-attention to the input tensor, followed by 
        attention to the encoder output (memory), and finally applies a 
        feed-forward network. Each operation is wrapped in a sublayer 
        connection to ensure proper normalization and residual connections.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).
            memory_tensor (torch.Tensor): The output from the encoder to attend to.
            src_mask (torch.Tensor): A mask tensor to prevent attending to certain positions in the source.
            tgt_mask (torch.Tensor): A mask tensor to prevent attending to certain positions in the target.

        Returns:
            torch.Tensor: The output tensor after processing through the decoder layer.
        """
        
        # Apply self-attention with residual connection
        attended_output = self.sublayer_connections[0](
            input_tensor, lambda x: self.self_attention(x, x, x, tgt_mask)
        )
        
        # Apply source attention with residual connection
        attended_output = self.sublayer_connections[1](
            attended_output, lambda x: self.source_attention(x, memory_tensor, memory_tensor, src_mask)
        )
        
        # Apply feed-forward network with residual connection
        output_tensor = self.sublayer_connections[2](attended_output, self.feed_forward_network)
        
        return output_tensor
    
class Transformer(nn.Module):
    """
    Implements the Transformer architecture as described in "Attention is All You Need".

    This class consists of an encoder and a decoder, with embedding layers for 
    source and target sequences, as well as a generator for output predictions.

    Attributes:
        encoder (nn.Module): The encoder component of the Transformer.
        decoder (nn.Module): The decoder component of the Transformer.
        source_embedding (nn.Module): The embedding layer for source sequences.
        target_embedding (nn.Module): The embedding layer for target sequences.
        generator (nn.Module): The final layer for generating predictions from the decoder output.

    Parameters:
        encoder (nn.Module): An instance of the encoder module.
        decoder (nn.Module): An instance of the decoder module.
        source_embedding (nn.Module): An instance of the source embedding layer.
        target_embedding (nn.Module): An instance of the target embedding layer.
        generator (nn.Module): An instance of the generator layer for producing output.
    """

    def __init__(self, encoder: nn.Module, decoder: nn.Module,
                 source_embedding: nn.Module, target_embedding: nn.Module, 
                 generator: nn.Module):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding = source_embedding
        self.target_embedding = target_embedding
        self.generator = generator

    def encode(self, source: torch.Tensor, source_mask: torch.Tensor) -> torch.Tensor:
        """
        Encodes the source sequences using the encoder.

        This method applies the source embedding followed by the encoder.

        Parameters:
            source (torch.Tensor): The input source tensor of shape (batch_size, src_seq_length).
            source_mask (torch.Tensor): A mask tensor to prevent attending to certain positions in the source.

        Returns:
            torch.Tensor: The encoded representation of the source sequences.
        """
        embedded_source = self.source_embedding(source)
        return self.encoder(embedded_source, source_mask)

    def decode(self, memory: torch.Tensor, source_mask: torch.Tensor, 
               target: torch.Tensor, target_mask: torch.Tensor) -> torch.Tensor:
        """
        Decodes the target sequences using the decoder.

        This method applies the target embedding followed by the decoder using the encoder's output as memory.

        Parameters:
            memory (torch.Tensor): The encoded representation from the encoder.
            source_mask (torch.Tensor): A mask tensor for the source.
            target (torch.Tensor): The input target tensor of shape (batch_size, tgt_seq_length).
            target_mask (torch.Tensor): A mask tensor to prevent attending to certain positions in the target.

        Returns:
            torch.Tensor: The output tensor after decoding the target sequences.
        """
        embedded_target = self.target_embedding(target)
        return self.decoder(embedded_target, memory, source_mask, target_mask)

    def forward(self, source: torch.Tensor, target: torch.Tensor, 
                source_mask: torch.Tensor, target_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the Transformer.

        This method encodes the source sequences and then decodes the target sequences
        to produce the final output.

        Parameters:
            source (torch.Tensor): The input source tensor of shape (batch_size, src_seq_length).
            target (torch.Tensor): The input target tensor of shape (batch_size, tgt_seq_length).
            source_mask (torch.Tensor): A mask tensor for the source.
            target_mask (torch.Tensor): A mask tensor for the target.

        Returns:
            torch.Tensor: The output tensor after processing the source and target sequences.
        """
        memory = self.encode(source, source_mask)
        output = self.decode(memory, source_mask, target, target_mask)
        return output
    
class Generator(nn.Module):
    """
    A linear projection layer followed by a log softmax activation.

    This class is used to convert the output of the Transformer model into 
    a probability distribution over the target vocabulary.

    Attributes:
        projection_layer (nn.Linear): A linear layer that projects the model output to the vocabulary size.

    Parameters:
        model_dimension (int): The dimensionality of the model (d_model).
        vocabulary_size (int): The size of the target vocabulary.
    """

    def __init__(self, model_dimension: int, vocabulary_size: int):
        super().__init__()
        self.projection_layer = nn.Linear(model_dimension, vocabulary_size)

    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the generator.

        This method applies a linear transformation to the input tensor and 
        then applies the log softmax function to produce probabilities.

        Parameters:
            input_tensor (torch.Tensor): The input tensor of shape (batch_size, seq_length, d_model).

        Returns:
            torch.Tensor: A tensor containing the log probabilities of shape (batch_size, seq_length, vocab_size).
        """
        linear_output = self.projection_layer(input_tensor)
        log_probs = nn.functional.log_softmax(linear_output, dim=-1)
        return log_probs

In [14]:
def create_transformer_model(source_vocab_size: int, target_vocab_size: int, 
                              num_layers: int, model_dimension: int,
                              feed_forward_dimension: int, num_heads: int, 
                              dropout_rate: float = 0.1) -> nn.Module:
    """
    Creates a Transformer model for sequence-to-sequence tasks.

    This function initializes all the components of the Transformer architecture,
    including multi-headed attention, position-wise feed-forward networks, 
    positional encodings, an encoder, a decoder, and a generator for output.

    Parameters:
        source_vocab_size (int): The size of the source vocabulary.
        target_vocab_size (int): The size of the target vocabulary.
        num_layers (int): The number of encoder and decoder layers.
        model_dimension (int): The dimensionality of the model (d_model).
        feed_forward_dimension (int): The dimensionality of the feed-forward network (d_ff).
        num_heads (int): The number of attention heads.
        dropout_rate (float, optional): The dropout rate. Defaults to 0.1.

    Returns:
        nn.Module: An instance of the Transformer model.
    """
    # Initialize components of the Transformer
    multi_head_attention = MultiHeadedAttention(num_heads, model_dimension).to(DEVICE)
    feed_forward_network = PositionwiseFeedForward(model_dimension, feed_forward_dimension, dropout_rate).to(DEVICE)
    positional_encoding = PositionalEncoding(model_dimension, dropout_rate).to(DEVICE)

    # Build the Encoder and Decoder
    encoder = Encoder(EncoderLayer(model_dimension, deepcopy(multi_head_attention), 
                                    deepcopy(feed_forward_network), dropout_rate).to(DEVICE), 
                      num_layers).to(DEVICE)

    decoder = Decoder(DecoderLayer(model_dimension, deepcopy(multi_head_attention), 
                                    deepcopy(multi_head_attention), 
                                    deepcopy(feed_forward_network), dropout_rate).to(DEVICE), 
                      num_layers).to(DEVICE)

    # Create embeddings and combine with positional encodings
    source_embedding = nn.Sequential(Embeddings(model_dimension, source_vocab_size).to(DEVICE), 
                                      deepcopy(positional_encoding))
    target_embedding = nn.Sequential(Embeddings(model_dimension, target_vocab_size).to(DEVICE), 
                                      deepcopy(positional_encoding))

    # Create the final model
    transformer_model = Transformer(encoder, decoder, source_embedding, target_embedding, 
                                     Generator(model_dimension, target_vocab_size)).to(DEVICE)

    # Initialize model parameters
    for parameter in transformer_model.parameters():
        if parameter.dim() > 1:
            nn.init.xavier_uniform_(parameter)

    return transformer_model.to(DEVICE)

In [15]:
model = create_transformer_model(source_vocab_size=src_vocab, 
                                 target_vocab_size=tgt_vocab, 
                                 num_layers=6, 
                                 model_dimension=256, 
                                 feed_forward_dimension=1024, 
                                 num_heads=8, 
                                 dropout_rate=0.1)

## 5: Loss Function

In [16]:
class LabelSmoothing(nn.Module):
    """
    Implements label smoothing to regularize the model by assigning a small amount of probability mass to 
    all other labels besides the correct one.
    
    Label smoothing helps in preventing overconfidence of the model by reducing the probability assigned to 
    the correct label and redistributing it across other classes.
    
    Attributes:
        criterion (nn.KLDivLoss): The loss function used for calculating the Kullback-Leibler divergence between the smoothed distribution and the model output.
        padding_index (int): The index in the vocabulary that represents padding, to be excluded from smoothing.
        confidence (float): The confidence assigned to the correct label (1.0 - smoothing).
        smoothing (float): The smoothing factor, which is the amount of probability mass distributed to other labels.
        vocab_size (int): The size of the vocabulary or number of possible output classes.
        true_distribution (Optional[torch.Tensor]): The smoothed true label distribution.
    """
    
    def __init__(self, vocab_size: int, padding_index: int, smoothing: float = 0.1):
        """
        Initializes the LabelSmoothing module.

        Parameters:
            vocab_size (int): The size of the output vocabulary (number of classes).
            padding_index (int): The index used for padding in the target sequences.
            smoothing (float, optional): The label smoothing factor. Defaults to 0.1.
        """
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_index = padding_index
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.vocab_size = vocab_size
        self.true_distribution = None

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """
        Forward pass to compute the loss with label smoothing.

        Parameters:
            logits (torch.Tensor): The model predictions (logits) of shape (batch_size, num_classes).
            targets (torch.Tensor): The true target labels of shape (batch_size).

        Returns:
            torch.Tensor: The computed loss value using smoothed target distribution.
        """
        assert logits.size(1) == self.vocab_size, "Logits and vocab size must match."

        # Clone logits to create a smoothed distribution
        smoothed_distribution = logits.data.clone()
        smoothed_distribution.fill_(self.smoothing / (self.vocab_size - 2))
        smoothed_distribution.scatter_(1, targets.data.unsqueeze(1), self.confidence)

        # Set smoothing for padding tokens to zero
        smoothed_distribution[:, self.padding_index] = 0
        padding_mask = torch.nonzero(targets.data == self.padding_index)

        if padding_mask.dim() > 0:
            smoothed_distribution.index_fill_(0, padding_mask.squeeze(), 0.0)

        self.true_distribution = smoothed_distribution  # Store for reference
        loss = self.criterion(logits, smoothed_distribution.clone().detach())

        return loss
    
class NoamScheduler:
    """
    Implements the learning rate scheduler with warmup.

    This learning rate scheduler is designed for models such as the Transformer, where the learning rate starts 
    small, increases linearly during a warmup phase, and then decays proportionally to the inverse square root 
    of the step number after the warmup phase.

    Attributes:
        model_dimension (int): The dimensionality of the model (e.g., d_model in the Transformer).
        factor (float): A constant scaling factor for the learning rate.
        warmup_steps (int): The number of warmup steps before the learning rate begins to decay.
        optimizer (Optimizer): The optimizer whose learning rate will be updated.
        step_count (int): The current step count in the training process.
        current_rate (float): The current learning rate after the last update.
    """

    def __init__(self, model_dimension: int, factor: float, warmup_steps: int, optimizer: Optimizer):
        """
        Initializes the NoamScheduler.

        Parameters:
            model_dimension (int): The dimensionality of the model (e.g., d_model in Transformer).
            factor (float): A scaling factor for the learning rate.
            warmup_steps (int): The number of warmup steps before learning rate decay.
            optimizer (Optimizer): The optimizer that will be updated with the learning rate.
        """
        self.optimizer = optimizer
        self.step_count = 0
        self.warmup_steps = warmup_steps
        self.factor = factor
        self.model_dimension = model_dimension
        self.current_rate = 0.0

    def step(self) -> None:
        """
        Update the learning rate and step count, then apply the optimizer step.

        This method should be called after every batch in training to update the learning rate based on the current step.
        """
        self.step_count += 1
        rate = self.compute_rate()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = rate
        self.current_rate = rate
        self.optimizer.step()

    def compute_rate(self, step: Optional[int] = None) -> float:
        """
        Compute the learning rate based on the current step or a given step.

        Parameters:
            step (Optional[int]): A specific step to compute the learning rate for. If None, uses the current step count.

        Returns:
            float: The calculated learning rate based on the Noam formula.
        """
        if step is None:
            step = self.step_count

        return self.factor * (self.model_dimension ** -0.5) * min(step ** -0.5, step * self.warmup_steps ** -1.5)
    
class LossComputer:
    """
    A simple loss computation and optimization class for training.

    This class wraps the generator, loss function (criterion), and optimizer, and provides a callable
    interface to compute the loss, backpropagate, and update the optimizer (if provided).

    Attributes:
        generator (nn.Module): The model's generator used to produce final predictions.
        criterion (nn.Module): The loss function (criterion) to compute the loss, typically cross-entropy or label smoothing.
        optimizer (Optional[Optimizer]): The optimizer used for updating model weights.
    """

    def __init__(self, generator: nn.Module, criterion: nn.Module, optimizer: Optional[Optimizer] = None):
        """
        Initializes the LossComputer.

        Parameters:
            generator (nn.Module): The generator network (usually a linear layer followed by softmax or log-softmax).
            criterion (nn.Module): The loss function to compute the error between predicted and true labels.
            optimizer (Optional[Optimizer]): An optional optimizer for updating the model parameters.
        """
        self.generator = generator
        self.criterion = criterion
        self.optimizer = optimizer

    def __call__(self, predictions: torch.Tensor, target: torch.Tensor, normalization: torch.Tensor) -> float:
        """
        Compute the loss, backpropagate, and update the optimizer (if provided).

        Parameters:
            predictions (torch.Tensor): The output predictions from the model, before applying the generator.
            target (torch.Tensor): The ground-truth target labels.
            normalization (torch.Tensor): A scalar tensor used to normalize the loss, often the number of non-padding tokens.

        Returns:
            float: The computed loss, scaled by the normalization factor.
        """
        # Pass the predictions through the generator to get final logits
        logits = self.generator(predictions)

        # Compute the loss using the criterion
        loss = self.criterion(
            logits.contiguous().view(-1, logits.size(-1)),
            target.contiguous().view(-1)
        ) / normalization

        # Backpropagate the loss
        loss.backward()

        # If an optimizer is provided, update the model's parameters
        if self.optimizer is not None:
            self.optimizer.step()
            self.optimizer.optimizer.zero_grad()

        # Return the loss value multiplied by the normalization factor
        return loss.item() * normalization

## 6: Training

In [17]:
adam_opt = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
optimizer = NoamScheduler(model_dimension=256, factor=1, warmup_steps=2000, optimizer=adam_opt)

In [18]:
criterion = LabelSmoothing(vocab_size=tgt_vocab,padding_index=0, smoothing=0.1)
loss_func = LossComputer(model.generator, criterion, optimizer)

In [19]:
for epoch in range(NEPOCH):
    model.train()
    tloss=0
    tokens=0
    for batch in batches:
        out = model(batch.src, batch.trg, 
                    batch.src_mask, batch.trg_mask)
        
        loss = loss_func(out, batch.trg_y, batch.ntokens)
        tloss += loss
        tokens += batch.ntokens
    print(f"Epoch {epoch}, average loss: {tloss/tokens:.3f}")

Epoch 0, average loss: 5.822
Epoch 1, average loss: 3.627
Epoch 2, average loss: 2.825
Epoch 3, average loss: 2.180
Epoch 4, average loss: 1.786
Epoch 5, average loss: 1.585
Epoch 6, average loss: 1.413
Epoch 7, average loss: 1.272
Epoch 8, average loss: 1.172
Epoch 9, average loss: 1.079
Epoch 10, average loss: 1.004
Epoch 11, average loss: 0.941
Epoch 12, average loss: 0.891
Epoch 13, average loss: 0.844
Epoch 14, average loss: 0.802
Epoch 15, average loss: 0.763
Epoch 16, average loss: 0.728
Epoch 17, average loss: 0.701
Epoch 18, average loss: 0.674
Epoch 19, average loss: 0.646
Epoch 20, average loss: 0.625
Epoch 21, average loss: 0.606
Epoch 22, average loss: 0.589
Epoch 23, average loss: 0.570
Epoch 24, average loss: 0.555
Epoch 25, average loss: 0.539
Epoch 26, average loss: 0.527
Epoch 27, average loss: 0.515
Epoch 28, average loss: 0.503
Epoch 29, average loss: 0.487
Epoch 30, average loss: 0.479
Epoch 31, average loss: 0.469
Epoch 32, average loss: 0.460
Epoch 33, average lo

In [20]:
os.makedirs('models', exist_ok=True)

In [21]:
torch.save(model.state_dict(), f"models/translation/model_{NEPOCH}.pth")  

## 7: Prediction

In [22]:
def tokenize_and_convert_to_indices(sentence: str, tokenizer, vocab_dict: dict, unk_token: int) -> torch.Tensor:
    """
    Tokenizes the input sentence and converts tokens to their corresponding indices.

    Args:
        sentence (str): The input sentence to tokenize and convert.
        tokenizer: The tokenizer used to split the sentence into tokens.
        vocab_dict (dict): A dictionary mapping tokens to their respective indices.
        unk_token (int): The index to use for unknown tokens.

    Returns:
        torch.Tensor: A tensor of token indices on the specified DEVICE.
    """
    tokenized_sentence = ["BOS"] + tokenizer.tokenize(sentence) + ["EOS"]
    token_indices = [vocab_dict.get(token, unk_token) for token in tokenized_sentence]
    return torch.tensor(token_indices).long().to(DEVICE).unsqueeze(0)

def create_src_mask(src_tensor: torch.Tensor) -> torch.Tensor:
    """
    Creates a source mask for the input tensor to hide padding tokens.

    Args:
        src_tensor (torch.Tensor): The source tensor for which to create the mask.

    Returns:
        torch.Tensor: A mask for the source tensor with padding tokens masked out.
    """
    return (src_tensor != 0).unsqueeze(-2)

def generate_translation(model, memory: torch.Tensor, src_mask: torch.Tensor, 
                         start_symbol: int, idx_to_token: dict) -> list:
    """
    Generates a translation for the input sequence in an autoregressive fashion.

    Args:
        model: The transformer model to use for decoding.
        memory (torch.Tensor): The encoded memory from the source sentence.
        src_mask (torch.Tensor): The mask for the source sentence.
        start_symbol (int): The start symbol in the target vocabulary.
        idx_to_token (dict): A dictionary mapping indices to tokens in the target language.

    Returns:
        list: A list of translated tokens.
    """
    ys = torch.ones(1, 1).fill_(start_symbol).long().to(DEVICE)  
    translation_tokens = []

    for _ in range(MAX_LENGTH):
        out = model.decode(memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(ys))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        if idx_to_token[next_word] == 'EOS':
            break

        ys = torch.cat([ys, torch.ones(1, 1).fill_(next_word).long().to(DEVICE)], dim=1)  
        translation_tokens.append(idx_to_token[next_word])

    return translation_tokens

def post_process_translation(tokens: list) -> str:
    """
    Post-processes the list of translated tokens into a coherent sentence.

    Args:
        tokens (list): List of tokens representing the translation.

    Returns:
        str: The final post-processed sentence.
    """
    sentence = "".join(tokens)
    sentence = sentence.replace("</w>", " ")

    punctuation = '''?:;.,'("-!&)%'''
    for punc in punctuation:
        sentence = sentence.replace(f" {punc}", f"{punc}")

    return sentence

def translate(
    sentence: str, 
    model: nn.Module, 
    tokenizer, 
    source_vocab: dict, 
    target_vocab: dict, 
    idx_to_target_token: dict
) -> str:
    """
    Translates a given English sentence into the target language using a transformer model.

    Args:
        sentence (str): The English sentence to translate.
        model (nn.Module): The transformer model used for translation.
        tokenizer: Tokenizer to convert the sentence into tokens.
        source_vocab (dict): Dictionary mapping English tokens to their corresponding indices.
        target_vocab (dict): Dictionary mapping target language tokens to their corresponding indices.
        idx_to_target_token (dict): Dictionary mapping indices back to the target language tokens.

    Returns:
        str: The translated sentence in the target language.
    """
    # Step 1: Tokenize the English sentence and convert tokens to indices
    source_tensor = tokenize_and_convert_to_indices(sentence, tokenizer, source_vocab, UNK)
    
    # Step 2: Create a source mask to mask out padding tokens
    source_mask = create_src_mask(source_tensor)
    
    # Step 3: Encode the input sentence into memory using the transformer model
    memory = model.encode(source_tensor, source_mask)
    
    # Step 4: Autoregressively generate the translated sentence in the target language
    start_symbol = target_vocab["BOS"]
    translation_tokens = generate_translation(
        model, memory, source_mask, 
        start_symbol=start_symbol, idx_to_token=idx_to_target_token
    )
    
    # Step 5: Post-process the translation tokens into a readable sentence
    translated_sentence = post_process_translation(translation_tokens)
    
    return translated_sentence.rstrip()

In [23]:
%%capture
trained_weights=torch.load(f"models/translation/model_{NEPOCH}.pth", map_location=DEVICE)
model.load_state_dict(trained_weights)
model.eval()

In [54]:
# sentence = 'he is happy.'
# sentence = 'he is not happy and he wants to go to the police.'
sentence = 'my wife wants to eat chicken.'
# sentence = "I love skiing in the winter!"

In [55]:
translate(sentence, 
          model, 
          tokenizer, 
          en_word_dict, 
          fr_word_dict, 
          fr_idx_dict)

'ma femme veut manger du poulet.'