In [80]:
# In this notebook, you learn:
#
# 1) Steps involved in the machine translation inference process. In this notebook, we use greedy search to generate translations.

In [28]:
import sys
sys.path.append('../../')

In [50]:
import datasets
import torch

from abc import ABC, abstractmethod
from model_implementation.utils.helpers import get_absolute_path
from tokenizers import ByteLevelBPETokenizer # type: ignore
from torch import nn, Tensor
from typing import Callable, Optional, List

#### The next few cells hold the preparation needed for inference. All of this content is already explained in the previous notebooks, so you can run these cells without studying them until you reach the 'main part' (marked below).

In [40]:
# Special tokens. The same four strings are registered with both the English and the Telugu tokenizer
# (see BaseTokenizer.special_tokens).
# Start of the sequence token.
START_TOKEN = "<sos>"
# End of the sequence token.
END_TOKEN = "<eos>"
# Padding token.
PAD_TOKEN = "<pad>"
# Token used to represent out-of-vocabulary words.
UNK_TOKEN = "<unk>"
# Maximum vocabulary size (passed as vocab_size when training each BPE tokenizer).
MAX_VOCAB_SIZE = 30000
# Maximum number of tokens allowed to be predicted during inference.
# NOTE(review): not referenced in the cells visible here; presumably used by the decoding loop later on.
MAX_INFERENCE_SEQ_LEN = 150
# Size of the model's hidden state. Only used to size the random tensors that stand in for the
# encoder / decoder outputs in this notebook.
D_MODEL = 10
# Path to the dataset to be used for Tokenizer training (a relative path).
TOKENIZER_DATA_PATH = "../../Data/AI4Bharat/full_en_te_dataset"

In [31]:
# This is a base class that will be inherited by the actual tokenizer classes.
class BaseTokenizer(ABC):
    """Common interface for the tokenizers used in this notebook.

    Concrete subclasses (here: BPETokenizer) implement training, tokenization and the
    conversion between text and token ids. Every method of the interface is abstract, so a
    subclass that forgets one of them fails at instantiation time instead of silently
    returning None.
    """
    def __init__(self, language: str, tokenizer_type: str):
        """Stores the language handled by this tokenizer and a short label for the tokenizer kind."""
        self.language = language
        self.tokenizer_type = tokenizer_type
        # The order matters to subclasses that register these tokens with a vocabulary.
        self.special_tokens = [START_TOKEN, END_TOKEN, PAD_TOKEN, UNK_TOKEN]

    @abstractmethod
    def initialize_tokenizer_and_build_vocab(self, 
                                             data_iterator: datasets.arrow_dataset.Dataset, 
                                             text_extractor: Callable[[dict[str, str], str], str], 
                                             max_vocab_size: Optional[int] = MAX_VOCAB_SIZE):
        """Trains / initializes the tokenizer on the text that `text_extractor` pulls out of each data point."""

    @abstractmethod
    def tokenize(self, text: str) -> list[str]:
        """Splits `text` into its string tokens."""

    @abstractmethod
    def encode(self, text: str) -> list[int]:
        """Converts `text` into a list of token ids."""

    # Previously this was the only interface method without @abstractmethod, which let a
    # subclass omit it and get `None` back from decode().
    @abstractmethod
    def decode(self, token_ids: List[int]) -> str:
        """Converts a list of token ids back into text."""

    @abstractmethod
    def get_token_id(self, token: str) -> int:
        """Returns the id of `token` in the vocabulary."""

    @abstractmethod
    def get_vocab_size(self) -> int:
        """Returns the number of entries in the vocabulary."""



class BPETokenizer(BaseTokenizer):
    """Byte-level BPE tokenizer for a single language, trained with the HuggingFace `tokenizers` library."""

    def __init__(self, language: str):
        super().__init__(language=language, tokenizer_type="bpe")

    def initialize_tokenizer_and_build_vocab(self, 
                                             data_iterator: datasets.arrow_dataset.Dataset, 
                                             text_extractor: Callable[[dict[str, str], str], str], 
                                             max_vocab_size: Optional[int] = MAX_VOCAB_SIZE):
        """Trains a fresh tokenizer on `data_iterator` and stores it on `self.tokenizer`."""
        self.max_vocab_size = max_vocab_size
        self.tokenizer = self.__train_tokenizer(
            data_iterator=data_iterator,
            text_extractor=text_extractor,
            max_vocab_size=max_vocab_size,
        )

    def tokenize(self, text: str) -> list[str]:
        """Returns the string tokens produced for `text`."""
        return self.tokenizer.encode(text).tokens

    def encode(self, text: str) -> list[int]:
        """Returns the token ids produced for `text`."""
        return self.tokenizer.encode(text).ids

    def decode(self, token_ids: List[int]) -> str:
        """Turns a list of token ids back into text."""
        decoded_text = self.tokenizer.decode(token_ids)
        return decoded_text

    def get_token_id(self, token: str) -> int:
        """Looks up the id of `token` in the trained vocabulary."""
        token_id = self.tokenizer.token_to_id(token)
        return token_id

    def get_vocab_size(self) -> int:
        """Returns the size of the trained vocabulary."""
        vocab_size = self.tokenizer.get_vocab_size()
        return vocab_size

    def __get_data_iterator(self, data_iterator: datasets.arrow_dataset.Dataset, text_extractor: Callable[[dict[str, str], str], str]):
        """Lazily yields, for every data point, the text written in this tokenizer's language."""
        yield from (
            text_extractor(data_point=data_point, language=self.language)  # type: ignore
            for data_point in data_iterator
        )

    def __train_tokenizer(self, data_iterator: datasets.arrow_dataset.Dataset, 
                          text_extractor: Callable[[dict[str, str], str], str], 
                          max_vocab_size: Optional[int]=MAX_VOCAB_SIZE) -> ByteLevelBPETokenizer:
        """Trains and returns a ByteLevelBPETokenizer with this class's special tokens registered."""
        bpe_tokenizer = ByteLevelBPETokenizer()
        # Feeding a generator to train_from_iterator avoids loading the entire dataset into memory at once.
        training_texts = self.__get_data_iterator(data_iterator=data_iterator, text_extractor=text_extractor)
        bpe_tokenizer.train_from_iterator(iterator=training_texts,
                                          vocab_size=max_vocab_size,
                                          special_tokens=self.special_tokens)
        return bpe_tokenizer

    def save_tokenizer_to_disk(self, directory_to_save: str):
        """Saves the trained tokenizer model into `directory_to_save` (resolved via get_absolute_path)."""
        target_directory = get_absolute_path(relative_path=directory_to_save)
        self.tokenizer.save_model(target_directory)

    def load_trained_tokenizer_from_disk(self, saved_tokenizer_directory: str):
        """Loads a tokenizer from the vocab.json / merges.txt files found in `saved_tokenizer_directory`."""
        source_directory = get_absolute_path(relative_path=saved_tokenizer_directory)
        vocab_file = f"{source_directory}/vocab.json"
        merges_file = f"{source_directory}/merges.txt"
        self.tokenizer = ByteLevelBPETokenizer.from_file(vocab_filename=vocab_file,
                                                         merges_filename=merges_file)
        

def text_extractor(data_point: dict[str, str], language: str) -> str:
    """Returns the sentence of `data_point` written in `language`.

    English text lives under the "src" key and Telugu text under the "tgt" key.

    Raises:
        ValueError: if `language` is neither "english" nor "telugu".
    """
    # Guard clause: reject unsupported languages before touching the data point.
    if language != "english" and language != "telugu":
        raise ValueError("Language should be either 'english' or 'telugu'.")
    column = "src" if language == "english" else "tgt"
    return data_point[column]


def construct_padding_mask(input: Tensor, pad_token_id: int) -> Tensor:
    """Builds a boolean mask that is True wherever `input` is not the padding token.

    A singleton dimension is inserted at position 1, so a [batch_size, seq_len] input gives a
    [batch_size, 1, seq_len] mask.
    """
    return input.ne(pad_token_id).unsqueeze(dim=1)


def construct_look_ahead_mask(size: int) -> Tensor:
    """Builds a [size, size] boolean mask that is True on and below the main diagonal.

    Row i is True for columns 0..i, i.e. position i may look at itself and at earlier
    positions but not at later ones.
    """
    lower_triangle = torch.tril(torch.ones(size, size, dtype=torch.uint8))
    return lower_triangle.bool()

In [32]:
# Load the English-Telugu parallel dataset from disk. Each row has a 'src' and a 'tgt' column, which
# text_extractor reads as the English and the Telugu sentence respectively.
en_te_translation_dataset = datasets.load_from_disk(TOKENIZER_DATA_PATH)
print(en_te_translation_dataset)
print(type(en_te_translation_dataset))

Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 4946035
})
<class 'datasets.arrow_dataset.Dataset'>


In [33]:
# Train the source-side (English) tokenizer. Training iterates over every row of the dataset.
english_tokenizer = BPETokenizer(language="english")
english_tokenizer.initialize_tokenizer_and_build_vocab(data_iterator=en_te_translation_dataset, text_extractor=text_extractor, max_vocab_size=MAX_VOCAB_SIZE)






In [34]:
# Train the target-side (Telugu) tokenizer on the same dataset; text_extractor picks the 'tgt' column for it.
telugu_tokenizer = BPETokenizer(language="telugu")
telugu_tokenizer.initialize_tokenizer_and_build_vocab(data_iterator=en_te_translation_dataset, text_extractor=text_extractor, max_vocab_size=MAX_VOCAB_SIZE)






#### The main part specific to this notebook starts from here.

In [11]:
# English sentences to translate. They have different lengths on purpose, so the batch needs padding.
# NOTE: the sentences are kept exactly as written (including "Learing"), because the token ids recorded
# in the outputs below correspond to these exact strings.
src_sentences = ["I am a Software Engineer at Google.", 
                 "How do I learn Machine Learing and start working on awesome ideas?", 
                 "Lets do a Masters in Data Science at good university.", 
                 "I watched The Boys tv show last week. It was awesome"]

In [41]:
# Step 1 of inference: turn every source sentence into a list of token ids.

tokenized_src_sequences: List[List[int]] = []
for sentence in src_sentences:
    sentence_token_ids = english_tokenizer.encode(sentence)
    tokenized_src_sequences.append(sentence_token_ids)
    # Round-trip check: decoding the ids should give back the original sentence.
    print(english_tokenizer.decode(sentence_token_ids))
print("-" * 150)
print(tokenized_src_sequences)

I am a Software Engineer at Google.
How do I learn Machine Learing and start working on awesome ideas?
Lets do a Masters in Data Science at good university.
I watched The Boys tv show last week. It was awesome
------------------------------------------------------------------------------------------------------------------------------------------------------
[[44, 654, 262, 18571, 13324, 396, 5356, 17], [531, 464, 341, 2337, 26973, 477, 8020, 297, 1022, 1935, 332, 16570, 7335, 34], [3442, 464, 262, 17671, 285, 15400, 4671, 396, 839, 7315, 17], [44, 9597, 1029, 9711, 86, 260, 89, 1490, 1170, 2260, 17, 4357, 358, 16570]]


In [42]:
# The special tokens are registered in the same order for both tokenizers, so their ids should be the
# same in the src vocabulary and the tgt vocabulary. This is verified at the end of the cell.
# Gets the token id for the PAD token.
pad_token_id = english_tokenizer.get_token_id(PAD_TOKEN)
print("pad token id: ", pad_token_id)
# Gets the token id for the START token.
start_token_id = telugu_tokenizer.get_token_id(START_TOKEN)
print("start token id: ", start_token_id)
# Gets the token id for the END token.
end_token_id = telugu_tokenizer.get_token_id(END_TOKEN)
print("end token id: ", end_token_id)

# token_to_id returns None for a token that is missing from the vocabulary; fail loudly here
# instead of padding / seeding the decoder with None further down.
assert None not in (pad_token_id, start_token_id, end_token_id), "A special token is missing from a vocabulary."
# Check the assumption stated at the top of the cell instead of relying on it silently.
for special_token in (START_TOKEN, END_TOKEN, PAD_TOKEN):
    assert english_tokenizer.get_token_id(special_token) == telugu_tokenizer.get_token_id(special_token), \
        f"Token id mismatch between the src and tgt vocabularies for {special_token}"

pad token id:  2
start token id:  0
end token id:  1


In [43]:
# We need to pad the input sequences to make them of the same length. This is because we need to batch
# the input sequences and pass them through the model in one go.

# Find the maximum length of the source sequences.
max_src_seq_len = max(len(seq) for seq in tokenized_src_sequences)
for src_seq in tokenized_src_sequences:
    # Pad the source sequence with pad_token_id to make it of length max_src_seq_len.
    # Re-running this cell is harmless: an already padded sequence gets zero extra tokens.
    src_seq.extend([pad_token_id] * (max_src_seq_len - len(src_seq)))

# This should print the tokenized source sequences (as seen in the above cell) with padding.
print(tokenized_src_sequences)
# Every sequence (not just the first and the last one) must now have the same length.
assert all(len(seq) == max_src_seq_len for seq in tokenized_src_sequences)

[[44, 654, 262, 18571, 13324, 396, 5356, 17, 2, 2, 2, 2, 2, 2], [531, 464, 341, 2337, 26973, 477, 8020, 297, 1022, 1935, 332, 16570, 7335, 34], [3442, 464, 262, 17671, 285, 15400, 4671, 396, 839, 7315, 17, 2, 2, 2], [44, 9597, 1029, 9711, 86, 260, 89, 1490, 1170, 2260, 17, 4357, 358, 16570]]


In [44]:
# Convert the tokenized source sequences to a tensor since the model expects tensors as input.
# All sequences were padded to the same length above, so the nested list is rectangular and becomes
# a [batch_size, max_src_seq_len] tensor.
src_batch = torch.tensor(data=tokenized_src_sequences, dtype=torch.int32)
print("shape of the source batch tensor: ", src_batch.shape)
print("src_batch: \n", src_batch)

shape of the source batch tensor:  torch.Size([4, 14])
src_batch: 
 tensor([[   44,   654,   262, 18571, 13324,   396,  5356,    17,     2,     2,
             2,     2,     2,     2],
        [  531,   464,   341,  2337, 26973,   477,  8020,   297,  1022,  1935,
           332, 16570,  7335,    34],
        [ 3442,   464,   262, 17671,   285, 15400,  4671,   396,   839,  7315,
            17,     2,     2,     2],
        [   44,  9597,  1029,  9711,    86,   260,    89,  1490,  1170,  2260,
            17,  4357,   358, 16570]], dtype=torch.int32)


In [45]:
# We need to create the attention mask for the source sequences. This is because the model should not attend to the
# padding tokens.

# The attention mask is a boolean tensor of shape [batch_size, 1, max_src_seq_len] where each element is True if the
# corresponding element in the source sequence is not a padding token.
src_mask = construct_padding_mask(input=src_batch, pad_token_id=pad_token_id)
print("shape of the source mask tensor: ", src_mask.shape)
print("src_mask: \n", src_mask)
print("-" * 150)
# The same attention mask is applied to all the heads in the multi-head attention mechanism. To account for this, we
# add a singleton dimension for the heads, giving a tensor of shape [batch_size, 1, 1, max_src_seq_len], and let
# PyTorch broadcast it to the required shape, which would be [batch_size, num_heads, max_tgt_seq_len, max_src_seq_len].
src_mask = src_mask.unsqueeze(1)
print("shape of the source mask tensor: ", src_mask.shape)
print("src_mask: \n", src_mask)

shape of the source mask tensor:  torch.Size([4, 1, 14])
src_mask: 
 tensor([[[ True,  True,  True,  True,  True,  True,  True,  True, False, False,
          False, False, False, False]],

        [[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True]],

        [[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True, False, False, False]],

        [[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True]]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of the source mask tensor:  torch.Size([4, 1, 1, 14])
src_mask: 
 tensor([[[[ True,  True,  True,  True,  True,  True,  True,  True, False, False,
           False, False, False, False]]],


        [[[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
            True, 

In [46]:
# The next step is to pass the source sequences through the encoder to get the encoder output. The encoder output is a 
# tensor of shape [batch_size, max_src_seq_len, d_model].
#
# The operation would be as follows: encoded_src = translation_model.encode(src=src_batch, src_mask=src_mask)
# Instead of using the translation_model, we will use a random tensor to simulate the encoder output.
# NOTE: no random seed is set in the cells shown here, so the printed values change on every run; only the
# shape is meaningful.
encoded_src = torch.randn(src_batch.shape[0], src_batch.shape[1], D_MODEL)
print("shape of the encoded source tensor: ", encoded_src.shape)
print("encoded_src: \n", encoded_src)

shape of the encoded source tensor:  torch.Size([4, 14, 10])
encoded_src: 
 tensor([[[-4.9424e-01,  2.2307e+00, -1.2217e+00, -1.1746e+00,  1.2302e+00,
          -4.9120e-01, -2.3567e-01, -7.6592e-01,  5.0781e-01, -2.1646e+00],
         [-6.8511e-01,  9.4651e-01,  6.3741e-01,  2.9685e-02,  2.6911e-01,
          -7.2600e-01,  5.9115e-01,  1.3728e+00, -6.4206e-01, -1.5254e+00],
         [-1.0006e+00,  1.3398e-01,  4.8169e-01, -1.0989e+00, -6.4641e-01,
           1.0005e+00, -1.6974e-01,  5.8577e-03, -1.1800e+00, -5.3509e-01],
         [ 1.6506e+00, -2.2694e+00, -1.2932e+00, -1.8767e-01, -1.1205e-01,
          -7.7379e-01,  1.1858e-01,  4.2639e-01, -6.1384e-01,  8.0423e-01],
         [-2.3534e-01,  1.9073e-01,  1.4027e+00, -3.0396e-01,  6.0874e-02,
          -4.1424e-01,  2.3196e-02, -7.5784e-01, -3.8664e-01, -7.8360e-02],
         [-1.2419e+00,  6.6320e-01,  1.3953e+00,  5.8699e-02, -1.7150e+00,
          -9.3034e-01, -2.1458e+00,  5.7369e-01, -1.2550e+00, -1.8055e+00],
         [-8.5708e

In [60]:
# Used in every decoding iteration, so it is created once here.
# dim=-1 normalizes over the last dimension, which is the vocabulary dimension of the simulated logits below.
log_softmax = nn.LogSoftmax(dim=-1)
print(log_softmax)

LogSoftmax(dim=-1)


#### Iteration 1 of the target token prediction.

In [61]:
# Next comes the decoder input, i.e. the target sequences generated so far. Nothing has been generated yet,
# so every target sequence starts as just the START_TOKEN. Predicted tokens are then appended one at a time
# until the END_TOKEN is predicted.

# One row [start_token_id] per source sentence -> shape [batch_size, 1].
tgt_decoder_input_iter_1 = torch.tensor(data=[[start_token_id]] * src_batch.size(0), dtype=torch.int32)
print("shape of the target decoder input tensor: ", tgt_decoder_input_iter_1.shape)
print("target_decoder_input: \n", tgt_decoder_input_iter_1)

shape of the target decoder input tensor:  torch.Size([4, 1])
target_decoder_input: 
 tensor([[0],
        [0],
        [0],
        [0]], dtype=torch.int32)


In [79]:
# I initially thought not to use any tgt_mask during inference since it is not strictly required. However, the results
# are looking way too different without the tgt_mask. Did not expect such a huge difference. So, I will use the tgt_mask
# during inference as well.
# The look ahead mask has shape [tgt_seq_len, tgt_seq_len]; with only the START token present this is [1, 1].
tgt_mask = construct_look_ahead_mask(size=tgt_decoder_input_iter_1.size(1))
print("shape of the target mask tensor: ", tgt_mask.shape)
print("target_mask: \n", tgt_mask)
print("-" * 150)
# The look ahead mask is same for every sequence in the batch and so, we repeat the mask for every sequence in the batch.
# Resulting shape: [batch_size, tgt_seq_len, tgt_seq_len].
tgt_mask = tgt_mask.unsqueeze(0).repeat(tgt_decoder_input_iter_1.size(0), 1, 1)
print("shape of the target mask tensor: ", tgt_mask.shape)
print("target_mask: \n", tgt_mask)
print("-" * 150)
# The same attention mask is applied to all the heads in the multi-head attention mechanism. To account for this, we
# add a singleton dimension for the heads, giving a tensor of shape [batch_size, 1, tgt_seq_len, tgt_seq_len], and let
# PyTorch broadcast it to the required shape, which would be [batch_size, num_heads, tgt_seq_len, tgt_seq_len].
tgt_mask = tgt_mask.unsqueeze(1)
print("shape of the target mask tensor: ", tgt_mask.shape)
print("target_mask: \n", tgt_mask)

shape of the target mask tensor:  torch.Size([1, 1])
target_mask: 
 tensor([[True]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of the target mask tensor:  torch.Size([4, 1, 1])
target_mask: 
 tensor([[[True]],

        [[True]],

        [[True]],

        [[True]]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of the target mask tensor:  torch.Size([4, 1, 1, 1])
target_mask: 
 tensor([[[[True]]],


        [[[True]]],


        [[[True]]],


        [[[True]]]])


In [63]:
# The next step is to pass the target sequences through the decoder to get the decoder output. The decoder output is a
# tensor of shape [batch_size, tgt_seq_len, d_model]. Instead of using the translation_model, we will use a random tensor
# to simulate the decoder output.
# This cell is mainly for understanding purposes: the next cell only reads the shape of this tensor (to size the
# simulated token prediction output), not its values.
tgt_decoder_output_iter_1 = torch.randn(tgt_decoder_input_iter_1.size(0), tgt_decoder_input_iter_1.size(1), D_MODEL)
print("shape of the target decoder output tensor: ", tgt_decoder_output_iter_1.shape)
print("target_decoder_output: \n", tgt_decoder_output_iter_1)

shape of the target decoder output tensor:  torch.Size([4, 1, 10])
target_decoder_output: 
 tensor([[[-0.3742,  0.2313, -1.1678,  1.9597,  0.8790, -0.9169, -1.1343,
           0.0150,  0.1704,  0.0639]],

        [[ 0.0252, -1.4017,  0.4027,  0.5777, -0.6343, -0.2286, -0.4737,
           1.9160, -1.6930,  0.9677]],

        [[ 0.4050, -1.0697, -2.0056,  0.9286, -1.3345,  0.2843,  1.3475,
           0.8535,  1.4111, -0.9893]],

        [[-0.3084, -0.3953, -0.0435, -0.8055,  0.2498,  0.3684,  0.6436,
           1.4065,  0.4810,  0.1747]]])


In [64]:
# The next step is to convert the decoder output to a log-probability distribution over the target vocabulary. This is
# done using the token prediction layer in the translation model. Instead of using the translation_model, we will use a
# random tensor of shape [batch_size, tgt_seq_len, tgt_vocab_size] to simulate the raw scores and apply log_softmax
# over the vocabulary dimension to turn them into log probabilities.
random_tensor_iter_1 = torch.randn(tgt_decoder_output_iter_1.size(0), tgt_decoder_output_iter_1.size(1), telugu_tokenizer.get_vocab_size())
predicted_log_probs_iter_1 = log_softmax(random_tensor_iter_1)
print("shape of the predicted log probabilities tensor: ", predicted_log_probs_iter_1.shape)
print("predicted_log_probs: \n", predicted_log_probs_iter_1)

shape of the predicted log probabilities tensor:  torch.Size([4, 1, 30000])
predicted_log_probs: 
 tensor([[[-11.7448,  -9.3692, -10.9705,  ...,  -9.2906, -11.1848, -11.3099]],

        [[-12.7606,  -7.4802, -11.5967,  ..., -10.2324, -10.0172,  -9.8340]],

        [[ -9.1747,  -8.3682, -10.5096,  ..., -11.9631, -11.8825, -10.6328]],

        [[-10.1766,  -9.3968, -10.7139,  ..., -13.2947, -11.3629, -10.3721]]])


In [65]:
# In this step, we will have to extract the token with maximum probability from the predicted_log_probs tensor. This token
# will be appended to the target sequences. This process will be repeated until the END_TOKEN is predicted or the maximum
# number of tokens is reached.
# 
# We only care about the last position of the predicted_log_probs tensor, since it holds the prediction for the next
# token. Indexing with -1 along the sequence dimension drops that dimension: [batch_size, tgt_seq_len, vocab_size]
# becomes [batch_size, vocab_size].
predicted_last_tok_log_probs_iter_1 = predicted_log_probs_iter_1[:, -1, :]
print("shape of the predicted last token log probabilities tensor: ", predicted_last_tok_log_probs_iter_1.shape)
print("predicted_last_tok_log_probs: \n", predicted_last_tok_log_probs_iter_1)

shape of the predicted last token log probabilities tensor:  torch.Size([4, 30000])
predicted_last_tok_log_probs: 
 tensor([[-11.7448,  -9.3692, -10.9705,  ...,  -9.2906, -11.1848, -11.3099],
        [-12.7606,  -7.4802, -11.5967,  ..., -10.2324, -10.0172,  -9.8340],
        [ -9.1747,  -8.3682, -10.5096,  ..., -11.9631, -11.8825, -10.6328],
        [-10.1766,  -9.3968, -10.7139,  ..., -13.2947, -11.3629, -10.3721]])


In [66]:
# Greedy step: for every sequence, pick the token with the maximum log probability. `max` along the vocabulary
# dimension returns both the maximum values and their indices; the indices are the predicted token ids.
# keepdim=True keeps the shape as [batch_size, 1] so the result can be concatenated to the decoder input.
# Note that max_probs_iter_1 holds log probabilities (hence the negative values), not probabilities.
max_probs_iter_1, predicted_tokens_iter_1 = predicted_last_tok_log_probs_iter_1.max(dim=1, keepdim=True)
print("shape of the predicted tokens tensor: ", predicted_tokens_iter_1.shape)
print("predicted_tokens: \n", predicted_tokens_iter_1)
print("-" * 150)
print("shape of the max probabilities tensor: ", max_probs_iter_1.shape)
print("max_probs: \n", max_probs_iter_1)

shape of the predicted tokens tensor:  torch.Size([4, 1])
predicted_tokens: 
 tensor([[ 1379],
        [13490],
        [ 6536],
        [27987]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of the max probabilities tensor:  torch.Size([4, 1])
max_probs: 
 tensor([[-6.6468],
        [-7.1067],
        [-7.0000],
        [-6.5555]])


In [67]:
# Append the predicted token to each target sequence: [batch_size, tgt_seq_len] -> [batch_size, tgt_seq_len + 1].
# This tensor is the decoder input for the next iteration.
# NOTE(review): tgt_decoder_input_iter_1 was created as int32, while the printed result carries no dtype suffix,
# which suggests the concatenation was promoted to the dtype of the predicted indices — confirm this is intended.
updated_tgt_batch_iter_1 = torch.cat([tgt_decoder_input_iter_1, predicted_tokens_iter_1], dim=-1)
print("shape of the updated target batch tensor: ", updated_tgt_batch_iter_1.shape)
print("updated_target_batch: \n", updated_tgt_batch_iter_1)

shape of the updated target batch tensor:  torch.Size([4, 2])
updated_target_batch: 
 tensor([[    0,  1379],
        [    0, 13490],
        [    0,  6536],
        [    0, 27987]])


#### Iteration 2 of the target token prediction.

In [69]:
# Let's not go through all the steps again. Instead, let's see which additional steps are needed when the
# predicted token is the end-of-sentence token.

In [71]:
# Based on the updated_tgt_batch_iter_1, let's create the updated_tgt_batch_iter_2 tensor manually.
# The "predictions" of this iteration are hand-picked: the third sequence is made to predict the END token
# (end_token_id is used instead of a hard-coded id so this stays correct if the vocabulary changes).
random_token_ids_iter_2 = torch.tensor(data=[[35], [567], [end_token_id], [7684]], dtype=torch.int32)
# torch.cat returns a new tensor, so updated_tgt_batch_iter_1 is left untouched without cloning it first.
updated_tgt_batch_iter_2 = torch.cat([updated_tgt_batch_iter_1, random_token_ids_iter_2], dim=-1)
print("shape of the updated target batch tensor: ", updated_tgt_batch_iter_2.shape)
print("updated_target_batch: \n", updated_tgt_batch_iter_2)

shape of the updated target batch tensor:  torch.Size([4, 3])
updated_target_batch: 
 tensor([[    0,  1379,    35],
        [    0, 13490,   567],
        [    0,  6536,     1],
        [    0, 27987,  7684]])


In [None]:
# Now, the next set of token predictions should only run