In [1]:
# This notebook is more of a debugging tool to check the correctness of the model than an informational notebook.
# I have blocks of code that helped me identify specific issues in the model. I am keeping this notebook so that
# it can be used as a reference when trying to debug issues with Deep Learning models in the future.
#
# In this notebook, we do the following:
# 
# 1) We will try to count (manually) the number of parameters in the Machine Translation model that we built in 
#    the .py files. The count obtained from the built modules should be the same as the manual count. Since we 
#    will train the model using the code written in the .py files, we need to be sure that the model is built 
#    correctly. This is a good way to check that. 
# 2) We will try to create and run the Data Loaders for the Machine Translation model.
# 3) I have realized that the Telugu tokenizer is dividing the sentences into too many tokens. This made me set
#    the vocabulary size for Telugu to a higher value (50000). This still doesn't seem to solve the problem very
#    well, but it's an improvement. This also made me realize that we want to control the vocabulary sizes for 
#    English and Telugu separately.
# 
# We did the parameter counting exercise before in the 'step_15_machine_translation_model.ipynb' notebook 
# (LINK TO THE NOTEBOOK) with a smaller model using dummy parameters. This time, we will do the same with 
# the actual model that we built in the .py files.

## RUN THE BELOW CELL THAT SETS THE PATH BEFORE RUNNING ANY IMPORTS IN THIS NOTEBOOK

In [1]:
import sys

# Add the parent directory to the module search path so that the `model_implementation` imports in the
# following cells can be resolved. The membership check keeps the cell idempotent: re-running it does not
# append the same entry again.
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
from model_implementation.model_building.machine_translation_model import MachineTranslationModel
from model_implementation.utils.constants import ( 
    DROPOUT_PROB, D_FEED_FORWARD, D_MODEL, MAX_INPUT_SEQUENCE_LENGTH, NUM_HEADS, NUM_LAYERS
)

In [4]:
# Vocabulary sizes passed to the model. They depend on the trained tokenizers; the values below are the
# defaults being used for now.
SRC_VOCAB_SIZE = 30_000
TGT_VOCAB_SIZE = 30_000

### Counting Model Parameters

In [None]:
# Identified the following issues:
#
# 1) This block helped me find the bugs associated with the layer connections in the model. I wasn't copying
#    some of the MultiHeadAttention layers properly when passing them to EncoderLayer and DecoderLayer 
#    classes.

In [5]:
# Build the translation model from the hyper-parameters imported from constants.py and the vocabulary
# sizes set in this notebook.
translation_model = MachineTranslationModel(
    d_model=D_MODEL,
    d_feed_forward=D_FEED_FORWARD,
    dropout_prob=DROPOUT_PROB,
    num_heads=NUM_HEADS,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    max_seq_len=MAX_INPUT_SEQUENCE_LENGTH,
)

In [6]:
# Find out the number of parameters in the built model: overall, and split by whether the parameter
# requires gradients (i.e. is trainable) or not.
all_parameters = list(translation_model.parameters())

total_params = sum(parameter.numel() for parameter in all_parameters)
print("total_params: ", total_params)

total_params_with_grad = sum(
    parameter.numel() for parameter in all_parameters if parameter.requires_grad
)
print("total_params_with_grad: ", total_params_with_grad)

total_params_without_grad = sum(
    parameter.numel() for parameter in all_parameters if not parameter.requires_grad
)
print("total_params_without_grad: ", total_params_without_grad)

# The two groups are disjoint and together cover every parameter, so they must add up to the total.
assert total_params == total_params_with_grad + total_params_without_grad

total_params:  90250544
total_params_with_grad:  90250544
total_params_without_grad:  0


In [7]:
# Prints the name of every parameter tensor registered in the model together with the number of elements
# it holds. This makes it easy to compare individual layers against the manual count done further below.
for name, params in translation_model.named_parameters():
    print(name, " ", params.numel())

src_embedding.look_up_table.weight   15360000
tgt_embedding.look_up_table.weight   15360000
encoder.encoder_layers.0.self_attention.linear_layers.0.weight   262144
encoder.encoder_layers.0.self_attention.linear_layers.0.bias   512
encoder.encoder_layers.0.self_attention.linear_layers.1.weight   262144
encoder.encoder_layers.0.self_attention.linear_layers.1.bias   512
encoder.encoder_layers.0.self_attention.linear_layers.2.weight   262144
encoder.encoder_layers.0.self_attention.linear_layers.2.bias   512
encoder.encoder_layers.0.self_attention.linear_layers.3.weight   262144
encoder.encoder_layers.0.self_attention.linear_layers.3.bias   512
encoder.encoder_layers.0.feed_forward.linear_layer_1.weight   1048576
encoder.encoder_layers.0.feed_forward.linear_layer_1.bias   2048
encoder.encoder_layers.0.feed_forward.linear_layer_2.weight   1048576
encoder.encoder_layers.0.feed_forward.linear_layer_2.bias   512
encoder.encoder_layers.0.sublayer_wrappers.0.layer_norm.weight   512
encoder.encode

In [8]:
# This cell follows the one used in the 'step_15_machine_translation_model.ipynb' notebook (LINK TO THE
# NOTEBOOK).
#
# Lets count the number of parameters in the model manually by going through each component and counting
# its parameters. The concrete numbers quoted in the comments assume D_MODEL=512, D_FEED_FORWARD=2048,
# NUM_LAYERS=6 and vocabulary sizes of 30000 (SRC_VOCAB_SIZE / TGT_VOCAB_SIZE as set in this notebook).

# ------------------------------------------------ Embeddings ------------------------------------------------
# The Embeddings class for src sentences holds 1 embedding vector per token in the source vocabulary. We
# have 30000 tokens and each token is represented by a 512-dimensional vector, so the number of parameters
# is 30000 * 512 = 15360000.
num_src_embedding_params = SRC_VOCAB_SIZE * D_MODEL
# The same reasoning applies to the Embeddings class for tgt sentences: 30000 * 512 = 15360000.
num_tgt_embedding_params = TGT_VOCAB_SIZE * D_MODEL
# There are no parameters associated with the PositionalEncoding class. The encodings are calculated from a
# predefined formula and are not learned during training. (Kept for completeness; it adds nothing to the
# total.)
num_positional_encoding_params = 0

# ------------------------------------------------- Encoder --------------------------------------------------
# The 'Encoder' class has 6 identical EncoderLayers stacked on top of each other. Each EncoderLayer has a
# MultiHeadedAttention and a FeedForwardNN as its children.
#
# Each MultiHeadedAttention has 4 linear layers (query, key, value and output). A single linear layer
# computes the queries (or keys / values / outputs) for all the heads at once, so nothing has to be counted
# per head. Every one of these layers maps a 512-dimensional vector (d_model) to a 512-dimensional vector
# (d_model): 512 * 512 = 262144 weights plus 512 bias terms = 262656 parameters.
num_encoder_query_params = D_MODEL * D_MODEL + D_MODEL
num_encoder_key_params = D_MODEL * D_MODEL + D_MODEL
num_encoder_value_params = D_MODEL * D_MODEL + D_MODEL
num_encoder_attention_output_params = D_MODEL * D_MODEL + D_MODEL
# FeedForward neural network of the EncoderLayer:
#   * linear layer 1 expands d_model -> d_feed_forward: 512 * 2048 = 1048576 weights + 2048 bias terms
#     = 1050624 parameters.
#   * linear layer 2 compresses d_feed_forward -> d_model: 2048 * 512 = 1048576 weights + 512 bias terms
#     = 1049088 parameters.
num_encoder_feed_forward_linear_layer_1_params = D_MODEL * D_FEED_FORWARD + D_FEED_FORWARD
num_encoder_feed_forward_linear_layer_2_params = D_FEED_FORWARD * D_MODEL + D_MODEL
# The outputs of MultiHeadedAttention and of the FeedForward network are normalized with Layer
# Normalization, which is applied along the last dimension of its input. Each feature is scaled and shifted
# independently with learned parameters, so a Layer Normalization layer has 2 parameters per feature
# (gamma and beta). Both outputs have 512 features (d_model), hence 512 (gamma) + 512 (beta) = 1024
# parameters for each of the two Layer Normalization layers.
num_encoder_attention_layer_norm_params = D_MODEL + D_MODEL
num_encoder_feed_forward_layer_norm_params = D_MODEL + D_MODEL
# The total number of parameters in a single EncoderLayer is the sum of the above 8 variables = 3152384.
num_encoder_layer_params = (
    num_encoder_query_params
    + num_encoder_key_params
    + num_encoder_value_params
    + num_encoder_attention_output_params
    + num_encoder_feed_forward_linear_layer_1_params
    + num_encoder_feed_forward_linear_layer_2_params
    + num_encoder_attention_layer_norm_params
    + num_encoder_feed_forward_layer_norm_params
)
# Layer Normalization is also applied to the output of the last EncoderLayer, and that is what the Encoder
# returns. It has the same size as the other Layer Normalization layers: 512 (gamma) + 512 (beta) = 1024.
num_encoder_layer_layer_norm_params = D_MODEL + D_MODEL
# With 6 EncoderLayers stacked on top of each other, the Encoder has (6 * 3152384) + 1024 = 18915328
# parameters.
num_total_encoder_params = (NUM_LAYERS * num_encoder_layer_params) + num_encoder_layer_layer_norm_params

# ------------------------------------------------- Decoder --------------------------------------------------
# The DecoderLayer is counted the same way as the EncoderLayer. It just contains 1 additional
# MultiHeadedAttention layer (source attention) and 1 additional Layer Normalization layer that goes with it.
#
# Self attention. Same as in EncoderLayer ==> 262144 (weights) + 512 (bias) = 262656 per linear layer.
num_decoder_self_attention_query_params = D_MODEL * D_MODEL + D_MODEL
num_decoder_self_attention_key_params = D_MODEL * D_MODEL + D_MODEL
num_decoder_self_attention_value_params = D_MODEL * D_MODEL + D_MODEL
num_decoder_self_attention_output_params = D_MODEL * D_MODEL + D_MODEL
# Source attention (the additional MultiHeadedAttention layer of the DecoderLayer).
# Same as in EncoderLayer ==> 262144 (weights) + 512 (bias) = 262656 per linear layer.
num_decoder_src_attention_query_params = D_MODEL * D_MODEL + D_MODEL
num_decoder_src_attention_key_params = D_MODEL * D_MODEL + D_MODEL
num_decoder_src_attention_value_params = D_MODEL * D_MODEL + D_MODEL
num_decoder_src_attention_output_params = D_MODEL * D_MODEL + D_MODEL
# The FeedForward neural network is exactly the same as in EncoderLayer.
# Linear layer 1 ==> 512 * 2048 + 2048 = 1050624
num_decoder_feed_forward_linear_layer_1_params = D_MODEL * D_FEED_FORWARD + D_FEED_FORWARD
# Linear layer 2 ==> 2048 * 512 + 512 = 1049088
num_decoder_feed_forward_linear_layer_2_params = D_FEED_FORWARD * D_MODEL + D_MODEL
# Three Layer Normalization layers (self attention, source attention, feed forward), each with the same
# size as in EncoderLayer ==> 512 (gamma) + 512 (beta) = 1024.
num_decoder_self_attention_layer_norm_params = D_MODEL + D_MODEL
num_decoder_src_attention_layer_norm_params = D_MODEL + D_MODEL
num_decoder_feed_forward_layer_norm_params = D_MODEL + D_MODEL
# The total number of parameters in a single DecoderLayer is the sum of the above 13 variables = 4204032.
num_decoder_layer_params = (
    num_decoder_self_attention_query_params
    + num_decoder_self_attention_key_params
    + num_decoder_self_attention_value_params
    + num_decoder_self_attention_output_params
    + num_decoder_src_attention_query_params
    + num_decoder_src_attention_key_params
    + num_decoder_src_attention_value_params
    + num_decoder_src_attention_output_params
    + num_decoder_feed_forward_linear_layer_1_params
    + num_decoder_feed_forward_linear_layer_2_params
    + num_decoder_self_attention_layer_norm_params
    + num_decoder_src_attention_layer_norm_params
    + num_decoder_feed_forward_layer_norm_params
)
# Layer Normalization is also applied to the output of the last DecoderLayer, and that is what the Decoder
# returns. Again 512 (gamma) + 512 (beta) = 1024 parameters.
num_decoder_layer_layer_norm_params = D_MODEL + D_MODEL
# With 6 DecoderLayers stacked on top of each other, the Decoder has (6 * 4204032) + 1024 = 25225216
# parameters.
num_total_decoder_params = (NUM_LAYERS * num_decoder_layer_params) + num_decoder_layer_layer_norm_params

# ---------------------------------------------- TokenPredictor ----------------------------------------------
# The output of the decoder is passed to a linear layer that projects it to the target vocabulary space
# (the 'TokenPredictor' layer of the transformer). Its inputs are 512-dimensional vectors (d_model) and its
# outputs are 30000-dimensional vectors (tgt_vocab_size), so it has 512 * 30000 (weights) + 30000 (bias)
# = 15390000 parameters.
num_vocab_projection_params = D_MODEL * TGT_VOCAB_SIZE + TGT_VOCAB_SIZE

# --------------------------------------------------- Total --------------------------------------------------
# The total number of parameters in the model is the sum of the parameters of both Embeddings, the Encoder,
# the Decoder and the TokenPredictor:
# 15360000 + 15360000 + 18915328 + 25225216 + 15390000 = 90250544.
num_total_model_params = (
    num_src_embedding_params
    + num_tgt_embedding_params
    + num_total_encoder_params
    + num_total_decoder_params
    + num_vocab_projection_params
)
print("Total Number of parameters associated with the model: ", num_total_model_params)


Total Number of parameters associated with the model:  90250544


In [9]:
# Verify that the number of trainable parameters calculated manually is the same as the number of trainable
# parameters reported by PyTorch. The message shows both counts so that a mismatch can be diagnosed directly
# from the failure.
assert total_params_with_grad == num_total_model_params, (
    f"Parameter count mismatch: PyTorch reports {total_params_with_grad} trainable parameters, "
    f"but the manual count is {num_total_model_params}."
)

### Loading data from disk and creating DataLoader

In [None]:
# This block helped me identify the following issues:
#
# 1) The tokenizer and the vocabulary created in the 'step_2_training_bpe_tokenizer.ipynb' notebook has '<pad>' 
#    token mapped to token id 3. However, it is supposed to be mapped to 2 in the model created in these '.py' 
#    files.
# 2) I set the limit of MAX_SEQUENCE_LENGTH to 150 in the 'constants.py' file. However, by running the 
#    data loader creation here, I realized the batch lengths are exceeding 150 which would have created issues
#    with Positional Encoding. I have later increased the MAX_SEQUENCE_LENGTH to 200 in the 'constants.py' 
#    file. I also fixed this issue by skipping the batches with length greater than MAX_SEQUENCE_LENGTH in
#    the 'model_trainer.py' file.

In [11]:
from model_implementation.data_processing.data_preparation.dataset_wrapper import DatasetWrapper
from model_implementation.data_processing.data_preparation.data_helpers import get_tokenizers, load_data_from_disk
from model_implementation.data_processing.data_preparation.data_loader import create_data_loader
from model_implementation.utils.constants import BATCH_SIZE, DEBUG_DATASET_PATH, FULL_EN_TE_DATASET_PATH

import datasets

In [12]:
# Load the train dataset (stored under DEBUG_DATASET_PATH) from disk.
train_dataset: datasets.arrow_dataset.Dataset = load_data_from_disk(
    dataset_relative_path=DEBUG_DATASET_PATH
)

# The hugging face dataset is wrapped in a pytorch Dataset so that it can be fed to a pytorch DataLoader.
translation_dataset = DatasetWrapper(hf_dataset=train_dataset)

# Tokenizers for English and Telugu. Note that these are looked up via the full dataset path, not the
# debug one.
english_tokenizer, telugu_tokenizer = get_tokenizers(
    dataset_relative_path=FULL_EN_TE_DATASET_PATH,
    tokenizer_type="bpe",
)

In [13]:
# Print every source / target sentence pair of the wrapped dataset, separated by a dashed line, to eyeball
# that the pairs are loaded correctly.
for data in translation_dataset:
    src_sentences, tgt_sentences = data['src'], data['tgt']
    print("src_sentence: ", src_sentences)
    print("tgt_sentence: ", tgt_sentences)
    print("-" * 150)

src_sentence:  Have you heard about Foie gras?
tgt_sentence:  ఇక ఫ్రూట్ ఫ్లైస్ గురించి మీరు విన్నారా?
------------------------------------------------------------------------------------------------------------------------------------------------------
src_sentence:  I never thought of acting in films.
tgt_sentence:  సూర్య సినిమాల్లో నటించాలని ఎప్పుడూ అనుకోలేదు.
------------------------------------------------------------------------------------------------------------------------------------------------------
src_sentence:  Installed Software
tgt_sentence:  స్థాపించబడిన సాఫ్ట్‍వేర్
------------------------------------------------------------------------------------------------------------------------------------------------------
src_sentence:  A case has been registered under Sections 302 and 376, IPC.
tgt_sentence:  నిందితులపై సెక్షన్ 376 మరియు 302ల కింద కేసు నమోదు చేశాం.
----------------------------------------------------------------------------------------------------------------

In [14]:
# Encode one English sentence here and one Telugu sentence on the next line, and print the resulting token
# ids. Comparing the lengths of the two printed lists shows how many more tokens the Telugu tokenizer
# produces for a sentence.
print(english_tokenizer.encode(text="Chief Minister YS Jagan Mohan Reddy clearly stated that the building was constructed in blatant violation of all laws and regulations, hence it should be demolished."))
print(telugu_tokenizer.encode(text="ఇది నిబంధ‌న‌ల‌కు విరుద్ధంగా నిర్మించిన భ‌వ‌న‌మ‌నీ, అక్ర‌మ క‌ట్ట‌డాల తొల‌గింపు ఇక్క‌డి నుంచీ ప్రారంభం అవుతుందంటూ ముఖ్య‌మంత్రి జ‌గ‌న్మోహ‌న్ రెడ్డి చెప్ప‌డం, ఆయ‌న ఆదేశాల‌కు అనుగుణంగా కూల్చేయ‌డం కూడా జ‌రిగిపోయింది."))

[3130, 508, 2861, 2560, 2383, 812, 7625, 3099, 361, 265, 3103, 358, 5666, 285, 1126, 279, 443, 10665, 289, 479, 4057, 297, 11564, 15, 11015, 371, 755, 306, 20023, 17]
[400, 264, 299, 264, 309, 269, 308, 292, 268, 292, 270, 292, 271, 266, 291, 264, 267, 266, 273, 263, 308, 269, 284, 265, 299, 264, 267, 263, 281, 294, 286, 264, 268, 331, 292, 280, 292, 268, 292, 281, 292, 268, 439, 512, 263, 267, 292, 281, 289, 292, 277, 263, 277, 292, 282, 265, 270, 304, 310, 270, 292, 284, 294, 279, 266, 663, 263, 271, 292, 282, 264, 299, 312, 286, 283, 285, 263, 267, 265, 267, 269, 318, 269, 376, 266, 272, 312, 273, 269, 277, 298, 293, 266, 343, 263, 274, 292, 281, 269, 272, 263, 267, 264, 323, 292, 284, 292, 268, 263, 281, 275, 313, 292, 268, 263, 307, 287, 282, 263, 282, 264, 296, 287, 279, 263, 279, 292, 282, 451, 766, 292, 268, 562, 276, 306, 265, 270, 292, 271, 266, 330, 266, 284, 266, 315, 269, 284, 265, 289, 298, 270, 263, 286, 276, 274, 292, 282, 269, 289, 298, 282, 265, 323, 292, 267, 264, 28

In [15]:
# Build the DataLoader that serves the training data. num_workers=0 is passed explicitly here.
train_dataloader = create_data_loader(
    dataset=translation_dataset,
    english_tokenizer=english_tokenizer,
    telugu_tokenizer=telugu_tokenizer,
    num_workers=0,
    batch_size=BATCH_SIZE,
)

In [16]:
# Everything seems alright with the data loader.
# Iterate over all the batches and print the shape and contents of the source and target tensors so that
# the padding and the batch grouping can be inspected by eye.
for src_batch, tgt_batch in train_dataloader:
    print("shape of src_batch: ", src_batch.shape)
    print("src_batch: ", src_batch)
    print("shape of tgt_batch: ", tgt_batch.shape)
    print("tgt_batch: ", tgt_batch)
    print("-" * 150)

shape of src_batch:  torch.Size([64, 16])
src_batch:  tensor([[  674,   550,   464,  ...,     2,     2,     2],
        [ 8347,  1283,   285,  ...,     2,     2,     2],
        [  298,  1651,  1086,  ...,     2,     2,     2],
        ...,
        [  417,   303,  1507,  ...,    17,     2,     2],
        [11850,   777,  5920,  ...,   693,    17,     2],
        [  392,   357,   436,  ...,     2,     2,     2]])
shape of tgt_batch:  torch.Size([64, 63])
tgt_batch:  tensor([[  0, 342, 456,  ...,   2,   2,   2],
        [  0, 272, 408,  ...,   2,   2,   2],
        [  0, 333, 845,  ...,   2,   2,   2],
        ...,
        [  0, 333, 319,  ...,   2,   2,   2],
        [  0, 279, 263,  ..., 273, 301,   1],
        [  0, 379, 307,  ..., 297,   1,   2]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of src_batch:  torch.Size([64, 7])
src_batch:  tensor([[ 6758,  4086, 18571,     2,

- ### Find the number of input pairs getting lost because of the maximum sequence length condition
- ### Quick peek at the src and tgt batches to confirm they seem to be grouped and created correctly

In [17]:
from model_implementation.data_processing.data_preparation.dataset_wrapper import DatasetWrapper
from model_implementation.data_processing.data_preparation.data_helpers import get_tokenizers, load_data_from_disk
from model_implementation.data_processing.data_preparation.data_loader import create_data_loader
from model_implementation.utils.constants import BATCH_SIZE, FULL_EN_TE_DATASET_PATH, TRAIN_DATASET_PATH, VALIDATION_DATASET_PATH

import datasets

In [18]:
# Sequence-length threshold (in tokens) used by the checks in the cells below.
# NOTE(review): the saved output of the threshold-count cell further down ("9 4" for spacy) matches the
# "sentences over 200 tokens" rows of the results table rather than the "over 150" rows, so that output was
# presumably produced with a different value than the 150 set here -- re-run the notebook to confirm.
MAX_NUM_OF_TOKENS_ALLOWED = 150

In [19]:
# Train split: load it from disk and wrap the hugging face dataset in a pytorch Dataset so that it can be
# used with a pytorch DataLoader.
train_dataset: datasets.arrow_dataset.Dataset = load_data_from_disk(
    dataset_relative_path=TRAIN_DATASET_PATH
)
translation_dataset_check_lost_pairs = DatasetWrapper(hf_dataset=train_dataset)

# Validation split: same two steps.
validation_dataset: datasets.arrow_dataset.Dataset = load_data_from_disk(
    dataset_relative_path=VALIDATION_DATASET_PATH
)
validation_dataset_check_lost_pairs = DatasetWrapper(hf_dataset=validation_dataset)

In [20]:
# BPE tokenizers for the English and Telugu languages (no retraining requested, vocabularies capped at
# 30000 entries each).
bpe_english_tokenizer, bpe_telugu_tokenizer = get_tokenizers(
    dataset_relative_path=FULL_EN_TE_DATASET_PATH,
    tokenizer_type="bpe",
    retrain_tokenizers=False,
    max_en_vocab_size=30000,
    max_te_vocab_size=30000,
)

In [22]:
# Spacy tokenizers for the English and Telugu languages (vocabularies capped at 30000 entries each).
# NOTE(review): retrain_tokenizers=True here, unlike the BPE call -- presumably this retrains on every run
# of the cell; confirm that this is intended.
spacy_english_tokenizer, spacy_telugu_tokenizer = get_tokenizers(
    dataset_relative_path=FULL_EN_TE_DATASET_PATH,
    tokenizer_type="spacy",
    retrain_tokenizers=True,
    max_en_vocab_size=30000,
    max_te_vocab_size=30000,
)

In [23]:
# DataLoader over the training data, tokenized with the BPE tokenizers.
train_dataloader_check_lost_pairs = create_data_loader(
    dataset=translation_dataset_check_lost_pairs,
    english_tokenizer=bpe_english_tokenizer,
    telugu_tokenizer=bpe_telugu_tokenizer,
    num_workers=0,
    batch_size=BATCH_SIZE,
)

# DataLoader over the validation data, built with exactly the same settings.
validation_dataloader_check_lost_pairs = create_data_loader(
    dataset=validation_dataset_check_lost_pairs,
    english_tokenizer=bpe_english_tokenizer,
    telugu_tokenizer=bpe_telugu_tokenizer,
    num_workers=0,
    batch_size=BATCH_SIZE,
)


Verify the number of batches getting filtered because of the maximum sequence length allowed.

In [24]:
# Walk over the validation batches and record which of them are longer than MAX_NUM_OF_TOKENS_ALLOWED on
# the English (src) side, on the Telugu (tgt) side, or on both ("common"). A batch that is too long is
# skipped as a whole, so every sentence pair inside it counts as lost.
num_english_bad_batches = 0
num_telugu_bad_batches = 0
num_common_bad_batches = 0
total_num_of_batches = 0

num_lost_english_translation_pairs = 0
num_lost_telugu_translation_pairs = 0
num_lost_common_translation_pairs = 0

for src_batch, tgt_batch in validation_dataloader_check_lost_pairs:
    total_num_of_batches += 1
    # Dimension 0 is the number of sentence pairs in the batch, dimension 1 the (padded) sequence length.
    # The actual batch size is used instead of BATCH_SIZE so that a smaller (e.g. final) batch is not
    # over-counted.
    num_pairs_in_batch = src_batch.size(0)
    is_english_bad_batch = src_batch.size(1) > MAX_NUM_OF_TOKENS_ALLOWED
    is_telugu_bad_batch = tgt_batch.size(1) > MAX_NUM_OF_TOKENS_ALLOWED

    if is_english_bad_batch:
        num_english_bad_batches += 1
        num_lost_english_translation_pairs += num_pairs_in_batch
    if is_telugu_bad_batch:
        num_telugu_bad_batches += 1
        num_lost_telugu_translation_pairs += num_pairs_in_batch
    if is_english_bad_batch and is_telugu_bad_batch:
        num_common_bad_batches += 1
        num_lost_common_translation_pairs += num_pairs_in_batch

In [25]:
# Summary of the batches (and the sentence pairs inside them) that exceed MAX_NUM_OF_TOKENS_ALLOWED.
print("Number of english bad batches = ", num_english_bad_batches)
print("Number of telugu bad batches = ", num_telugu_bad_batches)
print("Number of common bad batches = ", num_common_bad_batches)
print("Total Number of batches = ", total_num_of_batches)
# NOTE(review): total_num_of_batches * BATCH_SIZE assumes that every batch is full; it over-counts if the
# data loader yields a smaller final batch -- TODO confirm against create_data_loader.
print("Total number of translation pairs = ", total_num_of_batches * BATCH_SIZE)
print("Number of english translation pairs lost because of tokenization = ", num_lost_english_translation_pairs)
print("Number of telugu translation pairs lost because of tokenization = ", num_lost_telugu_translation_pairs)
print("Number of common translation pairs lost because of tokenization = ", num_lost_common_translation_pairs)

Number of english bad batches =  0
Number of telugu bad batches =  8
Number of common bad batches =  0
Total Number of batches =  79
Total number of translation pairs =  5056
Number of english translation pairs lost because of tokenization =  0
Number of telugu translation pairs lost because of tokenization =  512
Number of common translation pairs lost because of tokenization =  0


Experiment with how the token count changes with different tokenizers and vocab sizes

In [26]:
def _sorted_token_counts(tokenizer, dataset, field):
    """Return, in ascending order, the number of tokens `tokenizer` produces for `field` of every example."""
    return sorted(len(tokenizer.encode(example[field])) for example in dataset)


def _print_token_count_summary(token_counts):
    """Print the number of sentences, followed by the smallest and the largest token count."""
    print(len(token_counts))
    print(token_counts[0], token_counts[-1])


# Spacy tokenizer, English (source) sentences.
spacy_en_token_counts = _sorted_token_counts(spacy_english_tokenizer, translation_dataset_check_lost_pairs, 'src')
_print_token_count_summary(spacy_en_token_counts)
print("-" * 150)

# Spacy tokenizer, Telugu (target) sentences.
spacy_te_token_counts = _sorted_token_counts(spacy_telugu_tokenizer, translation_dataset_check_lost_pairs, 'tgt')
_print_token_count_summary(spacy_te_token_counts)
print("-" * 150)

# BPE tokenizer, English (source) sentences.
bpe_en_token_counts = _sorted_token_counts(bpe_english_tokenizer, translation_dataset_check_lost_pairs, 'src')
_print_token_count_summary(bpe_en_token_counts)
print("-" * 150)

# BPE tokenizer, Telugu (target) sentences.
bpe_te_token_counts = _sorted_token_counts(bpe_telugu_tokenizer, translation_dataset_check_lost_pairs, 'tgt')
_print_token_count_summary(bpe_te_token_counts)

250000
1 294
------------------------------------------------------------------------------------------------------------------------------------------------------
250000
1 1038
------------------------------------------------------------------------------------------------------------------------------------------------------
250000
2 449
------------------------------------------------------------------------------------------------------------------------------------------------------
250000
1 4950


In [40]:
def get_count_over_threshold(token_counts: list, threshold: int):
    """Counts how many entries of 'token_counts' are strictly greater than 'threshold'.

    Args:
        token_counts (list): Token counts to inspect.
        threshold (int): Value an entry has to exceed (strictly) in order to be counted.

    Returns:
        int: Number of entries in 'token_counts' that are larger than 'threshold'.
    """
    return sum(1 for count in token_counts if count > threshold)

In [48]:
# One output line per tokenizer type (spacy first, then bpe): english count followed by telugu count.
for en_counts, te_counts in ((spacy_en_token_counts, spacy_te_token_counts),
                             (bpe_en_token_counts, bpe_te_token_counts)):
    print(get_count_over_threshold(en_counts, MAX_NUM_OF_TOKENS_ALLOWED),
          get_count_over_threshold(te_counts, MAX_NUM_OF_TOKENS_ALLOWED))

9 4
11 1245


In [49]:
# Keeping track of the results from different trained tokenizers.
#
# English:
#     Spacy: 
#         vocab_size: 30000
#         min_num_of_tokens: 1
#         max_num_of_tokens: 294
#         sentences over 100 tokens: 84
#         sentences over 150 tokens: 23
#         sentences over 200 tokens: 9
#     ----------------------------------------------- 
#     BPE:
#         vocab_size: 30000
#         min_num_of_tokens: 2
#         max_num_of_tokens: 449
#         sentences over 100 tokens: 157
#         sentences over 150 tokens: 37
#         sentences over 200 tokens: 16
#     ----------------------------------------------- 
#     Spacy: 
#         vocab_size: 50000
#         min_num_of_tokens: 1
#         max_num_of_tokens: 294
#         sentences over 100 tokens: 84
#         sentences over 150 tokens: 23
#         sentences over 200 tokens: 9
#     ----------------------------------------------- 
#     BPE:
#         vocab_size: 50000
#         min_num_of_tokens: 2
#         max_num_of_tokens: 409
#         sentences over 100 tokens: 129
#         sentences over 150 tokens: 33
#         sentences over 200 tokens: 11
#     ----------------------------------------------- 
#
#
# Telugu:
#     Spacy: 
#         vocab_size: 30000
#         min_num_of_tokens: 1
#         max_num_of_tokens: 1038
#         sentences over 100 tokens: 41
#         sentences over 150 tokens: 7
#         sentences over 200 tokens: 4
#     ----------------------------------------------- 
#     BPE:
#         vocab_size: 30000
#         min_num_of_tokens: 1 
#         max_num_of_tokens: 4950
#         sentences over 100 tokens: 15239
#         sentences over 150 tokens: 3862
#         sentences over 200 tokens: 1247
#     ----------------------------------------------- 
#     Spacy: 
#         vocab_size: 50000
#         min_num_of_tokens: 1 
#         max_num_of_tokens: 1038
#         sentences over 100 tokens: 41
#         sentences over 150 tokens: 7
#         sentences over 200 tokens: 4
#     ----------------------------------------------- 
#     BPE:
#         vocab_size: 50000
#         min_num_of_tokens: 1 
#         max_num_of_tokens: 4942
#         sentences over 100 tokens: 15226
#         sentences over 150 tokens: 3861
#         sentences over 200 tokens: 1245
#
#
# I have the following insights from the above experimental data:
#
# 1) Spacy tokenizer seems a good choice from the perspective of the number of tokens.
#       -- The number of tokens in Telugu is considerably lower with spacy than with bpe.
#       -- Fewer tokens for the same sentence is always good because the model has to make associations over fewer tokens. 
# 2) BPE tokenizer performance doesn't seem to change much when the vocab_size is increased from 30000 to 50000.
#       -- It is good to use max vocab size of 30000 for Telugu in case BPE tokenizer is used for the model training.

Count the number of unknown tokens after the tokenization with different tokenizers

In [11]:
# Counting the number of unknown tokens when tokenized with different tokenizers.
# The 'unk' token is represented by the token id 3 in the tokenized sentences.
# The bpe tokenizers should not produce any 'unk' tokens because they are byte-level tokenizers.
UNK_TOKEN_ID = 3


def count_unk_tokens(tokenizer, text_key: str) -> int:
    """Counts the 'unk' tokens produced when 'tokenizer' encodes every sentence of the dataset.

    Args:
        tokenizer: Tokenizer whose 'encode' method is called on each sentence. The result is
                   expected to support '.count()' (i.e. behave like a list of token ids).
        text_key (str): Key of the sentence to encode in each data point ('src' or 'tgt').

    Returns:
        int: Total number of occurrences of UNK_TOKEN_ID over all the data points in
             'translation_dataset_check_lost_pairs'.
    """
    return sum(
        tokenizer.encode(data_point[text_key]).count(UNK_TOKEN_ID)
        for data_point in translation_dataset_check_lost_pairs
    )


spacy_en_unk_token_count = count_unk_tokens(spacy_english_tokenizer, 'src')
print("Number of 'unk' tokens in spacy english tokenizer: ", spacy_en_unk_token_count)
print("-" * 150)

spacy_te_unk_token_count = count_unk_tokens(spacy_telugu_tokenizer, 'tgt')
print("Number of 'unk' tokens in spacy telugu tokenizer: ", spacy_te_unk_token_count)
print("-" * 150)

bpe_en_unk_token_count = count_unk_tokens(bpe_english_tokenizer, 'src')
print("Number of 'unk' tokens in bpe english tokenizer: ", bpe_en_unk_token_count)
print("-" * 150)

bpe_te_unk_token_count = count_unk_tokens(bpe_telugu_tokenizer, 'tgt')
print("Number of 'unk' tokens in bpe telugu tokenizer: ", bpe_te_unk_token_count)
print("-" * 150)

Number of 'unk' tokens in spacy english tokenizer:  70262
------------------------------------------------------------------------------------------------------------------------------------------------------
Number of 'unk' tokens in spacy telugu tokenizer:  329344
------------------------------------------------------------------------------------------------------------------------------------------------------
Number of 'unk' tokens in bpe english tokenizer:  0
------------------------------------------------------------------------------------------------------------------------------------------------------
Number of 'unk' tokens in bpe telugu tokenizer:  0
------------------------------------------------------------------------------------------------------------------------------------------------------


In [12]:
# Accumulate, in a single pass over the dataset, the total number of tokens every
# tokenizer produces for the source ('src') and the target ('tgt') sentences.
total_spacy_en_token_count = total_spacy_te_token_count = 0
total_bpe_en_token_count = total_bpe_te_token_count = 0
for data_point in translation_dataset_check_lost_pairs:
    src_sentence, tgt_sentence = data_point['src'], data_point['tgt']
    total_spacy_en_token_count += len(spacy_english_tokenizer.encode(src_sentence))
    total_spacy_te_token_count += len(spacy_telugu_tokenizer.encode(tgt_sentence))
    total_bpe_en_token_count += len(bpe_english_tokenizer.encode(src_sentence))
    total_bpe_te_token_count += len(bpe_telugu_tokenizer.encode(tgt_sentence))


print(f"Total number of source tokens in the training set using spacy tokenizer: {total_spacy_en_token_count}")
print(f"Total number of target tokens in the training set using spacy tokenizer: {total_spacy_te_token_count}")
print(f"Total number of source tokens in the training set using bpe tokenizer: {total_bpe_en_token_count}")
print(f"Total number of target tokens in the training set using bpe tokenizer: {total_bpe_te_token_count}")

Total number of source tokens in the training set using spacy tokenizer: 2778620
Total number of target tokens in the training set using spacy tokenizer: 2182075
Total number of source tokens in the training set using bpe tokenizer: 2959244
Total number of target tokens in the training set using bpe tokenizer: 10683302


Manually observe some data output by the dataloader to confirm that batches are being formed correctly

In [27]:
# Iterate through the first 21 batches and print the sequence length (dimension 1)
# of the source and the target batch.
for idx, (src_batch, tgt_batch) in enumerate(train_dataloader_check_lost_pairs):
    if idx >= 21:
        break
    src_seq_len, tgt_seq_len = src_batch.shape[1], tgt_batch.shape[1]
    print("(src_batch sequence length: ", src_seq_len, ", tgt_batch sequence length: ", tgt_seq_len)

(src_batch sequence length:  41 , tgt_batch sequence length:  211
(src_batch sequence length:  13 , tgt_batch sequence length:  52
(src_batch sequence length:  9 , tgt_batch sequence length:  48
(src_batch sequence length:  20 , tgt_batch sequence length:  98
(src_batch sequence length:  14 , tgt_batch sequence length:  61
(src_batch sequence length:  33 , tgt_batch sequence length:  105
(src_batch sequence length:  25 , tgt_batch sequence length:  69
(src_batch sequence length:  9 , tgt_batch sequence length:  37
(src_batch sequence length:  36 , tgt_batch sequence length:  127
(src_batch sequence length:  6 , tgt_batch sequence length:  32
(src_batch sequence length:  11 , tgt_batch sequence length:  49
(src_batch sequence length:  9 , tgt_batch sequence length:  46
(src_batch sequence length:  65 , tgt_batch sequence length:  179
(src_batch sequence length:  14 , tgt_batch sequence length:  65
(src_batch sequence length:  22 , tgt_batch sequence length:  84
(src_batch sequence lengt

In [28]:
# Lets just iterate through a few batches observe the source and target batches.
for idx, (src_batch, tgt_batch) in enumerate(train_dataloader_check_lost_pairs):
    if idx > 5:
        break
    print("(src_batch: \n", src_batch)
    print("-" * 150)
    print("(tgt_batch: \n", tgt_batch)
    print("-" * 150)

(src_batch: 
 tensor([[23349,   347,  2935,   679,     2,     2,     2,     2],
        [  531,   303, 25670,  1282,    34,     2,     2,     2],
        [18007,    86,   646,  2912,     2,     2,     2,     2],
        [  527,   303,   347,  2340,    34,     2,     2,     2],
        [ 5952,   769,   724,    17,     2,     2,     2,     2],
        [ 2437,   464,   608,   815,    34,     2,     2,     2],
        [20150,  1353,  1481,    17,     2,     2,     2,     2],
        [ 4493,   396,   394,    17,     2,     2,     2,     2],
        [ 5983,  1837,  5748,     2,     2,     2,     2,     2],
        [ 1850,  5103,  1121,    17,     2,     2,     2,     2],
        [  754,   303,   868,   371,  7098,    17,     2,     2],
        [ 2797,    15, 13173,   445,   787,    17,     2,     2],
        [  392,  1481,  1574,     2,     2,     2,     2,     2],
        [  527,   351,   857, 22265,    34,     2,     2,     2],
        [ 1350,   262,   839,  4162,    17,     2,     2,     

### Inspect the saved model state dict to see that all the parameters are being saved

In [29]:
from model_implementation.data_processing.data_preparation.data_helpers import get_tokenizers
from model_implementation.model_building.machine_translation_model import MachineTranslationModel
from model_implementation.utils.constants import ( 
    DROPOUT_PROB, D_FEED_FORWARD, D_MODEL, FULL_EN_TE_DATASET_PATH, 
    MAX_INPUT_SEQUENCE_LENGTH, NUM_HEADS, NUM_LAYERS
)

In [30]:
# Fetch the English and Telugu BPE tokenizers for the full dataset without retraining them.
english_tokenizer, telugu_tokenizer = get_tokenizers(
    dataset_relative_path=FULL_EN_TE_DATASET_PATH,
    tokenizer_type="bpe",
    retrain_tokenizers=False,
)

In [31]:
# Vocabulary sizes as reported by the source (English) and target (Telugu) tokenizers.
src_vocab_size, tgt_vocab_size = (
    english_tokenizer.get_vocab_size(),
    telugu_tokenizer.get_vocab_size(),
)
print("src_vocab_size: ", src_vocab_size)
print("tgt_vocab_size: ", tgt_vocab_size)

src_vocab_size:  30000
tgt_vocab_size:  30000


In [32]:
# Build the translation model from the project constants and the tokenizer vocabulary sizes.
translation_model = MachineTranslationModel(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=D_MODEL,
    d_feed_forward=D_FEED_FORWARD,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dropout_prob=DROPOUT_PROB,
    max_seq_len=MAX_INPUT_SEQUENCE_LENGTH,
)

In [33]:
# Show which container type the model returns from state_dict().
model_state = translation_model.state_dict()
print(type(model_state))

<class 'collections.OrderedDict'>


In [34]:
# List every entry of the state dict next to the shape of its value.
for entry_name, entry_value in translation_model.state_dict().items():
    print(entry_name, " ", entry_value.shape)

src_embedding.look_up_table.weight   torch.Size([30000, 512])
tgt_embedding.look_up_table.weight   torch.Size([30000, 512])
src_positional_encoding.positional_encoding   torch.Size([1, 150, 512])
tgt_positional_encoding.positional_encoding   torch.Size([1, 150, 512])
encoder.encoder_layers.0.self_attention.linear_layers.0.weight   torch.Size([512, 512])
encoder.encoder_layers.0.self_attention.linear_layers.0.bias   torch.Size([512])
encoder.encoder_layers.0.self_attention.linear_layers.1.weight   torch.Size([512, 512])
encoder.encoder_layers.0.self_attention.linear_layers.1.bias   torch.Size([512])
encoder.encoder_layers.0.self_attention.linear_layers.2.weight   torch.Size([512, 512])
encoder.encoder_layers.0.self_attention.linear_layers.2.bias   torch.Size([512])
encoder.encoder_layers.0.self_attention.linear_layers.3.weight   torch.Size([512, 512])
encoder.encoder_layers.0.self_attention.linear_layers.3.bias   torch.Size([512])
encoder.encoder_layers.0.feed_forward.linear_layer_1.wei

### Verify the correctness of Inference via Greedy Search

In [3]:
from model_implementation.data_processing.data_preparation.data_helpers import get_tokenizers
from model_implementation.model_building.machine_translation_model import MachineTranslationModel
from model_implementation.model_inference.translator import translate
from model_implementation.utils.constants import ( 
    DROPOUT_PROB, D_FEED_FORWARD, D_MODEL, FULL_EN_TE_DATASET_PATH, MAX_INPUT_SEQUENCE_LENGTH, NUM_HEADS, NUM_LAYERS
)
from model_implementation.utils.helpers import load_model_from_disk

In [4]:
def load_translation_model_from_disk(
    model_name: str,
    src_vocab_size: int,
    tgt_vocab_size: int,
    checkpoint_prefix: str = "",
) -> MachineTranslationModel:
    """Builds a MachineTranslationModel and loads its saved state from disk.

    The model is constructed with the hyperparameters imported from the project constants
    (D_MODEL, D_FEED_FORWARD, DROPOUT_PROB, NUM_HEADS, NUM_LAYERS, MAX_INPUT_SEQUENCE_LENGTH)
    and is then handed to 'load_model_from_disk' together with the model name and the
    checkpoint prefix.

    Args:
        model_name (str): Name of the model to load from disk.
        src_vocab_size (int): Size of the source vocabulary the model is built with.
        tgt_vocab_size (int): Size of the target vocabulary the model is built with.
        checkpoint_prefix (str, optional): Prefix forwarded to 'load_model_from_disk' to
                                           select the checkpoint. Defaults to "".

    Returns:
        MachineTranslationModel: The model instance after 'load_model_from_disk' was applied to it.
    """
    model_hyperparameters = dict(
        d_model=D_MODEL,
        d_feed_forward=D_FEED_FORWARD,
        dropout_prob=DROPOUT_PROB,
        num_heads=NUM_HEADS,
        num_layers=NUM_LAYERS,
        max_seq_len=MAX_INPUT_SEQUENCE_LENGTH,
    )
    model = MachineTranslationModel(src_vocab_size=src_vocab_size,
                                    tgt_vocab_size=tgt_vocab_size,
                                    **model_hyperparameters)
    load_model_from_disk(model=model, model_name=model_name, checkpoint_prefix=checkpoint_prefix)
    return model

In [5]:
# Load the 'colab_run_2_epoch_14' checkpoint of the 'bpe_large_train_dataset' model and
# switch the model to evaluation mode; the cell displays the module structure.
translation_model = load_translation_model_from_disk(
    model_name="bpe_large_train_dataset",
    src_vocab_size=30000,
    tgt_vocab_size=30000,
    checkpoint_prefix="colab_run_2_epoch_14",
)
translation_model.eval()

MachineTranslationModel(
  (src_embedding): Embeddings(
    (look_up_table): Embedding(30000, 512)
  )
  (tgt_embedding): Embeddings(
    (look_up_table): Embedding(30000, 512)
  )
  (src_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (tgt_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attention): MultiHeadedAttention(
          (dropout_layer): Dropout(p=0.1, inplace=False)
          (linear_layers): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
        )
        (feed_forward): FeedForwardNN(
          (linear_layer_1): Linear(in_features=512, out_features=2048, bias=True)
          (linear_layer_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout_layer): Dropout(p=0.1, inplace=False)
        )
        (sublayer_wrappe

In [6]:
# Fetch the English and Telugu BPE tokenizers for the full dataset without retraining them.
english_tokenizer, telugu_tokenizer = get_tokenizers(
    dataset_relative_path=FULL_EN_TE_DATASET_PATH,
    tokenizer_type="bpe",
    retrain_tokenizers=False,
)

In [7]:
# Hand-written English sentences used as inputs for the translation checks.
src_sentences_set_1 = [
    "I am a Software Engineer at Google.",
    "How do I learn Machine Learing and start working on awesome ideas?",
    "Lets do a Masters in Data Science at good university.",
    "I watched The Boys tv show last week. It was awesome",
]

# Second set of input sentences.
src_sentences_set_2 = [
    "Have you heard about Foie gras?",
    "I never thought of acting in films.",
    "Installed Software",
    "A case has been registered under Sections 302 and 376, IPC.",
    "Of this, 10 people succumbed to the injuries.",
]

# Three of the sentences that also appear in the second set.
src_sentences_set_3 = [
    "Have you heard about Foie gras?",
    "I never thought of acting in films.",
    "A case has been registered under Sections 302 and 376, IPC.",
]

In [8]:
# Translate the third sentence set on the CPU using beam search with a beam size of 3.
translated_sentences = translate(
    translation_model=translation_model,
    src_tokenizer=english_tokenizer,
    tgt_tokenizer=telugu_tokenizer,
    src_sentences=src_sentences_set_3,
    search_type="beam",
    beam_size=3,
    device="cpu",
)

tokenized_src_sequences: [[3444, 453, 3345, 624, 511, 82, 483, 1033, 295, 34], [44, 1536, 2621, 289, 4851, 285, 1268, 17], [36, 652, 357, 436, 964, 632, 18974, 18478, 297, 20849, 15, 7811, 17]]
src_batch: tensor([[ 3444,   453,  3345,   624,   511,    82,   483,  1033,   295,    34,
             2,     2,     2],
        [   44,  1536,  2621,   289,  4851,   285,  1268,    17,     2,     2,
             2,     2,     2],
        [   36,   652,   357,   436,   964,   632, 18974, 18478,   297, 20849,
            15,  7811,    17]], dtype=torch.int32)
------------------------------------------------------------------------------------------------------------------------------------------------------
src_mask: tensor([[[[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           False, False, False]]],


        [[[ True,  True,  True,  True,  True,  True,  True,  True, False, False,
           False, False, False]]],


        [[[ True,  True,  True,  True,  True,  T

In [9]:
# First line: number of translations returned; second line: the translations themselves.
print(len(translated_sentences), translated_sentences, sep="\n")

3
['రలరల', 'రలరల', 'రలరల']


### Debug by running the entire training script on the debug dataset.

In [2]:
# Execute the training entry point end-to-end through IPython's %run magic, passing the model
# name, the checkpoint prefix, the device ("cuda") and the tokenizer type ("bpe") as
# command-line arguments.
%run ../model_implementation/model_training/training_script_main.py --model_name "bpe_30k_en-te-model_debug" --model_checkpoint_prefix "test_run_1" --device "cuda" --tokenizer_type "bpe"

  from .autonotebook import tqdm as notebook_tqdm
[2024-11-18 00:31:54,369 -- __main__ -- INFO -- Training the translation model with the following arguments: Namespace(model_checkpoint_prefix='test_run_1', model_name='bpe_30k_en-te-model_debug', device='cuda', tokenizer_type='bpe', max_english_vocab_size=30000, max_telugu_vocab_size=30000, retrain_tokenizers=False, resume_training=False)]
[2024-11-18 00:31:54,371 -- __main__ -- INFO -- Loading the following dataset for training: Data/AI4Bharat/debug_dataset]
[2024-11-18 00:31:55,065 -- model_implementation.data_processing.data_preparation.data_helpers -- INFO -- Loading pre-trained tokenizers from disk]
[2024-11-18 00:31:55,259 -- model_implementation.model_training.model_trainer -- INFO -- Source (English) vocabulary size: 30000]
[2024-11-18 00:31:55,260 -- model_implementation.model_training.model_trainer -- INFO -- Target (Telugu) vocabulary size: 30000]
[2024-11-18 00:31:56,657 -- model_implementation.data_processing.data_preparat

------------------------------------------------------------------------------------------------------------------------------------------------------


[2024-11-18 00:31:58,941 -- model_implementation.model_training.model_trainer -- INFO -- Approximate remaining training time in minutes: 0.02288533846537272]
Training the model on epochs:  50%|█████     | 1/2 [00:02<00:02,  2.27s/it][2024-11-18 00:31:59,136 -- model_implementation.model_training.model_trainer -- INFO -- Processing batch number: 0]
[2024-11-18 00:31:59,186 -- model_implementation.model_training.model_trainer -- INFO -- shape of expected_tgt_probability_distributions: torch.Size([64, 131, 30000])]
[2024-11-18 00:31:59,249 -- model_implementation.model_training.model_trainer -- INFO -- expected_tgt_probability_distributions: tensor([[[3.3336e-06, 3.3336e-06, 0.0000e+00,  ..., 3.3336e-06,
          3.3336e-06, 3.3336e-06],
         [3.3336e-06, 3.3336e-06, 0.0000e+00,  ..., 3.3336e-06,
          3.3336e-06, 3.3336e-06],
         [3.3336e-06, 3.3336e-06, 0.0000e+00,  ..., 3.3336e-06,
          3.3336e-06, 3.3336e-06],
         ...,
         [0.0000e+00, 0.0000e+00, 0.0000e+

------------------------------------------------------------------------------------------------------------------------------------------------------


[2024-11-18 00:32:00,513 -- model_implementation.model_training.model_trainer -- INFO -- Approximate remaining training time in minutes: 0.0]
Training the model on epochs: 100%|██████████| 2/2 [00:03<00:00,  1.92s/it]
[2024-11-18 00:32:00,924 -- __main__ -- INFO -- Model training completed in 0.10921062231063842 minutes.]
[2024-11-18 00:32:00,925 -- model_implementation.utils.state_holders -- INFO -- Device used for training: cuda]
[2024-11-18 00:32:00,925 -- model_implementation.utils.state_holders -- INFO -- Epoch: 0]
[2024-11-18 00:32:00,926 -- model_implementation.utils.state_holders -- INFO -- Training Time (in minutes): 0.02288533846537272]
[2024-11-18 00:32:00,927 -- model_implementation.utils.state_holders -- INFO -- Epoch start learning Rate: 1.746928107421711e-07]
[2024-11-18 00:32:00,927 -- model_implementation.utils.state_holders -- INFO -- Epoch end learning Rate: 5.240784322265133e-07]
[2024-11-18 00:32:00,928 -- model_implementation.utils.state_holders -- INFO -- Trainin

------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
