In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

# Math
import math

# HuggingFace libraries 
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Pathlib 
from pathlib import Path

# typing
from typing import Any

# Library for progress bars in loops
from tqdm import tqdm

# Importing library of warnings
import warnings

2025-09-30 08:59:43.889987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759222784.095171      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759222784.153508      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Architecture

![Image](https://shreyansh26.github.io/assets/img/posts_images/attention/arch.PNG)

# Tokenizer

![Tokenizer](https://i.ytimg.com/vi/hL4ZnAWSyuU/sddefault.jpg)

In [2]:
def build_tokenizer(config, ds, lang):
    """Return the word-level tokenizer for ``lang``, training it on first use.

    The file location is ``config['tokenizer_file']`` formatted with ``lang``.
    If that file already exists the tokenizer is loaded from it; otherwise a
    new one is trained on the ``lang`` sentences of ``ds`` and saved there.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: reuse the tokenizer saved by an earlier run
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Word-level model; words outside the vocabulary map to '[UNK]'
    tokenizer = Tokenizer(WordLevel(unk_token = '[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()

    # The trainer registers the four special tokens and is given min_frequency = 2
    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    trainer = WordLevelTrainer(special_tokens = special_tokens, min_frequency = 2)

    # Train on every sentence of the requested language, then persist the result
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer = trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [3]:
def get_config():
    """Return a fresh dict with the notebook's hyperparameters and file locations."""
    # Batching, schedule and the sizes shared by every layer
    training = {
        'batch_size': 8,
        'num_epochs': 20,
        'lr': 10**-4,
        'seq_len': 350,
        'd_model': 512, # Embedding size; 512 as in the "Attention Is All You Need" paper
    }

    # Language pair and where artefacts are read from / written to
    data_and_files = {
        'lang_src': 'en',
        'lang_tgt': 'it',
        'model_folder': 'weights',
        'model_basename': 'translation_model_',
        'preload': None,
        'tokenizer_file': 'tokenizer_{0}.json', # '{0}' is filled with the language code
        'experiment_name': 'runs/translation_model',
    }

    # Transformer architecture
    architecture = {
        'encoder_layers': 6,
        'decoder_layers': 6,
        'p_drop': 0.1,
        'dff': 2048,
        'n_heads': 8,
    }

    return {**training, **data_and_files, **architecture}

# Module-level configuration read by the model classes and the training code
config = get_config()

In [4]:
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` side of every translation pair in ``ds``."""
    yield from (example['translation'][lang] for example in ds)
        
# Module-level placeholders. get_ds() below assigns local variables with these
# same names (it has no `global` statement), so calling it does not change them.
# tokenizer_src and tokenizer_tgt are rebound at module level in a later cell,
# from the values get_ds() returns.
tokenizer_src = None
tokenizer_tgt = None
train_ds = None
def get_ds(config):
    """Load the OpusBooks pair named in ``config`` and build the dataloaders.

    Args:
        config: dict providing 'lang_src', 'lang_tgt', 'seq_len', 'batch_size'
            and the 'tokenizer_file' pattern used by build_tokenizer.

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt).
        The training loader uses config['batch_size']; the validation loader
        uses batch size 1.
    """
    # Loading the train portion of the OpusBooks dataset for the configured language pair
    ds_raw = load_dataset('opus_books', f'{config["lang_src"]}-{config["lang_tgt"]}', split = 'train')

    # Building or loading the tokenizer of each language
    tokenizer_src = build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = build_tokenizer(config, ds_raw, config['lang_tgt'])

    # 90% of the pairs for training, the remainder for validation.
    # NOTE(review): random_split is called without a seeded generator, so the
    # split differs between runs unless a global torch seed is set beforehand.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    # Wrapping both splits so that they yield fixed-length tensors
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # Reporting the longest tokenized sentence of each language over the whole dataset
    max_len_src = 0
    max_len_tgt = 0
    for pair in ds_raw:
        src_ids = tokenizer_src.encode(pair['translation'][config['lang_src']]).ids
        # The target sentence is measured with the target tokenizer
        tgt_ids = tokenizer_tgt.encode(pair['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    # Dataloaders iterate over the datasets in shuffled batches
    train_dataloader = DataLoader(train_ds, batch_size = config['batch_size'], shuffle = True)
    val_dataloader = DataLoader(val_ds, batch_size = 1, shuffle = True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

def casual_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    NOTE(review): the name is a misspelling of ``causal_mask``, which is defined
    separately with the same body; the dataset and model code shown in this
    notebook call ``causal_mask``.
    """
    # Ones strictly above the diagonal, zeros elsewhere
    above_diagonal = torch.ones(1, size, size).triu(diagonal = 1).to(torch.int)
    # True wherever nothing above the diagonal was marked
    return above_diagonal.eq(0)

In [5]:
class BilingualDataset(Dataset):
    """Dataset that turns one translation pair into fixed-length model tensors.

    ``ds`` holds items of the form ``{'translation': {lang: text}}``. Each item
    is tokenized with the source/target tokenizers and padded to ``seq_len``.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()

        self.ds = ds
        self.seq_len = seq_len
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt

        # Special-token ids are looked up in the target tokenizer and kept as
        # one-element int64 tensors so they can be concatenated directly
        self.sos_token, self.eos_token, self.pad_token = (
            torch.tensor([tokenizer_tgt.token_to_id(name)], dtype=torch.int64)
            for name in ("[SOS]", "[EOS]", "[PAD]")
        )

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, index: Any) -> Any:
        """Build the tensors for the pair at ``index``.

        Returns a dict with 'encoder_input', 'decoder_input' and 'label'
        (each of length seq_len), 'encoder_mask' of shape (1, seq_len),
        'decoder_mask' of shape (seq_len, seq_len), and the raw 'src_text'
        and 'tgt_text'.

        Raises:
            ValueError: if either sentence does not fit into seq_len once the
                special tokens are added.
        """
        pair = self.ds[index]['translation']
        src_text = pair[self.src_lang]
        tgt_text = pair[self.tgt_lang]

        # Token ids of both sentences
        src_ids = torch.tensor(self.tokenizer_src.encode(src_text).ids, dtype = torch.int64)
        tgt_ids = torch.tensor(self.tokenizer_tgt.encode(tgt_text).ids, dtype = torch.int64)

        # The encoder side carries '[SOS]' and '[EOS]'; decoder input and label carry one special token each
        enc_pad_count = self.seq_len - src_ids.size(0) - 2
        dec_pad_count = self.seq_len - tgt_ids.size(0) - 1

        if min(enc_pad_count, dec_pad_count) < 0:
            raise ValueError('Sentence is too long')

        pad_id = self.pad_token.item()
        enc_padding = torch.full((enc_pad_count,), pad_id, dtype = torch.int64)
        dec_padding = torch.full((dec_pad_count,), pad_id, dtype = torch.int64)

        # [SOS] source [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, src_ids, self.eos_token, enc_padding])
        # [SOS] target [PAD]...
        decoder_input = torch.cat([self.sos_token, tgt_ids, dec_padding])
        # target [EOS] [PAD]...  (the decoder input shifted left by one position)
        label = torch.cat([tgt_ids, self.eos_token, dec_padding])

        # Every tensor must come out at exactly seq_len
        for tensor in (encoder_input, decoder_input, label):
            assert tensor.size(0) == self.seq_len

        # 1 where the token is not padding
        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).int()
        # Not padding AND not a later position than the query
        decoder_not_pad = (decoder_input != self.pad_token).unsqueeze(0).int()
        decoder_mask = decoder_not_pad & (causal_mask(decoder_input.size(0)).squeeze(0))

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': encoder_mask,
            'decoder_mask': decoder_mask,
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }

def causal_mask(size):
    """Return a (1, size, size) boolean mask: True where column <= row.

    Row i marks the positions that query position i may attend to, i.e. itself
    and everything before it.
    """
    # 1 strictly above the diagonal (the "future" positions), 0 elsewhere
    future_positions = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)
    # Allowed positions are exactly the ones not marked as future
    return torch.logical_not(future_positions)

# Token Embedding

In [6]:
class TokenEmbeddings(nn.Module):
    """Learned token embeddings scaled by sqrt(d_model)."""

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # One d_model-dimensional vector per vocabulary entry
        self.embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = d_model)

    def forward(self, x):
        """Look up the embeddings of the token ids in ``x`` and scale them."""
        scale = math.sqrt(self.d_model)
        return self.embeddings(x) * scale
        

# Positional Encoding

<p style="
    margin-bottom: 5; 
    font-size: 22px;
    font-weight: 300;
    font-family: 'Helvetica Neue', sans-serif;
    color: #000000; 
  ">
    \begin{equation}
    \text{Even Indices } (2i): \quad \text{PE(pos, } 2i) = \sin\left(\frac{\text{pos}}{10000^{2i / d_{model}}}\right)
    \end{equation}
</p>

<p style="
    margin-bottom: 5; 
    font-size: 22px;
    font-weight: 300;
    font-family: 'Helvetica Neue', sans-serif;
    color: #000000; 
  ">
    \begin{equation}
    \text{Odd Indices } (2i + 1): \quad \text{PE(pos, } 2i + 1) = \cos\left(\frac{\text{pos}}{10000^{2i / d_{model}}}\right)
    \end{equation}
</p>

In [7]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position encodings added to the input embeddings.

    Even feature indices hold sin(pos / 10000^(2i / d_model)), odd indices the
    matching cosine. The table is precomputed for ``seq_len`` positions and
    stored as the buffer ``encodings`` with shape (1, seq_len, d_model).
    """

    def __init__(self, d_model: int, seq_len: int):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        positions = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1) # [seq_len, 1]
        # 1 / 10000^(2i / d_model), computed in log space
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq # [seq_len, d_model / 2]

        table = torch.zeros(seq_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Registered as a buffer with a leading batch dimension; it is not a trainable parameter
        self.register_buffer('encodings', table.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.shape[1]`` positions to ``x``."""
        n_positions = x.shape[1]
        return x + self.encodings[:, :n_positions, :].requires_grad_(False)
        

In [8]:
# Scratch instantiation of the positional-encoding module (d_model = 512, seq_len = 350).
# `posm` is not referenced by any other cell visible in this notebook.
posm = PositionalEncoding(512, 350)

# Layer Norm

In [9]:
class LayerNorm(nn.Module):
    """Normalizes the last dimension, then applies a learned scalar scale and shift.

    ``alpha`` and ``bias`` each hold a single value that is shared by all features.
    """

    def __init__(self, eps: float = 1e-9):
        super().__init__()
        self.eps = eps # Added to the standard deviation to avoid dividing by zero
        self.alpha = nn.Parameter(torch.ones(1)) # Multiplicative term
        self.bias = nn.Parameter(torch.zeros(1)) # Additive term

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias, with statistics over the last dim."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

# Feed-Forward Block (FFW)

In [10]:
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 & b1: expand to the hidden size
        self.dropout = nn.Dropout(dropout) # Applied to the hidden activations
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 & b2: project back to d_model

    def forward(self, x):
        """Map (batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

# Multi Head Attention

<center>
    <img src="https://i.imgur.com/JqJVrsj.png" width="1556" height="959" alt="Multi-head attention">
<p style = "font-size: 16px;
            font-family: 'Georgia', serif;
            text-align: center;
            margin-top: 10px;"></p>
</center>

In [11]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are projected to ``d_model`` features, split along
    the feature dimension into ``n_heads`` chunks, attended per head, then
    concatenated and passed through an output projection.

    Args:
        n_heads: number of attention heads.
        d_model: feature size of the inputs and of the output.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, n_heads: int, d_model: int):
        super().__init__()
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(f'd_model ({d_model}) must be divisible by n_heads ({n_heads})')
        self.n_heads = n_heads
        self.d_model = d_model
        # Per-head feature size, derived from this module's own d_model rather
        # than from the global config, so the two cannot disagree
        self.d_head = d_model // n_heads
        self.w_key = nn.Linear(d_model, d_model)
        self.w_query = nn.Linear(d_model, d_model)
        self.w_value = nn.Linear(d_model, d_model)
        self.w_out = nn.Linear(d_model, d_model)

    def attention(self, k, q, v, mask):
        """Scaled dot-product attention for one head.

        Note the argument order: key, query, value. Positions where ``mask`` is
        0 receive a score of -1e9 before the softmax; ``mask`` may be None, and
        otherwise must broadcast against the (..., q_len, k_len) score tensor.
        """
        d_k = q.shape[-1]
        affinities = (q @ k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            affinities = affinities.masked_fill(mask == 0, -1e9)
        affinities = affinities.softmax(dim=-1)
        return affinities @ v

    def forward(self, q, k, v, mask):
        """Attend from ``q`` over ``k``/``v``; the last dim of the result is d_model."""
        key = self.w_key(k)
        query = self.w_query(q)
        value = self.w_value(v)

        # Split the feature dimension into one chunk of d_head features per head
        k_chunks = torch.split(key, self.d_head, dim=-1)
        q_chunks = torch.split(query, self.d_head, dim=-1)
        v_chunks = torch.split(value, self.d_head, dim=-1)

        output_heads = [
            self.attention(k_head, q_head, v_head, mask)
            for k_head, q_head, v_head in zip(k_chunks, q_chunks, v_chunks)
        ]

        # Put the heads back side by side, then mix them
        concat_out = torch.cat(output_heads, dim=-1)
        return self.w_out(concat_out)
        

In [12]:
class ResidualConnection(nn.Module):
    """Residual wrapper that normalizes the input before the sub-layer: x + sub_layer(norm(x))."""

    def __init__(self):
        super().__init__()
        self.layernorm = LayerNorm()

    def forward(self, x, sub_layer):
        """Apply ``sub_layer`` (a callable) to the normalized input and add ``x`` back."""
        normalized = self.layernorm(x)
        return x + sub_layer(normalized)

# Encoder
<center>
    <img src="https://www.researchgate.net/profile/Ehsan-Amjadian/publication/352239001/figure/fig1/AS:1033334390013952@1623377525434/Detailed-view-of-a-transformer-encoder-block-It-first-passes-the-input-through-an.jpg" width="400" height="400" alt="Transformer encoder block">
<p style = "font-size: 16px;
            font-family: 'Georgia', serif;
            text-align: center;
            margin-top: 10px;">Encoder block. Source: <a href="https://www.researchgate.net/figure/Detailed-view-of-a-transformer-encoder-block-It-first-passes-the-input-through-an_fig1_352239001">researchgate.net</a>.</p>
</center>

In [13]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection.

    Sizes are read from the module-level ``config`` dict.
    """

    def __init__(self):
        super().__init__()
        self.resnet_mha = ResidualConnection()
        self.resnet_ffw = ResidualConnection()
        self.mha = MultiHeadAttention(n_heads=config['n_heads'], d_model=config['d_model'])
        self.ffw = FeedForwardBlock(d_model=config['d_model'], d_ff=config['dff'], dropout=config['p_drop'])

    def forward(self, x, src_mask):
        """Run the block on ``x``; ``src_mask`` is passed to the self-attention."""
        def self_attention(normed):
            # Queries, keys and values all come from the same sequence
            return self.mha(normed, normed, normed, src_mask)

        attended = self.resnet_mha(x, self_attention)
        return self.resnet_ffw(attended, self.ffw)

In [14]:
class Encoder(nn.Module):
    """Stack of ``config['encoder_layers']`` encoder blocks applied in sequence."""

    def __init__(self):
        super().__init__()
        n_layers = config['encoder_layers']
        self.encoder_blocks = nn.ModuleList([EncoderBlock() for _ in range(n_layers)])

    def forward(self, x, src_mask):
        """Feed ``x`` through every block, handing the same ``src_mask`` to each."""
        hidden = x
        for encoder_block in self.encoder_blocks:
            hidden = encoder_block(hidden, src_mask)
        return hidden

# Decoder

<center>
    <img src="https://res.cloudinary.com/edlitera/image/upload/c_fill,f_auto/v1680629118/blog/gz5ccspg3yvq4eo6xhrr" width="400" height="400" alt="Transformer decoder block">
<p style = "font-size: 16px;
            font-family: 'Georgia', serif;
            text-align: center;
            margin-top: 10px;"></p>
</center>

In [15]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers runs inside its own residual connection.
    Sizes are read from the module-level ``config`` dict.
    """

    def __init__(self):
        super().__init__()
        self.resnet_blocks = nn.ModuleList([ResidualConnection() for _ in range(3)])
        self.self_mha = MultiHeadAttention(n_heads=config['n_heads'], d_model=config['d_model'])
        self.cross_mha = MultiHeadAttention(n_heads=config['n_heads'], d_model=config['d_model'])
        self.ffw = FeedForwardBlock(d_model=config['d_model'], d_ff=config['dff'], dropout=config['p_drop'])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the block on ``x``, attending to ``encoder_output`` in the second sub-layer."""
        self_attn_residual, cross_attn_residual, ffw_residual = self.resnet_blocks

        def self_attention(normed):
            # The decoder attends to its own sequence under tgt_mask
            return self.self_mha(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            # Queries from the decoder, keys and values from the encoder output
            return self.cross_mha(normed, encoder_output, encoder_output, src_mask)

        x = self_attn_residual(x, self_attention)
        x = cross_attn_residual(x, cross_attention)
        return ffw_residual(x, self.ffw)

In [16]:
class Decoder(nn.Module):
    """Stack of ``config['decoder_layers']`` decoder blocks applied in sequence."""

    def __init__(self):
        super().__init__()
        n_layers = config['decoder_layers']
        self.decoder_block = nn.ModuleList([DecoderBlock() for _ in range(n_layers)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every block; all blocks share the encoder output and both masks."""
        hidden = x
        for decoder_layer in self.decoder_block:
            hidden = decoder_layer(hidden, encoder_output, src_mask, tgt_mask)
        return hidden

# Transformer

In [17]:
class TranslationTransformer(nn.Module):
    """Encoder-decoder Transformer for translation.

    Vocabulary sizes come from the module-level ``tokenizer_src`` and
    ``tokenizer_tgt``; layer sizes come from the module-level ``config``. Both
    tokenizers must therefore be set before the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.encoder_embeddings = TokenEmbeddings(config['d_model'], tokenizer_src.get_vocab_size())
        self.encoder = Encoder()
        self.decoder_embeddings = TokenEmbeddings(config['d_model'], tokenizer_tgt.get_vocab_size())
        self.decoder = Decoder()
        # One positional-encoding table is shared by the encoder and decoder inputs
        self.positional_encodings = PositionalEncoding(config['d_model'], config['seq_len'])
        self.projection = nn.Linear(config['d_model'], tokenizer_tgt.get_vocab_size())

    def encode(self, encoder_inp, src_mask):
        """Embed the source token ids, add positions and run the encoder stack."""
        encoder_embeddings = self.encoder_embeddings(encoder_inp)
        encoder_embeddings = self.positional_encodings(encoder_embeddings)
        encoder_output = self.encoder(encoder_embeddings, src_mask)
        return encoder_output

    def decode(self, encoder_output, decoder_inp, src_mask, tgt_mask):
        """Run the decoder and return log-probabilities over the target vocabulary."""
        decoder_embeddings = self.decoder_embeddings(decoder_inp)
        decoder_embeddings = self.positional_encodings(decoder_embeddings)
        decoder_output = self.decoder(decoder_embeddings, encoder_output, src_mask, tgt_mask)
        output = torch.log_softmax(self.projection(decoder_output), dim = -1)
        return output

    def generate(self, encoder_inp, src_mask):
        """Greedily decode a translation for a single source sequence.

        Starts from '[SOS]' and appends the most likely next token until
        '[EOS]' is produced or the sequence reaches config['seq_len'] tokens.
        Only batch element 0 is decoded.

        Returns:
            1-D tensor of target token ids, starting with '[SOS]'.
        """
        encoder_output = self.encode(encoder_inp, src_mask)

        sos_idx = tokenizer_tgt.token_to_id('[SOS]')
        eos_idx = tokenizer_tgt.token_to_id('[EOS]')

        # New tokens take their dtype and device from the input tensor instead of
        # a global `device`, so generation works wherever the inputs live
        token_kwargs = {'dtype': encoder_inp.dtype, 'device': encoder_inp.device}
        decoder_input = torch.full((1, 1), sos_idx, **token_kwargs)

        while decoder_input.shape[1] < config['seq_len']:
            tgt_mask = causal_mask(decoder_input.shape[1]).type_as(src_mask)
            decoder_output = self.decode(encoder_output, decoder_input, src_mask, tgt_mask)
            # Greedy choice: most likely token at the last position
            next_token = torch.argmax(decoder_output[0, -1, :]).item()
            decoder_input = torch.cat([decoder_input, torch.full((1, 1), next_token, **token_kwargs)], dim=1)

            if next_token == eos_idx:
                break

        return decoder_input.squeeze(0)

In [18]:
# Scratch demo of the indexing used for greedy decoding in
# TranslationTransformer.generate: a[0, -1, :] selects the last position of the
# first batch element, and argmax returns the index of its largest entry.
a = torch.randn(3,4,5)
print(a[0, -1, :])
torch.argmax(a[0, -1, :])

tensor([-0.3509, -0.6808,  0.7474, -0.2319,  0.3486])


tensor(2)

In [19]:
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, writer, num_examples=2):
    """Print the model's translation for the first ``num_examples`` validation batches.

    Args:
        model: object providing ``eval()`` and ``generate(encoder_input, encoder_mask)``.
        validation_ds: iterable of batches with batch size 1, each holding
            'encoder_input', 'encoder_mask', 'src_text' and 'tgt_text'.
        tokenizer_src: unused; kept so existing callers keep working.
        tokenizer_tgt: tokenizer used to decode the generated token ids.
        max_len: unused; kept so existing callers keep working.
        device: device the batch tensors are moved to.
        print_msg: callable that receives each output line.
        writer: unused; kept so existing callers keep working.
        num_examples: number of batches to translate; nothing is decoded when
            it is zero or negative.
    """
    model.eval() # Setting model to evaluation mode

    console_width = 80 # Fixed width of the separator line

    # Without this guard a non-positive limit would never match the counter
    # below and the entire validation set would be decoded
    if num_examples <= 0:
        return

    with torch.no_grad(): # No gradients are needed for inference
        for count, batch in enumerate(validation_ds, start = 1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            # Generation decodes a single sequence at a time
            assert encoder_input.size(0) ==  1, 'Batch size must be 1 for validation.'

            # Greedy decoding of the source sentence with the model's generate method
            model_out = model.generate(encoder_input, encoder_mask)

            # Retrieving source and target texts from the batch
            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0] # True translation
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy()) # Decoded, human-readable model output

            # Printing results
            print_msg('-'*console_width)
            print_msg(f'SOURCE: {source_text}')
            print_msg(f'TARGET: {target_text}')
            print_msg(f'PREDICTED: {model_out_text}')

            # Stop once the requested number of examples has been shown
            if count >= num_examples:
                break

In [20]:
# Build the dataloaders and tokenizers. This rebinds the module-level
# tokenizer_src / tokenizer_tgt names that TranslationTransformer reads.
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

README.md: 0.00B [00:00, ?B/s]

en-it/train-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Max length of source sentence: 309
Max length of target sentence: 274


In [21]:
# Inspect one training example to check the shapes of the two masks
data = train_dataloader.dataset[100]
print(data['decoder_mask'].shape, data['encoder_mask'].shape, sep = '\n')

torch.Size([350, 350])
torch.Size([1, 350])


In [22]:
# Instantiate the model on the selected device
model = TranslationTransformer().to(device)

# Xavier-uniform initialization for every weight matrix; parameters with a
# single dimension or fewer (biases, LayerNorm scalars) keep their default values
for p in model.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

In [23]:
def train_model():
    """Train the module-level ``model`` for config['num_epochs'] epochs.

    Uses the module-level ``train_dataloader``, ``val_dataloader``, tokenizers,
    ``config`` and ``device``. After every epoch a few validation translations
    are printed and a checkpoint is written to 'checkpoint_{epoch}.pth' in the
    working directory. The training loss is logged to TensorBoard under
    config['experiment_name'], one point per optimization step.
    """
    writer = SummaryWriter(config['experiment_name'])
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps = 1e-9)
    # Labels are target-language ids, so the padding id is taken from the target tokenizer
    loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_tgt.token_to_id('[PAD]'), label_smoothing = 0.1).to(device)

    global_step = 0 # x-axis of the TensorBoard loss curve

    try:
        for epoch in range(config['num_epochs']):
            # run_validation leaves the model in eval mode, so switch back every epoch
            model.train()
            batch_iterator = tqdm(train_dataloader, desc = f'Processing epoch {epoch:02d}')
            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device)
                decoder_input = batch['decoder_input'].to(device)
                encoder_mask = batch['encoder_mask'].to(device)
                decoder_mask = batch['decoder_mask'].to(device)
                label = batch['label'].to(device)

                encoder_output = model.encode(encoder_input, encoder_mask)
                # NOTE(review): decode() already returns log-probabilities and
                # CrossEntropyLoss applies log_softmax again; that second pass
                # leaves log-probabilities unchanged, so it is redundant, not wrong.
                decoder_output = model.decode(encoder_output, decoder_input, encoder_mask, decoder_mask)

                loss = loss_fn(decoder_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

                # Updating progress bar
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # Logging with an explicit step so that points are not all written to the same step
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # Backpropagation, parameter update, then clearing gradients for the next batch
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                global_step += 1

            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, batch_iterator.write, writer)

            # Saving the model and optimizer state reached at the end of this epoch
            torch.save({
                'epoch': epoch, # Current epoch
                'model_state_dict': model.state_dict(), # Current model state
                'optimizer_state_dict': optimizer.state_dict(), # Current optimizer state
            }, f'checkpoint_{epoch}.pth')
    finally:
        # Release the event file even if training is interrupted
        writer.close()

In [24]:
# Start training: config['num_epochs'] epochs, each followed by sample
# validation translations and a checkpoint file.
train_model()

Processing epoch 00: 100%|██████████| 3638/3638 [26:31<00:00,  2.29it/s, loss=5.127]


--------------------------------------------------------------------------------
SOURCE: Karenin, ready to deliver his speech, stood pressing his interlaced fingers together, trying whether some of them would not crack again.
TARGET: Aleksej Aleksandrovic, pronto per il suo discorso, stava in piedi, stringendo le dita incrociate e provando se in qualche giuntura volessero ancora scricchiolare.
PREDICTED: Aleksej Aleksandrovic , in fondo a , , , se non si trovava mai , se non si trovava a far più .
--------------------------------------------------------------------------------
SOURCE: It is not I who am guilty,' he said to himself, 'but it is she. She does not concern me.
TARGET: Non sono io il colpevole — diceva a se stesso — ma lei.
PREDICTED: Non è vero che io sono venuto a conoscere — disse , ma non è vero .


Processing epoch 01: 100%|██████████| 3638/3638 [26:33<00:00,  2.28it/s, loss=4.341]


--------------------------------------------------------------------------------
SOURCE: 'Yes, yes!' he cried in a shrill voice. 'I will take the disgrace, and even give up my son... but... but had we not better let it alone?
TARGET: — Sì, sì — gridò con voce stridula — prenderò su di me il disonore, darò anche mio figlio, ma... non sarebbe meglio? Del resto, fa’ quello che vuoi....
PREDICTED: — Sì , sì — gridò con voce stridula , — io lo , e io non lo , ma non ci , ma non ci .
--------------------------------------------------------------------------------
SOURCE: 'Well, if only because that woman is there, with whom you cannot associate...'
TARGET: — Ma via, non fosse altro che per il fatto che là c’è quella donna di cui tu non puoi fare la conoscenza.
PREDICTED: — E se non è perché , perché non è venuto da chi non può essere ...


Processing epoch 02: 100%|██████████| 3638/3638 [26:34<00:00,  2.28it/s, loss=3.543]


--------------------------------------------------------------------------------
SOURCE: "Georgiana, a more vain and absurd animal than you was certainly never allowed to cumber the earth.
TARGET: — Georgiana, un animale più vano e più stupido di voi, non ha certo avuto mai il diritto di ingombrare la terra.
PREDICTED: — Georgiana , — rispose , — che non vi siete più di .
--------------------------------------------------------------------------------
SOURCE: And if the Venetians and Florentines formerly extended their dominions by these arms, and yet their captains did not make themselves princes, but have defended them, I reply that the Florentines in this case have been favoured by chance, for of the able captains, of whom they might have stood in fear, some have not conquered, some have been opposed, and others have turned their ambitions elsewhere.
TARGET: E, se Viniziani e Fiorentini hanno per lo adrieto cresciuto lo imperio loro con queste arme, e li loro capitani non se ne sono

Processing epoch 03: 100%|██████████| 3638/3638 [26:34<00:00,  2.28it/s, loss=3.277]


--------------------------------------------------------------------------------
SOURCE: But what struck her most was the change that had taken place in Anna, whom she knew and loved.
TARGET: Ma più di tutto la stupiva il cambiamento avvenuto nell’Anna che conosceva e amava.
PREDICTED: Ma che le piaceva più il cambiamento che era Anna , che Anna era innamorata di lei , e lo sapeva .
--------------------------------------------------------------------------------
SOURCE: And to show that all was now well and satisfactory, she slightly opened her mouth, smacked her sticky lips, and drawing them more closely over her old teeth lay still in blissful peace.
TARGET: E a mostrar che ora stava bene, che era contenta, aprì leggermente la bocca, schioccò un po’ con le labbra e, accostate ai vecchi denti le labbra bavose, s’acquietò in una calma beata.
PREDICTED: E per mostrare che era bene , e si aprì , aprì la bocca , le labbra , le labbra , le labbra , i suoi , i suoi baci .


Processing epoch 04: 100%|██████████| 3638/3638 [26:35<00:00,  2.28it/s, loss=2.821]


--------------------------------------------------------------------------------
SOURCE: This is not only ungenerous, but not even gentlemanly – to hit one who is down.'
TARGET: Io non dico che questo sia poco generoso, ma è disonesto percuotere chi è a terra.
PREDICTED: Non solo non solo , ma a cui si farà , la cosa sola .
--------------------------------------------------------------------------------
SOURCE: During the Parliamentary struggle, Reading was besieged by the Earl of Essex, and, a quarter of a century later, the Prince of Orange routed King James's troops there.
TARGET: Durante la lotta parlamentare, Reading fu assediata dal conte di Essex, e, un quarto di secolo più tardi, il principe d’Orange vi sbaragliò le truppe del re Giacomo.
PREDICTED: In genere si , a Reading , il conte , e a un quarto d ’ inverno , il principe chiamò le dimissioni del quale si può avere un certo grado di ammirare le mie forze .


Processing epoch 05: 100%|██████████| 3638/3638 [26:36<00:00,  2.28it/s, loss=2.705]


--------------------------------------------------------------------------------
SOURCE: But granted a prince who has established himself as above, who can command, and is a man of courage, undismayed in adversity, who does not fail in other qualifications, and who, by his resolution and energy, keeps the whole people encouraged—such a one will never find himself deceived in them, and it will be shown that he has laid his foundations well.
TARGET: Ma, sendo uno principe che vi fondi su, che possa comandare e sia uomo di core, né si sbigottisca nelle avversità, e non manchi delle altre preparazioni, e tenga con l’animo et ordini sua animato l’universale, mai si troverrà ingannato da lui, e li parrà avere fatto li sua fondamenti buoni.
PREDICTED: Ma uomo principe principe che ha principe , come uomo che può avere , e che in uno di sopra , el principe che non si vede in altri , che in tutta la sua autorità ha , e in questo si tutto il populo all ’ altro , non si sarà mai buono in una modo

Processing epoch 06: 100%|██████████| 3638/3638 [26:39<00:00,  2.27it/s, loss=2.247]


--------------------------------------------------------------------------------
SOURCE: I took no note of the lapse of time--of the change from morning to noon, from noon to evening.
TARGET: Non mi accorgevo del tempo, non sapevo se era giorno o notte.
PREDICTED: Non ho minor relazione di una giornata , di quel tempo , di trovarvi la sera fino all ' ora .
--------------------------------------------------------------------------------
SOURCE: I feared nothing but interruption, and that came too soon.
TARGET: Temevo soltanto una interruzione, che non tardò.
PREDICTED: Non , ma d ’ essere anche d ’ essere pure .


Processing epoch 07: 100%|██████████| 3638/3638 [26:39<00:00,  2.27it/s, loss=1.953]


--------------------------------------------------------------------------------
SOURCE: The jury all wrote down on their slates, 'She doesn't believe there's an atom of meaning in it,' but none of them attempted to explain the paper.
TARGET: I giurati scrissero tutti sulla lavagna: “Ella non crede che vi sia in esso neppure un atomo di buon senso”.Ma nessuno cercò di spiegare il significato del foglio.
PREDICTED: I giurati si a scrivere delle loro lavagne , non li ha data , ma la possibilità di non trovare nessuno .
--------------------------------------------------------------------------------
SOURCE: Nowadays oats are forty-five kopeks at the inns.
TARGET: Al giorno d’oggi l’avena, dai portieri, sta a quarantacinque copeche.
PREDICTED: ora a quaranta anni , gli apparvero di quaranta .


Processing epoch 08: 100%|██████████| 3638/3638 [26:35<00:00,  2.28it/s, loss=1.941]


--------------------------------------------------------------------------------
SOURCE: If I hadn't woke you, you'd have lain there for the whole fortnight."
TARGET: Se io non ti avessi svegliato, saresti rimasto a letto per tutta la quindicina.
PREDICTED: Se non fossi stata una campana che vi tocchi , che tutti vi sarebbe rimasta tutta la divertirci .
--------------------------------------------------------------------------------
SOURCE: 'Just consider him!
TARGET: — Ma pensa, chi sa come sta?
PREDICTED: — Prima di lui , lo sapete .


Processing epoch 09: 100%|██████████| 3638/3638 [26:37<00:00,  2.28it/s, loss=1.574]


--------------------------------------------------------------------------------
SOURCE: Round the camp-fire in the market-place gather still more of the Barons' troops, and eat and drink deep, and bellow forth roystering drinking songs, and gamble and quarrel as the evening grows and deepens into night.
TARGET: Intorno al fuoco dell’accampamento, in piazza, si raccolgono le altre truppe dei baroni, e mangiano e bevono a più non posso, e muggono canzoni d’orgia, e giuocano e litigano come la sera s’avanza e s’approfondisce nella notte.
PREDICTED: tutt ’ altro che nel luogo dei contadini lo spazio dei baroni , e i , e il bevuto , e , nella notte , e la notte innanzi alle più cupe condizioni .
--------------------------------------------------------------------------------
SOURCE: "No, you are wrong. And now, never mind what I have been: don't trouble your head further about me; but tell me the name of the house where we are."
TARGET: — No, v'ingannate, ma poco importa ciò che facevo, no

Processing epoch 10: 100%|██████████| 3638/3638 [26:35<00:00,  2.28it/s, loss=1.841]


--------------------------------------------------------------------------------
SOURCE: This gave me not only egress and regress, as it was a back way to my tent and to my storehouse, but gave me room to store my goods.
TARGET: Ciò mi diede non solamente una porta di soccorso, per così esprimermi, che mi agevolava l’uscita e l’entrata così nella palizzata come nella grotta, ma un maggiore spazio per allogarvi le cose mie.
PREDICTED: Non solo mi dava fastidio e , quanto era a modo di fare il tenda e la mia tenda per essa andai in mano .
--------------------------------------------------------------------------------
SOURCE: The captain now had no difficulty before him, but to furnish his two boats, stop the breach of one, and man them.
TARGET: Il capitano non era più rattenuto da altri indugi fuor quello di allestire le due scialuppe, ristuccare cioè il forame fatto nell’una, entrambe guarnirle e fornirle d’uomini.
PREDICTED: Non gli altri , non senza , non senza le due barche , di la 

Processing epoch 11: 100%|██████████| 3638/3638 [26:35<00:00,  2.28it/s, loss=1.641]


--------------------------------------------------------------------------------
SOURCE: Kitty was unmarried and ill, and ill for love of the man who had slighted her.
TARGET: Kitty tuttora nubile e malata, malata d’amore per l’uomo che l’aveva disdegnata!
PREDICTED: Kitty non si vergognava e non voleva forse bene il nome di lei .
--------------------------------------------------------------------------------
SOURCE: _Friday_.—Yes, I have been here (points to the NW. side of the island, which, it seems, was their side).
TARGET: — Sì, essermi trovato;» in questa mi accennò il nord-west (maestro) dell’isola che sembra fosse la parte consueta del loro sbarco.
PREDICTED: — Sì , signore , io volessi , principalmente la parte d ’ un lato .


Processing epoch 12: 100%|██████████| 3638/3638 [26:34<00:00,  2.28it/s, loss=1.816]


--------------------------------------------------------------------------------
SOURCE: Friday had not been long gone when he came running back, and flew over my outer wall or fence, like one that felt not the ground or the steps he set his foot on; and before I had time to speak to him he cries out to me, “O master! O master!
TARGET: Venerdì non istette lungo tempo, che tornò addietro tutto ansante, e, scalato il piccolo muro della mia fortezza, corse a me che i suoi piedi non toccavano terra. — «Ah padrone! padrone!
PREDICTED: Venerdì non era ancora . Uno dei due passi in me , e la mia parte si fermarono sul muro , mi sentiva che o l ’ indomani gli alberi s ’ con la sua paura di fare : — « , egli mio padrone !
--------------------------------------------------------------------------------
SOURCE: 'That's none of your business, Two!' said Seven.
TARGET: — Questo non ti riguarda, Due! — rispose Sette.
PREDICTED: — Questo non è affar da loro , — rispose Diana .


Processing epoch 13: 100%|██████████| 3638/3638 [26:38<00:00,  2.28it/s, loss=1.648]


--------------------------------------------------------------------------------
SOURCE: He wants, first of all, to legitimatize his daughter and to be your husband and have a right to you.'
TARGET: Egli vuole, in primo luogo, legittimare sua figlia ed essere tuo marito, aver diritto su di te.
PREDICTED: Prima di tutto , alla figlia , alla vostra figliuola e familiare è tuo marito .
--------------------------------------------------------------------------------
SOURCE: He still slowly moved his finger over his upper lip, and still his eye dwelt dreamily on the glowing grate; thinking it urgent to say something, I asked him presently if he felt any cold draught from the door, which was behind him.
TARGET: Egli continuava ad agitare lentamente un dito sul labbro superiore e l'occhio era fisso sul fuoco. Per rompere il silenzio gli domandai se gli dava noia la porta, che aveva dietro.
PREDICTED: Si sentì il dito nelle sue labbra , senza dubbio quello che si diceva , entrando , e , come s

Processing epoch 14: 100%|██████████| 3638/3638 [26:38<00:00,  2.28it/s, loss=1.676]


--------------------------------------------------------------------------------
SOURCE: Vronsky sat at the head of the table; on his right was the young Governor, a General of the Emperor's suite.
TARGET: Vronskij sedeva a capotavola, alla sua destra sedeva il governatore, generale di corte.
PREDICTED: Vronskij sedeva la testa del salotto , sulla tavola , nel giovane carrozza che portavano al governatore , alquanto imperatore di provincia .
--------------------------------------------------------------------------------
SOURCE: Only he left very early.'
TARGET: Ma è andato via un po’ presto.
PREDICTED: Solo di questo ci rimaneva molto .


Processing epoch 15: 100%|██████████| 3638/3638 [26:35<00:00,  2.28it/s, loss=1.678]


--------------------------------------------------------------------------------
SOURCE: "I should think you ought to be at home yourself," said he, "if you have a home in this neighbourhood: where do you come from?"
TARGET: — Mi pare che anche voi a quest'ora, dovreste essere a casa, se abitate vicino. Di dove venite?
PREDICTED: — Sì , ci dovrebbe essere reso colpevole di casa , — disse , — che in casa dove la poteva entrare ?
--------------------------------------------------------------------------------
SOURCE: They speak of divorce.
TARGET: Mi dicono: il divorzio.
PREDICTED: Esse parlano col divorzio .


Processing epoch 16: 100%|██████████| 3638/3638 [26:36<00:00,  2.28it/s, loss=1.819]


--------------------------------------------------------------------------------
SOURCE: The truth was, we were too clever for them.
TARGET: La verità era che per loro eravamo noi troppo alti.
PREDICTED: Era vero , signora .
--------------------------------------------------------------------------------
SOURCE: Just in the same way Kitty, besides all her cares about linen, bedsores, and cooling drinks, had managed on the very first day to persuade the invalid of the necessity of receiving Communion and Extreme Unction.
TARGET: Katja, proprio alla stessa maniera, oltre tutte le preoccupazioni per la biancheria, per le piaghe, per le bevande, aveva fin dal primo giorno convinto il malato della necessità di comunicarsi e di ricevere l’estrema unzione.
PREDICTED: In realtà , anche in piacevole Kitty , si presentavano veramente per le spalle , il primo segno , che se non avesse finito l ’ abbiamo parlato della necessità di , il malato e l ’ hanno accolta l ’ intera .


Processing epoch 17: 100%|██████████| 3638/3638 [26:37<00:00,  2.28it/s, loss=1.599]


--------------------------------------------------------------------------------
SOURCE: 'There is a way out of every position.
TARGET: — Da qualsiasi situazione c’è una via d’uscita.
PREDICTED: — In qualsiasi situazione è de ’ tentativi di apparire .
--------------------------------------------------------------------------------
SOURCE: The Marshal of the Province, though he felt in the air that there was a plot prepared against him, and though he had not been unanimously asked to stand, had still decided to do so.
TARGET: Il maresciallo del governatorato, malgrado sentisse nell’aria l’inganno preparatogli, e malgrado non tutti l’avessero pregato, decise tuttavia di entrare in ballottaggio.
PREDICTED: Il maresciallo del governatorato era già pieno di cura che , lo sentiva in una aria abbastanza sorte , e pure non aveva neppure a decidersi poi a se stessa .


Processing epoch 18: 100%|██████████| 3638/3638 [26:37<00:00,  2.28it/s, loss=1.591]


--------------------------------------------------------------------------------
SOURCE: When we got back, it was after moonrise: a pony, which we knew to be the surgeon's, was standing at the garden door.
TARGET: Quando giungemmo a casa, la luna era alta. Un cavallo, che riconoscemmo per quello del medico, era legato al cancello del giardino.
PREDICTED: Quando tornammo di là , infatti era un poco seduta , di un singhiozzo che lo spazio del medico .
--------------------------------------------------------------------------------
SOURCE: 'Ah, so you are here!' she said on seeing him. 'Well, how is your poor sister?
TARGET: — Ah, anche voi siete qui — ella disse nel vederlo. — Be’, come va la vostra povera sorella?
PREDICTED: — Ah , sì ; è qui da voi ! — disse ella , come , nel vano della sorella . — come , Dio ?


Processing epoch 19: 100%|██████████| 3638/3638 [26:34<00:00,  2.28it/s, loss=1.672]


--------------------------------------------------------------------------------
SOURCE: Why, the day is already commenced which is to bind us indissolubly; and when we are once united, there shall be no recurrence of these mental terrors: I guarantee that."
TARGET: Il giorno che deve vederci uniti è già incominciato e quando sarete mia, vi assicuro che non avrete più queste paure immaginarie.
PREDICTED: Ma il giorno è già stata fortuna e da parte di quello che sono , e quando sono una volta d ' una volta : un vago che riguardava la sua vita m ' .
--------------------------------------------------------------------------------
SOURCE: Gazing into her face and laughing and shouting unnaturally they again passed by.
TARGET: Di nuovo le passarono accanto, guardandola in viso e gridando fra le risa qualcosa con voce contraffatta.
PREDICTED: il viso e ridendo e molta disinvoltura .
