# Import Dependencies

# Pytorch Transformer Model from scratch

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()


import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data



In [29]:
df = pd.read_csv('D:/College/5th Semester/NLP\Assignmnet 1/translation_train.csv')
df_test=pd.read_csv('D:/College/5th Semester/NLP/Assignmnet 1/translation_test.csv')

In [3]:
print(df.shape)
print(df.isnull().sum())
print(df.duplicated().sum())

(29000, 2)
english    0
german     0
dtype: int64
3


In [None]:
print(df_test.shape)
print(df_test.isnull().sum())
print(df_test.duplicated().sum())

(1000, 2)
english    0
german     0
dtype: int64
0


In [5]:
print(df.isnull().sum())
print(df.duplicated().sum())

english    0
german     0
dtype: int64
3


In [6]:
df.drop_duplicates(inplace=True)

In [7]:
for i in range(15):
    print(df['german'][i+1])
    print(df['english'][i+1])

Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.
Several men in hard hats are operating a giant pulley system.
Ein kleines Mädchen klettert in ein Spielhaus aus Holz.
A little girl climbing into a wooden playhouse.
Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.
A man in a blue shirt is standing on a ladder cleaning a window.
Zwei Männer stehen am Herd und bereiten Essen zu.
Two men are at the stove preparing food.
Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.
A man in green holds a guitar while the other man observes his shirt.
Ein Mann lächelt einen ausgestopften Löwen an.
A man is smiling at a stuffed lion
Ein schickes Mädchen spricht mit dem Handy während sie langsam die Straße entlangschwebt.
A trendy girl talking on her cellphone while gliding slowly down the street.
Eine Frau mit einer großen Geldbörse geht an einem Tor vorbei.
A woman with a large purse is walking by a gate.
Jungen tanzen mitten in der 

In [8]:
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

num_sentences = len(df)
num_train = int(train_ratio * num_sentences)
num_val = int(val_ratio * num_sentences)
num_test = num_sentences - num_train - num_val
# Shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
train_df = df[:num_train]
val_df = df[num_train:num_train+num_val]
test_df = df_test[:num_test]

In [10]:
len(train_df)

23197

In [11]:
len(val_df)

2899

# Tokenization

In [13]:
import sentencepiece as spm

# Train a SentencePiece tokenizer
def train_sentencepiece(corpus, model_prefix, vocab_size=7500):  # Adjust vocab_size
    with open(f"{model_prefix}_corpus.txt", "w") as f:
        f.write("\n".join(corpus))
    
    spm.SentencePieceTrainer.train(
        input=f"{model_prefix}_corpus.txt",
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=0.9995,  # Adjust for language diversity
        pad_id=0, unk_id=1, bos_id=2, eos_id=3
    )
    print(f"SentencePiece model trained for {model_prefix}")


# Train tokenizers for English and German
train_sentencepiece(train_df['english'], model_prefix="english")
train_sentencepiece(train_df['german'], model_prefix="german")

# Load SentencePiece models
en_tokenizer = spm.SentencePieceProcessor(model_file="english.model")
ger_tokenizer = spm.SentencePieceProcessor(model_file="german.model")

# Vocabulary sizes
src_vocab_size = en_tokenizer.get_piece_size()
tgt_vocab_size = ger_tokenizer.get_piece_size()
print(f"Source (English) vocab size: {src_vocab_size}")
print(f"Target (German) vocab size: {tgt_vocab_size}")

# Tokenizing function with special tokens
def tokenize_sentence_with_specials(sentence, tokenizer):
    tokens = tokenizer.encode(sentence, out_type=int)  # Convert sentence to subword token IDs
    tokens = [tokenizer.bos_id()] + tokens + [tokenizer.eos_id()]  # Add BOS and EOS
    return tokens

# Tokenizing training data
print("Tokenizing training data:")
train_en_tokens = [tokenize_sentence_with_specials(sent, en_tokenizer) for sent in train_df['english']]
train_gr_tokens = [tokenize_sentence_with_specials(sent, ger_tokenizer) for sent in train_df['german']]
print("Training Data tokenized")

# Tokenizing validation data
print("Tokenizing validation data:")
val_en_tokens = [tokenize_sentence_with_specials(sent, en_tokenizer) for sent in val_df['english']]
val_gr_tokens = [tokenize_sentence_with_specials(sent, ger_tokenizer) for sent in val_df['german']]
print("Validation Data tokenized")

# Tokenizing validation data
print("Tokenizing Test data:")
test_en_tokens = [tokenize_sentence_with_specials(sent, en_tokenizer) for sent in test_df['english']]
test_gr_tokens = [tokenize_sentence_with_specials(sent, ger_tokenizer) for sent in test_df['german']]
print("Test Data tokenized")


SentencePiece model trained for english
SentencePiece model trained for german
Source (English) vocab size: 7500
Target (German) vocab size: 7500
Tokenizing training data:
Training Data tokenized
Tokenizing validation data:
Validation Data tokenized
Tokenizing testing data:
Testing Data tokenized


#  Dataset and Data Loader

In [14]:
class TranslationDataset(data.Dataset):
    def __init__(self, en_tokens, ger_tokens):
        self.en_tokens = en_tokens
        self.ger_tokens = ger_tokens
        self.max_len = max(max(len(en), len(gr)) for en, gr in zip(en_tokens, ger_tokens))
        
    def __len__(self):
        return len(self.en_tokens)
    
    def __getitem__(self, index):
        en_data = self.en_tokens[index] + [0] * (self.max_len - len(self.en_tokens[index]))  # Padding with 0
        ger_data = self.ger_tokens[index] + [0] * (self.max_len - len(self.ger_tokens[index]))  # Padding with 0
        return torch.tensor(en_data), torch.tensor(ger_data)

In [15]:
train_dataset = TranslationDataset(train_en_tokens, train_gr_tokens)
val_dataset = TranslationDataset(val_en_tokens, val_gr_tokens)
test_dataset=TranslationDataset(test_en_tokens,test_gr_tokens)

# Create data loaders
train_loader = data.DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=2)
test_loader=data.DataLoader(test_dataset,batch_size=2)
print("Dataset and Dataloders created")

Dataset and Dataloders created


In [16]:
# Print the first batch of data from train_loader
for batch_idx, (en_data, ger_data) in enumerate(train_loader):
    print("English Data (Batch 0):", en_data)
    print("German Data (Batch 0):", ger_data)
    break  # Break after printing the first batch to avoid printing the entire dataset


English Data (Batch 0): tensor([[   2,    6,   19,   25,    4,  220,   90,  104,   79,   21,    4,  212,
            5,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [   2,    6,  172,  129,   16,    4,  214,  119,  507,  379,  180,   15,
           80,   60, 1268,    5,    3,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]])
German Data (Batch 0): tensor([[   2,   19,   22,    9, 1156,   30,  114,

# Positional Encoding

In [17]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [18]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # Frequency Scaling mechanism 
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Attention Layer

In [19]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, debug_str=None):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.debug_str = debug_str

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)

        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()

        
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, num_heads, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
         
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        attn_output = attn_output.transpose(1, 2).contiguous().view(Q.size(0), -1, self.d_model)
        output = self.W_o(attn_output)
        return output

# Encoder Layer

In [20]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

# Decoder

In [21]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)#, debug_str="cross")
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

# Full Transformer Model

In [22]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, pad_token_src=0, pad_token_tgt=0, device='cpu'):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        self.pad_token_src = pad_token_src
        self.pad_token_tgt = pad_token_tgt
        self.device = device
        self.to(self.device)
        
        # Define the encoder and decoder layers
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
    def generate_mask(self, src_mask, tgt_mask):
     # Ensure the src_mask is correctly shaped (batch_size, 1, 1, seq_len)
       src_mask = src_mask.unsqueeze(1).unsqueeze(2)
    
    # Ensure the tgt_mask is correctly shaped (batch_size, 1, seq_len, seq_len)
       tgt_mask = tgt_mask.unsqueeze(1).unsqueeze(3)
    
    # Sequence length for nopeak_mask
       seq_length = tgt_mask.size(2)
    
    # Generate lower triangular no-peak mask (1 for allowed, 0 for masked)
       nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()

    # Apply the no-peak mask to the target mask
       tgt_mask = tgt_mask & nopeak_mask.to(self.device)

       return src_mask, tgt_mask

    
    def decode(self, src, bos_token_id, eos_token_id, mask=None, max_dec_length=50):
     # Initialize the target tensor with the BOS token
      tgt = torch.tensor([[bos_token_id]] * src.shape[0]).to(self.device)

    # Initialize src_mask and tgt_mask if mask is provided, else generate them
      if mask:
         src_mask = mask.get('src_mask')
         tgt_mask = mask.get('tgt_mask', torch.ones((tgt.size(0), tgt.size(1), tgt.size(1)), device=self.device).bool())
      else:
        # Generate default masks if no mask is provided
          src_mask = src != self.pad_token_src
          tgt_mask = torch.ones((tgt.size(0), tgt.size(1), tgt.size(1)), device=self.device).bool()

    # Now apply the masks (src_mask and tgt_mask) to the model
      src_mask, tgt_mask = self.generate_mask(src_mask, tgt_mask)

      for _ in range(max_dec_length):
        # Forward pass through the model with the masks applied
          logits = self.forward(src, tgt, {'src_mask': src_mask, 'tgt_mask': tgt_mask})
          next_token = logits.argmax(-1)[:, -1].unsqueeze(1)

        # Add the predicted token to the target sequence
          tgt = torch.cat([tgt, next_token], dim=1)

        # Break if the EOS token is generated
          if torch.any(next_token == eos_token_id):
             break

          return tgt


    def forward(self, src, tgt, mask=None):
        if mask:
            src_mask, tgt_mask = self.generate_mask(mask['src_mask'], mask['tgt_mask'])
        else:
            src_mask, tgt_mask = self.generate_mask(src != self.pad_token_src, tgt != self.pad_token_tgt)
        
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output


In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#Hyperparameters
# Reduce Model Size
d_model = 64  # Decrease the model dimensionality
num_heads = 2  # Decrease the number of attention heads
num_layers = 2  # Decrease the number of layers
d_ff = 512  # Decrease the size of the feed-forward layers
max_seq_length = max(train_dataset.max_len, val_dataset.max_len, test_dataset.max_len)  # Maximum sequence length
dropout = 0.1  # Dropout probability
num_epochs=10  # Number of epochs
# Instantiate the Transformer model
transformer_model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device = device)

In [24]:
optimizer=torch.optim.Adam(transformer_model.parameters(),lr=0.001)
criterion=nn.CrossEntropyLoss()

In [25]:
print(transformer_model)

Transformer(
  (encoder_embedding): Embedding(7500, 64)
  (decoder_embedding): Embedding(7500, 64)
  (positional_encoding): PositionalEncoding()
  (fc): Linear(in_features=64, out_features=7500, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder_layers): ModuleList(
    (0-1): 2 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=64, out_features=64, bias=True)
        (W_k): Linear(in_features=64, out_features=64, bias=True)
        (W_v): Linear(in_features=64, out_features=64, bias=True)
        (W_o): Linear(in_features=64, out_features=64, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=64, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=64, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1,

In [26]:
max_seq_length

66

In [27]:
from tqdm import tqdm
import torch
import torch.nn as nn

# Early stopping parameters
patience = 3  # Number of epochs to wait for improvement before stopping
best_val_loss = float('inf')
epochs_without_improvement = 0
# Training loop
for epoch in range(num_epochs):
    transformer_model.train()  # Set the model to training mode
    total_loss = 0
    
    # Create a progress bar
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    # Iterate through batches
    for batch_idx, (src, tgt) in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input      
        # Flatten the output and target tensors to compute loss
        output_flat = output.view(-1, output.size(-1))
        tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
        
        # Calculate loss
        loss = criterion(output_flat, tgt_flat)
        
        # Backward pass
        loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1)
        
        # Update parameters
        optimizer.step()
        
        # Add batch loss to total loss
        total_loss += loss.item()
        
        # Update progress bar description
        progress_bar.set_postfix({"Loss": loss.item()})
    
    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    # Validation  transformer_model.eval()  # Set the model to evaluation mode
    val_loss = 0
    
    with torch.no_grad():
        # Create a progress bar for validation
        val_progress_bar = tqdm(enumerate(val_loader), total=len(val_loader), desc="Validation", unit="batch")
        
        for batch_idx, (src, tgt) in val_progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            
            # Forward pass
            output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
            
            # Flatten the output and target tensors to compute loss
            output_flat = output.view(-1, output.size(-1))  
            tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
            
            # Calculate loss
            loss = criterion(output_flat, tgt_flat)
            
            # Add batch loss to total loss
            val_loss += loss.item()
             
            # Update progress bar description
            val_progress_bar.set_postfix({"Validation Loss": loss.item()})

    # Calculate average validation loss
    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    
    # Early stopping logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0  # Reset the counter
        # Save the model
        torch.save(transformer_model.state_dict(), 'transformer_model.pth')
        print(f"Model saved at epoch {epoch+1}")
    else:
        epochs_without_improvement += 1
    
    # Check for early stopping
    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}. Validation loss did not improve.")
        break


Epoch 1/10:   0%|          | 1/11599 [00:00<10:41, 18.07batch/s, Loss=8.62]


Epoch 1/10, Average Loss: 0.0007


Validation:   0%|          | 1/1450 [00:00<00:21, 68.91batch/s, Validation Loss=7.02]


Validation Loss: 0.0048
Model saved at epoch 1


Epoch 2/10:   0%|          | 1/11599 [00:00<07:33, 25.58batch/s, Loss=6.97]


Epoch 2/10, Average Loss: 0.0006


Validation:   0%|          | 1/1450 [00:00<00:16, 87.95batch/s, Validation Loss=6.13]


Validation Loss: 0.0042
Model saved at epoch 2


Epoch 3/10:   0%|          | 1/11599 [00:00<08:58, 21.54batch/s, Loss=6.46]


Epoch 3/10, Average Loss: 0.0006


Validation:   0%|          | 1/1450 [00:00<00:14, 101.02batch/s, Validation Loss=5.7]


Validation Loss: 0.0039
Model saved at epoch 3


Epoch 4/10:   0%|          | 1/11599 [00:00<08:22, 23.10batch/s, Loss=5.95]


Epoch 4/10, Average Loss: 0.0005


Validation:   0%|          | 1/1450 [00:00<00:14, 97.60batch/s, Validation Loss=5.45]


Validation Loss: 0.0038
Model saved at epoch 4


Epoch 5/10:   0%|          | 1/11599 [00:00<07:32, 25.66batch/s, Loss=5.37]


Epoch 5/10, Average Loss: 0.0005


Validation:   0%|          | 1/1450 [00:00<00:15, 93.49batch/s, Validation Loss=5.28]


Validation Loss: 0.0036
Model saved at epoch 5


Epoch 6/10:   0%|          | 1/11599 [00:00<07:35, 25.45batch/s, Loss=5.34]


Epoch 6/10, Average Loss: 0.0005


Validation:   0%|          | 1/1450 [00:00<00:15, 90.98batch/s, Validation Loss=5.16]


Validation Loss: 0.0036
Model saved at epoch 6


Epoch 7/10:   0%|          | 1/11599 [00:00<05:32, 34.83batch/s, Loss=5.04]


Epoch 7/10, Average Loss: 0.0004


Validation:   0%|          | 1/1450 [00:00<00:16, 88.66batch/s, Validation Loss=5.07]


Validation Loss: 0.0035
Model saved at epoch 7


Epoch 8/10:   0%|          | 1/11599 [00:00<07:19, 26.37batch/s, Loss=5.84]


Epoch 8/10, Average Loss: 0.0005


Validation:   0%|          | 1/1450 [00:00<00:21, 68.60batch/s, Validation Loss=4.95]


Validation Loss: 0.0034
Model saved at epoch 8


Epoch 9/10:   0%|          | 1/11599 [00:00<06:18, 30.65batch/s, Loss=6.14]


Epoch 9/10, Average Loss: 0.0005


Validation:   0%|          | 1/1450 [00:00<00:27, 52.64batch/s, Validation Loss=4.86]


Validation Loss: 0.0034
Model saved at epoch 9


Epoch 10/10:   0%|          | 1/11599 [00:00<07:13, 26.75batch/s, Loss=5.12]


Epoch 10/10, Average Loss: 0.0004


Validation:   0%|          | 1/1450 [00:00<00:06, 211.73batch/s, Validation Loss=4.76]

Validation Loss: 0.0033
Model saved at epoch 10





In [28]:
from rouge_score import rouge_scorer
from tqdm import tqdm
import torch
import torch.nn as nn

# Early stopping parameters
patience = 3  # Number of epochs to wait for improvement before stopping
best_val_loss = float('inf')
epochs_without_improvement = 0


# Create a ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Training loop
for epoch in range(num_epochs):
    transformer_model.train()  # Set the model to training mode
    total_loss = 0
    
    # Create a progress bar for training
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    # Iterate through batches for training
    for batch_idx, (src, tgt) in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
        # Flatten the output and target tensors to compute loss
        output_flat = output.view(-1, output.size(-1))
        tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
        
        # Calculate loss
        loss = criterion(output_flat, tgt_flat)
        
        # Backward pass
        loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1)
        
        # Update parameters
        optimizer.step()
        
        # Add batch loss to total loss
        total_loss += loss.item()
        
        # Update progress bar description
        progress_bar.set_postfix({"Loss": loss.item()})
    
    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    # Validation
    transformer_model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    with torch.no_grad():
        # Create a progress bar for validation
        val_progress_bar = tqdm(enumerate(val_loader), total=len(val_loader), desc="Validation", unit="batch")
        
        for batch_idx, (src, tgt) in val_progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            
            # Forward pass
            output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
            
            # Flatten the output and target tensors to compute loss
            output_flat = output.view(-1, output.size(-1))  
            tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
            
            # Calculate loss
            loss = criterion(output_flat, tgt_flat)
            
            # Add batch loss to total validation loss
            val_loss += loss.item()
            
            # Decode the predictions to text (argmax over the vocab dimension)
            pred_tokens = output.argmax(dim=-1)  # Taking the predicted tokens (removes vocab dimension)
            
            # Loop through each pair of predicted and true targets for ROUGE calculation
            for pred, true in zip(pred_tokens, tgt):
                pred_text = ger_tokenizer.decode(pred.tolist())
                true_text = ger_tokenizer.decode(true.tolist())
                
                # Compute ROUGE score for each pair and append the F-measure values
                score = scorer.score(true_text, pred_text)
                for key in val_rouge_scores:
                    val_rouge_scores[key].append(score[key].fmeasure)

            # Update progress bar description
            val_progress_bar.set_postfix({"Validation Loss": loss.item()})

    # Calculate average validation loss
    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Calculate the average ROUGE scores
    avg_rouge_scores = {key: sum(values) / len(values) for key, values in val_rouge_scores.items()}
    print(f"Validation ROUGE Scores: {avg_rouge_scores}")
    
    # Early stopping logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0  # Reset the counter
        # Save the model
        torch.save(transformer_model.state_dict(), 'transformer_model.pth')
        print(f"Model saved at epoch {epoch+1}")
    else:
        epochs_without_improvement += 1
    
    # Check for early stopping
    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}. Validation loss did not improve.")
        break


Epoch 1/10: 100%|██████████| 11599/11599 [07:03<00:00, 27.38batch/s, Loss=0.881]


Epoch 1/10, Average Loss: 1.0510


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 103.38batch/s, Validation Loss=0.906]


Validation Loss: 0.9542
Validation ROUGE Scores: {'rouge1': 0.43283835715091595, 'rouge2': 0.19386879035341922, 'rougeL': 0.4162730873912906}
Model saved at epoch 1


Epoch 2/10: 100%|██████████| 11599/11599 [07:20<00:00, 26.34batch/s, Loss=0.701] 


Epoch 2/10, Average Loss: 0.8318


Validation: 100%|██████████| 1450/1450 [00:13<00:00, 105.22batch/s, Validation Loss=0.785]


Validation Loss: 0.8407
Validation ROUGE Scores: {'rouge1': 0.5023131110557396, 'rouge2': 0.2570274930347473, 'rougeL': 0.4851970226480362}
Model saved at epoch 2


Epoch 3/10: 100%|██████████| 11599/11599 [07:20<00:00, 26.32batch/s, Loss=0.802] 


Epoch 3/10, Average Loss: 0.7503


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 100.94batch/s, Validation Loss=0.729]


Validation Loss: 0.7880
Validation ROUGE Scores: {'rouge1': 0.5347638125299945, 'rouge2': 0.28935758130210404, 'rougeL': 0.5160682260691485}
Model saved at epoch 3


Epoch 4/10: 100%|██████████| 11599/11599 [07:26<00:00, 25.99batch/s, Loss=0.156]


Epoch 4/10, Average Loss: 0.6993


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 101.25batch/s, Validation Loss=0.718]


Validation Loss: 0.7479
Validation ROUGE Scores: {'rouge1': 0.5607894043232357, 'rouge2': 0.32053016506502635, 'rougeL': 0.5438058675530372}
Model saved at epoch 4


Epoch 5/10: 100%|██████████| 11599/11599 [07:39<00:00, 25.23batch/s, Loss=0.249] 


Epoch 5/10, Average Loss: 0.6625


Validation: 100%|██████████| 1450/1450 [00:13<00:00, 107.01batch/s, Validation Loss=0.637] 


Validation Loss: 0.7135
Validation ROUGE Scores: {'rouge1': 0.5769357531802678, 'rouge2': 0.33625503623856834, 'rougeL': 0.559124122756387}
Model saved at epoch 5


Epoch 6/10: 100%|██████████| 11599/11599 [07:46<00:00, 24.86batch/s, Loss=1.1]   


Epoch 6/10, Average Loss: 0.6332


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 102.57batch/s, Validation Loss=0.759] 


Validation Loss: 0.6885
Validation ROUGE Scores: {'rouge1': 0.5894155829211623, 'rouge2': 0.3569996679065852, 'rougeL': 0.5734363165611484}
Model saved at epoch 6


Epoch 7/10: 100%|██████████| 11599/11599 [07:35<00:00, 25.47batch/s, Loss=0.897] 


Epoch 7/10, Average Loss: 0.6103


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 97.45batch/s, Validation Loss=0.673]  


Validation Loss: 0.6725
Validation ROUGE Scores: {'rouge1': 0.5962022664634847, 'rouge2': 0.362150133390866, 'rougeL': 0.5807996008775139}
Model saved at epoch 7


Epoch 8/10: 100%|██████████| 11599/11599 [21:46<00:00,  8.88batch/s, Loss=0.606]   


Epoch 8/10, Average Loss: 0.5922


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 101.19batch/s, Validation Loss=0.643] 


Validation Loss: 0.6596
Validation ROUGE Scores: {'rouge1': 0.605010966631962, 'rouge2': 0.3746338651756154, 'rougeL': 0.5899880129688385}
Model saved at epoch 8


Epoch 9/10: 100%|██████████| 11599/11599 [09:41<00:00, 19.94batch/s, Loss=0.724]  


Epoch 9/10, Average Loss: 0.5757


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 102.56batch/s, Validation Loss=0.695] 


Validation Loss: 0.6558
Validation ROUGE Scores: {'rouge1': 0.6099110882217844, 'rouge2': 0.3763702754133648, 'rougeL': 0.5932270568663334}
Model saved at epoch 9


Epoch 10/10: 100%|██████████| 11599/11599 [11:07<00:00, 17.38batch/s, Loss=0.487]  


Epoch 10/10, Average Loss: 0.5644


Validation: 100%|██████████| 1450/1450 [00:14<00:00, 103.08batch/s, Validation Loss=0.624] 

Validation Loss: 0.6454
Validation ROUGE Scores: {'rouge1': 0.6123742827673571, 'rouge2': 0.3820343653436769, 'rougeL': 0.5971016769158085}
Model saved at epoch 10





# Evaluation Loop

In [30]:
# Testing
from rouge_score import rouge_scorer
from tqdm import tqdm
import torch
import torch.nn as nn

transformer_model.eval()  # Set the model to evaluation mode
test_loss = 0
test_rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
# Create a ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
with torch.no_grad():
        # Create a progress bar for validation
        test_progress_bar = tqdm(enumerate(test_loader), total=len(test_loader), desc="Test", unit="batch")
        
        for batch_idx, (src, tgt) in test_progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            
            # Forward pass
            output = transformer_model(src, tgt[:, :-1])  # Exclude the <eos> token from input
            
            # Flatten the output and target tensors to compute loss
            output_flat = output.view(-1, output.size(-1))  
            tgt_flat = tgt[:, 1:].contiguous().view(-1)  # Exclude the <bos> token from target
            
            # Calculate loss
            loss = criterion(output_flat, tgt_flat)
            
            # Add batch loss to total validation loss
            test_loss += loss.item()
            
            # Decode the predictions to text (argmax over the vocab dimension)
            pred_tokens = output.argmax(dim=-1)  # Taking the predicted tokens (removes vocab dimension)
            
            # Loop through each pair of predicted and true targets for ROUGE calculation
            for pred, true in zip(pred_tokens, tgt):
                pred_text = ger_tokenizer.decode(pred.tolist())
                true_text = ger_tokenizer.decode(true.tolist())
                
                # Compute ROUGE score for each pair and append the F-measure values
                score = scorer.score(true_text, pred_text)
                for key in test_rouge_scores:
                    test_rouge_scores[key].append(score[key].fmeasure)

            # Update progress bar description
            test_progress_bar.set_postfix({"Test Loss": loss.item()})

    # Calculate average validation loss
avg_loss = test_loss / len(test_loader)
print(f"Test Loss: {avg_loss:.4f}")

    # Calculate the average ROUGE scores
avg_rouge_scores = {key: sum(values) / len(values) for key, values in test_rouge_scores
                    .items()}
print(f"Test ROUGE Scores: {avg_rouge_scores}")
    

Test: 100%|██████████| 500/500 [00:04<00:00, 108.20batch/s, Test Loss=0.216]

Test Loss: 0.7769
Test ROUGE Scores: {'rouge1': 0.6174154518047238, 'rouge2': 0.38755095282171126, 'rougeL': 0.6033373757267394}



