# Machine Translation using Seq2Seq Model with English-German dataset

## Loading the Data and Pre-processing

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from torch.nn import LayerNorm
from itertools import chain
import numpy as np 
import spacy
import random 
import pandas as pd 
import matplotlib.pyplot as plt

In [44]:
df=pd.read_csv("D:/College/5th Semester/NLP/Assignmnet 1/translation_train.csv")
df.head()

Unnamed: 0,english,german
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...
3,A man in a blue shirt is standing on a ladder ...,Ein Mann in einem blauen Hemd steht auf einer ...
4,Two men are at the stove preparing food.,Zwei Männer stehen am Herd und bereiten Essen zu.


In [45]:
df_test=pd.read_csv("D:/College/5th Semester/NLP/Assignmnet 1/translation_test.csv")
df_test.head()

Unnamed: 0,english,german
0,A man in an orange hat starring at something.,"Ein Mann mit einem orangefarbenen Hut, der etw..."
1,A Boston Terrier is running on lush green gras...,Ein Boston Terrier läuft über saftig-grünes Gr...
2,A girl in karate uniform breaking a stick with...,Ein Mädchen in einem Karateanzug bricht einen ...
3,Five people wearing winter jackets and helmets...,Fünf Leute in Winterjacken und mit Helmen steh...
4,People are fixing the roof of a house.,Leute Reparieren das Dach eines Hauses.


In [46]:
df.duplicated().sum()

3

In [47]:
df.drop_duplicates(inplace=True)

In [48]:
df.dropna(inplace=True)

In [49]:
df_test.duplicated().sum()

0

In [50]:
df_test.dropna(inplace=True)

In [51]:
BATCH_SIZE=64
english=df['english'].tolist()
german=df['german'].tolist()

In [52]:
MAX_SEQ_LEN=20 # For computation

# Data Pre-Processing

In [53]:
from collections import Counter
from itertools import chain
def create_tokenizer(text_corpus, vocab_limit=10000):
    # Create a counter to count the word frequencies in the corpus
    word_counter = Counter(chain.from_iterable(sentence.split() for sentence in text_corpus))
    
    # Build a vocabulary with a limit
    vocabulary = ["<pad>", "<sos>", "<eos>", "<unk>"] + [word for word, _ in word_counter.most_common(vocab_limit - 4)]
    
    # Build mappings for word-to-index and index-to-word
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    index_to_word = {idx: word for word, idx in word_to_index.items()}
    
    # Tokenize function will return a list of integers (the word indices)
    def tokenize(sentence):
        # Use get() to prevent key errors (if a word is not in the vocabulary, it maps to <unk>)
        return [word_to_index.get(word, word_to_index["<unk>"]) for word in sentence.split()]
    
    return tokenize, word_to_index, index_to_word


In [54]:
import torch
from torch.utils.data import Dataset

class MTDataset(Dataset):
    def __init__(self, input_texts, output_texts, input_tokenizer, output_tokenizer, sequence_limit):
        # Use list comprehension to process the input and output sequences
        self.input_sequences = [input_tokenizer(text)[:sequence_limit] for text in input_texts]
        self.output_sequences = [output_tokenizer(text)[:sequence_limit] for text in output_texts]

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, index):
        input_sequence = torch.tensor(self.input_sequences[index], dtype=torch.long)
        output_sequence = torch.tensor(self.output_sequences[index], dtype=torch.long)
        return input_sequence, output_sequence


In [55]:
from torch.nn.utils.rnn import pad_sequence

def batch_collate_fn(batch):
    input_batch, output_batch = zip(*batch) 
    # Pad the input and output sequences
    input_padded = pad_sequence(input_batch, batch_first=True, padding_value=0)
    output_padded = pad_sequence(output_batch, batch_first=True, padding_value=0)
    return input_padded, output_padded


In [56]:
from sklearn.model_selection import train_test_split
training_input,validation_input,training_output,validation_output=train_test_split(english,german,test_size=0.2,random_state=42)


In [57]:
testing_input=df_test['english'].tolist()
testing_output=df_test['german'].tolist()


In [58]:
eng_tokenizer,eng_word2idx,eng_idx2word=create_tokenizer(training_input)
ger_tokenizer,ger_word2idx,gr_idx2word=create_tokenizer(training_output)

In [59]:
train_dataset=MTDataset(training_input,training_output,eng_tokenizer,ger_tokenizer,MAX_SEQ_LEN)
val_dataset=MTDataset(validation_input,validation_output,eng_tokenizer,ger_tokenizer,MAX_SEQ_LEN)
test_dataset=MTDataset(testing_input,testing_output,eng_tokenizer,ger_tokenizer,MAX_SEQ_LEN)


In [60]:
train_loader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=batch_collate_fn)
val_loader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=batch_collate_fn)
test_loader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=batch_collate_fn)

## Embedding Layer

In [61]:
from gensim.models import Word2Vec
import numpy as np

embed_dim = 256  
vocab = len(eng_word2idx) 

# Training the Word2Vec model
model_2 = Word2Vec(sentences=[sentence.split() for sentence in df['english'].tolist()], 
                   vector_size=embed_dim, window=5, min_count=1, sg=1)

# Initialize embedding matrix with random values
embedding_matrix = np.random.uniform(-0.05, 0.05, (vocab, embed_dim))

# Update the embedding matrix with pre-trained Word2Vec vectors
for word, idx in eng_word2idx.items():
    if word in model_2.wv:
        embedding_matrix[idx] = model_2.wv[word]


In [62]:
embedding_matrix.shape

(10000, 256)

## SEQ2SEQ Model

In [63]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers,embedding_matrix):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix,dtype=torch.float32)) 
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, bidirectional=True, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [64]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers,embedding_matrix):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix,dtype=torch.float32)) 
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)  # Add time-step dimension
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        predictions = self.fc(outputs.squeeze(1))
        return predictions, hidden, cell

In [65]:
class Early_Stopping:
    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_model_state = None

    def __call__(self, val_loss, model):
        score = -val_loss  
        if self.best_score is None:
            self.best_score = score
            self.best_model_state = model.state_dict()  
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_model_state = model.state_dict()  
            self.counter = 0

    def load_best_model(self, model):
        model.load_state_dict(self.best_model_state) 

In [66]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.num_layers = decoder.lstm.num_layers

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src.device)

        # Encoder forward pass
        hidden, cell = self.encoder(src)
        
        # Use the last forward states from the encoder
        hidden = hidden[-2].unsqueeze(0).repeat(self.num_layers, 1, 1)  # Repeat for num_layers
        cell = cell[-2].unsqueeze(0).repeat(self.num_layers, 1, 1)      # Repeat for num_layers

        # Decoder forward pass
        x = trg[:, 0]  # Start token (<sos>)
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, t, :] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            x = trg[:, t] if teacher_force else output.argmax(1)

        return outputs


## Model Training

In [67]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [68]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [69]:
input_dim=len(eng_word2idx)
output_dim=len(ger_word2idx)
embed_dim=256
hidden_dim=512
num_layers=2
num_epochs=10

In [70]:
encoder=Encoder(input_dim,embed_dim,hidden_dim,num_layers,embedding_matrix)
decoder=Decoder(output_dim,embed_dim,hidden_dim,num_layers,embedding_matrix)
model=Seq2Seq(encoder,decoder).to(device)
optimizer=optim.Adam(model.parameters(),lr=0.001)
criterion=nn.CrossEntropyLoss(ignore_index=0)

In [71]:
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()

        output = model(src, trg)
        output_dim = output.shape[-1]

        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        optimizer.step()
        epoch_loss += loss.item()
        avg_loss=epoch_loss/len(train_loader)
    print(f"Epoch {epoch+1} epoch loss: {avg_loss:.4f}")

Epoch 1 epoch loss: 6.0927
Epoch 2 epoch loss: 5.6066
Epoch 3 epoch loss: 5.4055
Epoch 4 epoch loss: 5.1414
Epoch 5 epoch loss: 4.8698
Epoch 6 epoch loss: 4.6085
Epoch 7 epoch loss: 4.3740
Epoch 8 epoch loss: 4.1734
Epoch 9 epoch loss: 3.9913
Epoch 10 epoch loss: 3.8138


## Model Evaulation

In [72]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
model.eval()  
predicted_sentences, reference_sentences = [], []  # Lists to store sentences
bleu_results = []

with torch.no_grad():  
    for src, trg in test_loader:
        src, trg = src.to(device), trg.to(device)
        model_output = model(src, trg, 0) 
        predicted_indices = model_output.argmax(2)  
        for i in range(len(predicted_indices)): 
            predicted_words = [gr_idx2word.get(idx.item(), '<unk>') for idx in predicted_indices[i]]
            reference_words = [gr_idx2word.get(idx.item(), '<unk>') for idx in trg[i]]
            predicted_words = [word for word in predicted_words if word not in ("<pad>", "<sos>", "<eos>")]
            reference_words = [word for word in reference_words if word not in ("<pad>", "<sos>", "<eos>")]
            predicted_sentences.append(predicted_words)
            reference_sentences.append([reference_words])  
            bleu_results.append(sentence_bleu([reference_words], predicted_words))
avg_bleu_score = sum(bleu_results) / len(bleu_results)
print(f"The average sentence-level BLEU score is {avg_bleu_score:.4f}")
corpus_bleu_score = corpus_bleu(reference_sentences, predicted_sentences)
print(f"The corpus-level BLEU score is {corpus_bleu_score:.4f}")


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


The average sentence-level BLEU score is 0.0150
The corpus-level BLEU score is 0.0395


### Now let's use the GPT-2 transformer Library

In [73]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [74]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to prepare dataset
def prepare_gpt2_dataset(df, sep_token="<|SEP|>"):
    formatted_data = []
    for _, row in df.iterrows():
        src = row["english"]
        trg = row["german"]
        formatted_data.append(f"{src} {sep_token} {trg}")
    return formatted_data

# Load tokenizer and model
tokenizer_gpt = GPT2Tokenizer.from_pretrained("gpt2")
model_gpt = GPT2LMHeadModel.from_pretrained("gpt2")

# Add special tokens
special_tokens = {"sep_token": "<|SEP|>", "pad_token": "<|PAD|>"}
tokenizer_gpt.add_special_tokens(special_tokens)
tokenizer_gpt.pad_token = "<|PAD|>"
model_gpt.resize_token_embeddings(len(tokenizer_gpt))  # Update model embeddings
model_gpt.config.pad_token_id = tokenizer_gpt.pad_token_id


# Prepare dataset
gpt2_texts = prepare_gpt2_dataset(df)

In [83]:
# Custom dataset class
class GPT2Dataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        return input_ids, attention_mask

# Create dataset and dataloader
max_length = 128
batch_size = 16
dataset = GPT2Dataset(gpt2_texts, tokenizer_gpt, max_length=max_length)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [76]:
# Move model to device and define optimizer
model_gpt.to(device)
optimizer = AdamW(model_gpt.parameters(), lr=5e-5)
# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model_gpt.train()
    epoch_loss = 0   
    for input_ids, attention_mask in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # Forward pass
        outputs = model_gpt(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        epoch_loss += loss.item()
        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Print epoch loss
    model_gpt.save_pretrained("gpt2_finetuned_checkpoint")
    tokenizer_gpt.save_pretrained("gpt2_finetuned_checkpoint")
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_loader):.4f}")



Epoch 1, Loss: 1.0662
Epoch 2, Loss: 0.6081
Epoch 3, Loss: 0.5361


In [77]:
gpt2texs_test=prepare_gpt2_dataset(df_test)
dataset_test=GPT2Dataset(gpt2texs_test,tokenizer_gpt,max_length=max_length)
test_loader=DataLoader(dataset_test,batch_size=batch_size)


In [78]:
src[src >= tokenizer_gpt.vocab_size] = tokenizer_gpt.pad_token_id
src[src < 0] = tokenizer_gpt.pad_token_id


In [79]:
from rouge_score import rouge_scorer

# Function to generate text using the trained model
def generate_translation(model, tokenizer, src_sentence, max_length=128):
    model.eval()  # Set model to evaluation mode
    inputs = tokenizer.encode(src_sentence, return_tensors="pt", truncation=True, max_length=max_length)
    
    # Create attention mask for padding (1 for non-padding, 0 for padding)
    attention_mask = (inputs != tokenizer.pad_token_id).long()
    
    # Ensure that pad_token_id is properly set during generation
    generated_ids = model.generate(
        input_ids=inputs.to(device),
        attention_mask=attention_mask.to(device),
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id  # Explicitly set the pad_token_id
    )
    
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Function to evaluate ROUGE scores
def evaluate_with_rouge(model, tokenizer, test_df, max_length=128):
    model.eval()
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    
    total_rouge1 = 0
    total_rouge2 = 0
    total_rougeL = 0
    num_samples = len(test_df)
    
    # Generate predictions and compare to target
    with torch.no_grad():
        for i, row in test_df.iterrows():
            src_sentence = row["english"]
            trg_sentence = row["german"]
            
            # Generate predicted translation
            pred_translation = generate_translation(model, tokenizer, src_sentence, max_length)
            
            # Compute ROUGE scores between the predicted and target translation
            scores = scorer.score(trg_sentence, pred_translation)
            
            total_rouge1 += scores["rouge1"].fmeasure
            total_rouge2 += scores["rouge2"].fmeasure
            total_rougeL += scores["rougeL"].fmeasure

    # Calculate average ROUGE scores
    avg_rouge1 = total_rouge1 / num_samples
    avg_rouge2 = total_rouge2 / num_samples
    avg_rougeL = total_rougeL / num_samples
    
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
    
    return avg_rouge1, avg_rouge2, avg_rougeL

# Evaluate using ROUGE on test data
evaluate_with_rouge(model_gpt, tokenizer_gpt, df_test)

Average ROUGE-1: 0.2352
Average ROUGE-2: 0.1188
Average ROUGE-L: 0.1987


(0.23523840144164126, 0.11882599355287717, 0.198748926828044)

In [80]:
def translate_sentence(model, tokenizer, input_sentence, max_length=128, max_new_tokens=50):
    model.eval()
    
    # Tokenize input
    inputs = tokenizer(
        input_sentence, 
        return_tensors="pt", 
        padding="max_length", 
        truncation=True, 
        max_length=max_length
    ).to(device)
    
    # Move tensors to the device
    inputs = {key: value for key, value in inputs.items()}
    model.to(device)
    # Generate output
    outputs = model.generate(
        inputs['input_ids'].to(device), 
        attention_mask=inputs['attention_mask'], 
        max_length=max_length + max_new_tokens,  # Extend max_length
        max_new_tokens=max_new_tokens,          # Limit number of new tokens
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the result
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [81]:
sentence = "Hello my name is Habiba"

In [82]:
sentence_translated=translate_sentence(model_gpt,tokenizer_gpt,sentence)
sentence_translated

Both `max_new_tokens` (=50) and `max_length`(=178) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'Hello my name is Habiba Ein Hände ist Habiba.'