In [1]:
import math
import pickle
import random
import re
from collections import Counter

import kagglehub
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader,random_split

In [None]:
# Fetch the English-French translation dataset through kagglehub and show
# the value it returns (printed below as the path to the dataset files).
path = kagglehub.dataset_download("devicharith/language-translation-englishfrench")
print("Path to dataset files:", path)

In [None]:
# Load the sentence-pair CSV into a DataFrame.
# NOTE(review): this hard-coded Kaggle input path does not use `path` from the
# download cell — confirm both point at the same files.
dataset = pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")

In [None]:
# Display the loaded DataFrame for a quick visual check of its contents.
dataset

In [None]:
class CustomDataset(Dataset):
    """Sentence-pair dataset that tokenizes text and maps tokens to vocabulary indices.

    Column 0 of ``dataset`` is treated as the English source and column 1 as
    the French target. One vocabulary per language is built from every row of
    the frame at construction time; index 0 is ``<pad>`` and index 1 is
    ``<unk>`` in both vocabularies.

    Args:
        dataset: pandas DataFrame whose first two columns hold the sentences.
    """

    def __init__(self, dataset):
        self.data = dataset
        self.eng_wtoi, self.eng_itow = self.build_vocab(self.data.iloc[:, 0])  # English column
        self.fr_wtoi, self.fr_itow = self.build_vocab(self.data.iloc[:, 1])   # French column

    def preprocess_text(self, text):
        """Tokenize one sentence.

        Lower-cases ``text``, surrounds every non-word, non-space character
        with spaces so punctuation becomes its own token, collapses runs of
        whitespace, and wraps the result in ``<sos>`` / ``<eos>``.

        Returns:
            list[str]: the token sequence.
        """
        text = text.lower()
        text = re.sub(r"([^\w\s])", r" \1 ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return ["<sos>"] + text.split() + ["<eos>"]

    def build_vocab(self, series, min_freq=1):
        """Build word<->index mappings from an iterable of sentences.

        Args:
            series: iterable of raw sentence strings.
            min_freq: tokens seen fewer than this many times are left out of
                the vocabulary (and therefore map to ``<unk>`` at lookup
                time). The default of 1 keeps every token.

        Returns:
            tuple[dict, dict]: ``(wtoi, itow)``; ``wtoi`` maps token -> index
            in first-seen order after ``<pad>`` and ``<unk>``, ``itow`` is its
            inverse.
        """
        counter = Counter()
        for text in series:
            counter.update(self.preprocess_text(text))

        # The sentence delimiters must survive any frequency threshold,
        # otherwise decoding could never start or stop.
        always_keep = ("<sos>", "<eos>")
        vocab = ["<pad>", "<unk>"]
        vocab += [
            word
            for word, freq in counter.items()
            if freq >= min_freq or word in always_keep
        ]
        wtoi = {word: idx for idx, word in enumerate(vocab)}
        itow = {idx: word for word, idx in wtoi.items()}
        return wtoi, itow

    def __getitem__(self, index):
        """Return ``(english_indices, french_indices)`` tensors for row ``index``."""
        # Purely positional (row, column) lookup; avoids integer-key access
        # on a label-indexed row Series, which pandas has deprecated.
        eng_sentence = self.preprocess_text(self.data.iloc[index, 0])
        fr_sentence = self.preprocess_text(self.data.iloc[index, 1])

        eng_unk = self.eng_wtoi["<unk>"]
        fr_unk = self.fr_wtoi["<unk>"]
        eng_indices = [self.eng_wtoi.get(token, eng_unk) for token in eng_sentence]
        fr_indices = [self.fr_wtoi.get(token, fr_unk) for token in fr_sentence]

        return torch.tensor(eng_indices), torch.tensor(fr_indices)

    def __len__(self):
        """Number of sentence pairs (rows of the underlying DataFrame)."""
        return len(self.data)


def collate_function(batch, pad_idx_eng, pad_idx_fr):
    """Pad a list of (source, target) index tensors into two batch-first tensors.

    Args:
        batch: sequence of ``(source_tensor, target_tensor)`` pairs.
        pad_idx_eng: fill value used to right-pad the source sequences.
        pad_idx_fr: fill value used to right-pad the target sequences.

    Returns:
        tuple: ``(padded_sources, padded_targets)``, each padded to the longest
        sequence on its own side of the batch.
    """
    source_seqs, target_seqs = zip(*batch)

    def _pad(sequences, fill):
        # batch_first=True puts the batch dimension first in the result.
        return pad_sequence(sequences, batch_first=True, padding_value=fill)

    return _pad(source_seqs, pad_idx_eng), _pad(target_seqs, pad_idx_fr)


In [None]:
# Wrap the DataFrame; constructing CustomDataset builds both vocabularies
# from every row of the frame.
custom_dataset = CustomDataset(dataset)
# Padding indices per language. build_vocab places "<pad>" first, so both are 0.
PAD_IDX_ENG = custom_dataset.eng_wtoi["<pad>"]
PAD_IDX_FR = custom_dataset.fr_wtoi["<pad>"]

In [None]:
# 80/10/10 train/validation/test partition. The generator is seeded so the
# same rows land in the same split on every kernel restart; without it the
# test set changes from run to run and reported metrics are not comparable.
SPLIT_SEED = 42
train_dataset,val_dataset,test_dataset = random_split(
    custom_dataset,
    [0.8,0.1,0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# The three splits must partition the dataset: fail loudly instead of relying
# on someone comparing the two printed numbers by eye.
split_total = len(train_dataset)+len(val_dataset)+len(test_dataset)
assert split_total == len(dataset), "train/val/test sizes do not add up to the dataset size"
print(split_total,len(dataset))

In [None]:
batch_size = 128


def collate_batch(batch):
    """Pad one batch using the English/French ``<pad>`` indices of this notebook."""
    return collate_function(batch, PAD_IDX_ENG, PAD_IDX_FR)


# Only the training loader shuffles: without shuffle=True every epoch would
# visit identical batches in identical order. Validation and test keep a fixed
# order so their metrics are comparable between epochs and runs.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_batch)

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that reduces an already-embedded batch to its final states.

    Args:
        emb_dim: size of each input embedding vector.
        hidden_size: LSTM hidden/cell state size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,emb_dim,hidden_size,num_layers):
        super().__init__()
        lstm_options = {
            "input_size": emb_dim,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "bias": True,
            "batch_first": True,
        }
        self.lstm = nn.LSTM(**lstm_options)
        # Prefer the GPU when one is visible; the module moves itself there.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.to(self.device)

    def forward(self,batch):
        """Return the final ``(hidden, cell)`` states; per-step outputs are discarded."""
        _step_outputs, final_states = self.lstm(batch.to(self.device))
        hidden_state, cell_state = final_states
        return hidden_state, cell_state

class Decoder(nn.Module):
    """LSTM decoder followed by a linear projection onto the output vocabulary.

    Args:
        emb_size: size of each input embedding vector.
        hidden_size: LSTM hidden/cell state size.
        output_size: number of output classes produced by the projection.
        num_layers: number of stacked LSTM layers.
        batch_size: accepted for interface compatibility; not used.
    """

    def __init__(self,emb_size,hidden_size,output_size,num_layers,batch_size):
        super().__init__()
        lstm_options = {
            "input_size": emb_size,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "bias": True,
            "batch_first": True,
        }
        self.lstm = nn.LSTM(**lstm_options)
        self.fc = nn.Linear(hidden_size, output_size)
        # Prefer the GPU when one is visible; the module moves itself there.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.to(self.device)

    def forward(self, input_seq, hidden, cell):
        """Advance the LSTM from the given states and project its outputs.

        Returns:
            tuple: ``(output, hidden, cell)`` where ``output`` is the linear
            projection of the LSTM outputs and the other two are the updated
            states.
        """
        previous_states = (hidden.to(self.device), cell.to(self.device))
        lstm_out, next_states = self.lstm(input_seq.to(self.device), previous_states)
        next_hidden, next_cell = next_states
        return self.fc(lstm_out), next_hidden, next_cell

class seq2seq(nn.Module):
    """Encoder-decoder translation model bundled with its embeddings, optimizer and training loop.

    Args:
        en_vocab_size: number of rows in the source (English) embedding table.
        fr_vocab_size: number of rows in the target (French) embedding table;
            also the number of classes the decoder is expected to emit.
        emb_dim: embedding size shared by both tables.
        encoder: module mapping embedded sources to ``(hidden, cell)``.
        decoder: module mapping ``(embedded_input, hidden, cell)`` to
            ``(logits, hidden, cell)``.
        learning_rate: Adam learning rate.
        num_epochs: maximum number of epochs run by :meth:`fit`.
        TRG_PAD_IDX: target index ignored by the cross-entropy loss.

    After :meth:`fit`, ``self.history`` holds the per-epoch mean losses under
    the keys ``"train_loss"`` and ``"val_loss"``.
    """

    def __init__(self,en_vocab_size,fr_vocab_size,emb_dim,encoder,decoder,learning_rate,num_epochs,TRG_PAD_IDX):
        super().__init__()
        self.en_vocab_size = en_vocab_size
        self.fr_vocab_size = fr_vocab_size
        self.en_embedding = nn.Embedding(en_vocab_size,emb_dim)
        self.fr_embedding = nn.Embedding(fr_vocab_size,emb_dim)
        self.encoder = encoder
        self.decoder = decoder
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Created after the sub-modules are registered so that the encoder and
        # decoder parameters are optimized together with the embeddings.
        self.optimizer = torch.optim.Adam(self.parameters(),lr=learning_rate)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
        self.num_epochs = num_epochs
        self.teacher_forcing_ratio = 0.75
        self.history = {"train_loss": [], "val_loss": []}
        self.to(self.device)

    @staticmethod
    def _perplexity(mean_loss):
        """exp(mean cross-entropy), saturating to infinity instead of raising OverflowError."""
        return math.exp(mean_loss) if mean_loss < 700 else float("inf")

    def _decode_logits(self, sources, targets, teacher_forcing_ratio):
        """Encode ``sources`` and step the decoder once per target position after the first.

        At each step the next decoder input is the gold target embedding with
        probability ``teacher_forcing_ratio``, otherwise the embedding of the
        decoder's own argmax prediction. Returns the per-step logits
        concatenated along dim 1 (assumes the decoder emits one time step per
        call).
        """
        source_batch = self.en_embedding(sources)
        target_batch = self.fr_embedding(targets)
        hn, cn = self.encoder(source_batch)

        inputs = target_batch[:, 0:1, :]  # embedding of the first target token, kept 3-D
        step_logits = []
        for i in range(1, targets.shape[1]):
            logits, hn, cn = self.decoder(inputs, hn, cn)
            step_logits.append(logits)
            # Skip the RNG draw entirely under full teacher forcing so that
            # validation does not perturb the training random stream.
            use_teacher = teacher_forcing_ratio >= 1.0 or random.random() < teacher_forcing_ratio
            if use_teacher:
                inputs = target_batch[:, i:i + 1, :]
            else:
                inputs = self.fr_embedding(logits.argmax(dim=-1))
        return torch.cat(step_logits, dim=1)

    def _batch_loss(self, sources, targets, teacher_forcing_ratio):
        """Cross-entropy between the decoder logits and the targets shifted left by one."""
        sources, targets = sources.to(self.device), targets.to(self.device)
        logits = self._decode_logits(sources, targets, teacher_forcing_ratio)
        predictions = logits.reshape(-1, self.fr_vocab_size)
        expected = targets[:, 1:].reshape(-1)
        return self.loss_fn(predictions, expected)

    def _epoch_loss(self, data, train):
        """Run one pass over ``data`` and return the mean per-batch loss.

        With ``train=True`` the model is put in training mode and an optimizer
        step is taken per batch; otherwise it is put in eval mode, gradients
        are disabled and the gold target is always fed to the decoder.
        """
        self.train(train)
        ratio = self.teacher_forcing_ratio if train else 1.0
        total_loss, num_batches = 0.0, 0
        with torch.set_grad_enabled(train):
            for sources, targets in data:
                if train:
                    self.optimizer.zero_grad()
                loss = self._batch_loss(sources, targets, ratio)
                if train:
                    loss.backward()
                    self.optimizer.step()
                total_loss += loss.item()
                num_batches += 1
        return total_loss / max(num_batches, 1)

    def fit(self, train_data, val_data=None, save_dir="/kaggle/working"):
        """Train for up to ``num_epochs`` epochs with optional validation and early stopping.

        Args:
            train_data: iterable of ``(sources, targets)`` index-tensor batches.
            val_data: optional iterable of validation batches. When given,
                training stops once the mean validation loss has risen for two
                consecutive epochs.
            save_dir: directory receiving ``epoch_<n>.pth`` state-dict
                checkpoints after each completed epoch; ``None`` disables
                checkpointing.

        Reported losses are means over batches and perplexity is ``exp`` of
        that mean.
        """
        self.history = {"train_loss": [], "val_loss": []}
        has_val = val_data is not None
        # Early-stopping state must outlive a single epoch; resetting it inside
        # the loop would make the stop condition unreachable.
        prev_val = float("inf")
        val_inc_count = 0

        for epoch in range(self.num_epochs):
            train_loss = self._epoch_loss(train_data, train=True)
            train_ppl = self._perplexity(train_loss)
            self.history["train_loss"].append(train_loss)

            if has_val:
                val_loss = self._epoch_loss(val_data, train=False)
                val_ppl = self._perplexity(val_loss)
                self.history["val_loss"].append(val_loss)
                print(f"Epoch {epoch+1} : Train Loss - {train_loss:.4f} | Val Loss - {val_loss:.4f} | Train PPL - {train_ppl:.4f} | Val PPL - {val_ppl:.4f}")

                if val_loss > prev_val:
                    val_inc_count += 1
                    if val_inc_count >= 2:
                        # Stop before checkpointing a model that got worse twice in a row.
                        break
                else:
                    val_inc_count = 0
                prev_val = val_loss
            else:
                print(f"Epoch {epoch+1} : Train Loss - {train_loss:.4f} | Train PPL - {train_ppl:.4f}")

            if save_dir is not None:
                model_save_filename = f"{save_dir}/epoch_{epoch + 1}.pth"
                torch.save(self.state_dict(), model_save_filename)
                print("Model Saved")


In [None]:
# Hyperparameters for the encoder-decoder model.
# NOTE(review): `emd_dim` is presumably a typo for `emb_dim`; the name is left
# as-is because it is referenced below in this cell.
emd_dim=512
hidden_size = 1000
num_layers = 2
# Vocabulary sizes come from the dictionaries built by CustomDataset.
fr_vocab_size = len(custom_dataset.fr_wtoi)
en_vocab_size = len(custom_dataset.eng_wtoi)
lr = 1e-3
num_epochs = 10

# The encoder and decoder share embedding size, hidden size and depth; the
# decoder's output size is the French vocabulary size.
encoder = Encoder(emb_dim=emd_dim, hidden_size=hidden_size, num_layers=num_layers)
decoder = Decoder(emb_size=emd_dim, hidden_size=hidden_size, output_size=fr_vocab_size, num_layers=num_layers, batch_size=batch_size)
# seq2seq owns the embedding tables, the optimizer and the loss; French padding
# positions are excluded from the loss through TRG_PAD_IDX.
model = seq2seq(    
    en_vocab_size=en_vocab_size,
    fr_vocab_size=fr_vocab_size,
    emb_dim=emd_dim,
    encoder=encoder,
    decoder=decoder,
    learning_rate=lr,
    num_epochs=num_epochs,
    TRG_PAD_IDX=PAD_IDX_FR)

In [3]:
# Train with validation monitoring; fit() prints per-epoch metrics and saves
# state-dict checkpoints under /kaggle/working.
model.fit(train_dataloader,val_dataloader)

Epoch 1 : Train Loss - 3.8774 | Val Loss - 2.6443 | Train PPL - 48.2985 | Val PPL - 14.0736
Model Saved
Epoch 2 : Train Loss - 2.5138 | Val Loss - 2.0460 | Train PPL - 12.3518 | Val PPL - 7.7369
Model Saved
Epoch 3 : Train Loss - 1.8909 | Val Loss - 1.6933 | Train PPL - 6.6253 | Val PPL - 5.4374
Model Saved
Epoch 4 : Train Loss - 1.4645 | Val Loss - 1.5378 | Train PPL - 4.3254 | Val PPL - 4.6543
Model Saved
Epoch 5 : Train Loss - 1.1496 | Val Loss - 1.4609 | Train PPL - 3.1569 | Val PPL - 4.3098
Model Saved
Epoch 6 : Train Loss - 0.9311 | Val Loss - 1.4447 | Train PPL - 2.5373 | Val PPL - 4.2406
Model Saved
Epoch 7 : Train Loss - 0.7584 | Val Loss - 1.4396 | Train PPL - 2.1349 | Val PPL - 4.2190
Model Saved
Epoch 8 : Train Loss - 0.6270 | Val Loss - 1.4443 | Train PPL - 1.8720 | Val PPL - 4.2389
Model Saved
Epoch 9 : Train Loss - 0.5968 | Val Loss - 1.4548 | Train PPL - 1.8163 | Val PPL - 4.2836
Model Saved


In [None]:
def sample_with_temperature(logits, temperature=1.0):
    """Sample one class index from a 1-D tensor of unnormalized logits.

    Args:
        logits: 1-D tensor of scores, one per class.
        temperature: softmax temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it. ``0`` selects the
            highest-scoring class deterministically (greedy decoding) instead
            of dividing by zero.

    Returns:
        int: the sampled class index.

    Raises:
        ValueError: if ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    if temperature == 0:
        # Limit of the tempered softmax: all mass on the arg-max class.
        return torch.argmax(logits, dim=-1).item()
    probabilities = torch.nn.functional.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probabilities, 1).item()


def generate(custom_dataset, encoder, decoder, model, sentence, max_gen_len=50, temperature=0.7):
    """Translate one English sentence by sampling French tokens one step at a time.

    Args:
        custom_dataset: object providing ``preprocess_text``, ``eng_wtoi``,
            ``fr_wtoi`` and ``fr_itow``.
        encoder: module mapping embedded sources to ``(hidden, cell)``.
        decoder: module mapping ``(embedded_input, hidden, cell)`` to
            ``(logits, hidden, cell)``.
        model: object providing ``en_embedding``, ``fr_embedding``, ``device``
            and ``eval()``.
        sentence: raw English sentence.
        max_gen_len: maximum number of tokens to emit.
        temperature: sampling temperature forwarded to
            ``sample_with_temperature``.

    Returns:
        tuple[str, int]: the generated tokens joined by spaces (without
        ``<eos>``) and the number of tokens generated.
    """
    model.eval()

    tokens = custom_dataset.preprocess_text(sentence)
    unk_index = custom_dataset.eng_wtoi["<unk>"]
    eng_indices = [custom_dataset.eng_wtoi.get(token, unk_index) for token in tokens]

    words = []
    # Pure inference: no autograd graph is needed for any of the steps.
    with torch.no_grad():
        source_tensor = torch.tensor(eng_indices).unsqueeze(0).to(model.device)  # (1, seq_len)
        hn, cn = encoder(model.en_embedding(source_tensor))

        fr_sos_index = custom_dataset.fr_wtoi.get("<sos>")
        prev_embedding = model.fr_embedding(torch.tensor([[fr_sos_index]]).to(model.device))

        for _ in range(max_gen_len):
            logits, hn, cn = decoder(prev_embedding, hn, cn)
            # Sample from the full score vector of the newest step. Taking the
            # argmax first would hand the sampler a single-element tensor, for
            # which it can only ever return index 0.
            next_index = sample_with_temperature(logits[0, -1], temperature=temperature)
            word = custom_dataset.fr_itow.get(next_index, "<unk>")
            if word == "<eos>":
                break
            words.append(word)
            prev_embedding = model.fr_embedding(torch.tensor([[next_index]]).to(model.device))

    return " ".join(words), len(words)

In [None]:
# Smoke-test generation on a single English sentence.
generate(custom_dataset, encoder, decoder, model, "i did see something.")

# Using the loaded model

In [None]:
def inference(model,encoder,decoder,dataloader, vocab_dataset=None, teacher_forcing=True):
    """Decode every batch of ``dataloader`` and collect token-level results.

    Args:
        model: object providing ``en_embedding``, ``fr_embedding``, ``device``
            and ``eval()``.
        encoder: module mapping embedded sources to ``(hidden, cell)``.
        decoder: module mapping ``(embedded_input, hidden, cell)`` to
            ``(logits, hidden, cell)``.
        dataloader: iterable of ``(sources, targets)`` index-tensor batches.
        vocab_dataset: object providing ``eng_itow`` and ``fr_itow``. Defaults
            to the notebook-level ``custom_dataset``.
        teacher_forcing: when True (the default) the gold target token is fed
            to the decoder at every step, so the output is NOT a free-running
            translation and scores computed from it are optimistic. Pass False
            to feed back the decoder's own arg-max prediction instead.

    Returns:
        dict: keys ``"english_sentence"``, ``"original_translation"`` and
        ``"produced_translation"``; each value is a list with one token list
        per sentence, so all three lists have the same length.
    """
    if vocab_dataset is None:
        vocab_dataset = custom_dataset  # fall back to the notebook global
    eng_itow = vocab_dataset.eng_itow
    fr_itow = vocab_dataset.fr_itow

    results={"english_sentence":[],
             "original_translation":[],
             "produced_translation":[]}

    model.eval()
    with torch.no_grad():  # evaluation only: do not build autograd graphs
        for sources,targets in dataloader:
            results["english_sentence"].extend(
                [[eng_itow.get(index) for index in sentence] for sentence in sources.tolist()]
            )
            results["original_translation"].extend(
                [[fr_itow.get(index) for index in sentence] for sentence in targets.tolist()]
            )

            sources = sources.to(model.device)
            targets = targets.to(model.device)
            source_embds = model.en_embedding(sources)
            target_embds = model.fr_embedding(targets)

            hn,cn = encoder(source_embds)
            inputs = target_embds[:, 0:1, :]  # embedding of the first target token
            step_predictions = []
            for i in range(1, targets.shape[1]):
                logits, hn, cn = decoder(inputs, hn, cn)
                step_tokens = logits.argmax(dim=-1)  # one predicted index per sentence
                step_predictions.append(step_tokens)
                if teacher_forcing:
                    inputs = target_embds[:, i:i + 1, :]
                else:
                    inputs = model.fr_embedding(step_tokens)

            if step_predictions:
                predicted_rows = torch.cat(step_predictions, dim=1).tolist()
            else:
                predicted_rows = [[] for _ in range(targets.shape[0])]

            # .tolist() yields plain ints: looking a tensor up in the dict
            # would never match a key. extend (not append) keeps one entry
            # per sentence, matching the other two result lists.
            results["produced_translation"].extend(
                [[fr_itow.get(index) for index in sentence] for sentence in predicted_rows]
            )

    return results

In [None]:
# Load previously saved results from a separate Kaggle input dataset.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
# NOTE(review): presumably this is the dict returned by inference() in an earlier
# run; nothing in this notebook writes the file — confirm its provenance.
with open("/kaggle/input/seq2seq-results/my_dict.pkl", "rb") as f:
    my_dict = pickle.load(f)

In [None]:
def clean_and_join_sentences(result_dict):
    """Drop special tokens from every token list and join the rest into a string.

    Args:
        result_dict: mapping from a key to a list of token sequences.

    Returns:
        dict: same keys, each mapped to a list of space-joined sentences with
        ``<sos>``, ``<eos>``, ``<pad>`` and ``<unk>`` removed.
    """
    special_tokens = {"<sos>", "<eos>", "<pad>", "<unk>"}

    def _to_text(token_seq):
        # Keep only real words, preserving their order.
        return " ".join([tok for tok in token_seq if tok not in special_tokens])

    return {
        column: [_to_text(token_seq) for token_seq in rows]
        for column, rows in result_dict.items()
    }

In [None]:
# Strip special tokens and turn each token list into a plain sentence string.
# NOTE: rebinding `my_dict` makes this cell non-idempotent — running it a second
# time would iterate over the already-joined strings character by character.
my_dict = clean_and_join_sentences(my_dict)

In [None]:
# Tabulate the cleaned results (one column per key of my_dict) and display them.
dataframe = pd.DataFrame(my_dict)
dataframe

In [4]:
def compute_bleu(references, hypotheses):
    """Corpus-level BLEU between whitespace-tokenized references and hypotheses.

    Args:
        references: iterable of reference sentences (strings), one per hypothesis.
        hypotheses: iterable of produced sentences (strings).

    Returns:
        The value returned by ``corpus_bleu`` with ``SmoothingFunction().method4``
        as the smoothing function.
    """
    # corpus_bleu takes a list of references per hypothesis; every sentence
    # here has exactly one, hence the extra list level.
    reference_sets = []
    for reference in references:
        reference_sets.append([reference.split()])

    candidate_tokens = []
    for candidate in hypotheses:
        candidate_tokens.append(candidate.split())

    return corpus_bleu(
        reference_sets,
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Score the produced translations against the reference translations stored in my_dict.
# NOTE(review): if my_dict came from inference() as written in this notebook, the
# decoder was fed gold target tokens at every step, which would make this score
# optimistic — confirm how the pickle was produced.
bleu_score = compute_bleu(
    my_dict["original_translation"],
    my_dict["produced_translation"]
)

print(f"\nTest Corpus BLEU score: {bleu_score:.4f}")


Test Corpus BLEU score: 0.3466
