# Machine Translation for English to French using LSTM based Encoder-Decoder with Attention

## Import Required Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import pandas as pd
import spacy
from collections import Counter
from sklearn.model_selection import train_test_split

In [None]:
# !python -m spacy download en_core_web_sm 
# !python -m spacy download fr_core_news_sm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Loading The Data

In [3]:
data_path = r"E:\3_MY_Current_Work\1_Krish_Naik\3_NLP_GEN_AI\0.My__Practice\5.Encoder_Decoder_With_Attention\english_french_train.csv"
data = pd.read_csv(data_path)

In [4]:
data.head()

Unnamed: 0,english,french
0,My name is John,Je m'appelle John
1,What book are you reading?,Quel livre lis-tu ?
2,I have a dog,J’ai un chien
3,This is my mother,C’est ma mère
4,It’s big,C’est grand


In [5]:
data.columns

Index(['english', 'french'], dtype='object')

## Preprocessing of the data

### Tokonizing the Data

In [6]:
eng_tokenizer = spacy.load("en_core_web_sm")
fre_tokenizer = spacy.load("fr_core_news_sm")

In [7]:
def tokenize(text, tokenizer):
    return [tok.text.lower() for tok in tokenizer(text)]

### Making Vocabulary

In [9]:
def build_vocab(sentences, tokenizer, min_freq=1):
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenize(sentence, tokenizer))

    vocab = {'<sos>': 0, '<eos>': 1, '<pad>': 2, '<unk>': 3}
    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab

### Encoding the Data

In [10]:
def encode(text, vocab, tokenizer):
    tokens = tokenize(text, tokenizer)
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

### Preparing the Dataset

In [11]:
class TranslationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src_text, tgt_text = self.pairs[idx]
        src_ids = [self.src_vocab['<sos>']] + encode(src_text, self.src_vocab, eng_tokenizer) + [self.src_vocab['<eos>']]
        tgt_ids = [self.tgt_vocab['<sos>']] + encode(tgt_text, self.tgt_vocab, fre_tokenizer) + [self.tgt_vocab['<eos>']]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

In [12]:
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=2)
    tgt_batch = pad_sequence(tgt_batch, padding_value=2)
    return src_batch, tgt_batch

In [13]:
pairs = list(zip(data['english'], data['french']))
train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)


In [14]:
src_vocab = build_vocab([src for src, _ in train_pairs], eng_tokenizer)
tgt_vocab = build_vocab([tgt for _, tgt in train_pairs], fre_tokenizer)
print("English Vocabulary Size:", len(src_vocab))
print("French Vocabulary Size:", len(tgt_vocab))

English Vocabulary Size: 174
French Vocabulary Size: 199


In [15]:
train_dataset = TranslationDataset(train_pairs, src_vocab, tgt_vocab)
test_dataset = TranslationDataset(test_pairs, src_vocab, tgt_vocab)

In [16]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

## Bulding the Network

### The Encoder Part

In [17]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers=1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers)
        self.hidden_dim = hidden_dim

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.rnn(embedded)
        return outputs, hidden, cell


### Attention part

In [18]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(enc_hidden_dim + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[0]
        hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return torch.softmax(attention, dim=1)


### Decoder part

In [19]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, attention):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim + hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.embedding(input)
        attn_weights = self.attention(hidden, encoder_outputs)
        attn_weights = attn_weights.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(attn_weights, encoder_outputs).permute(1, 0, 2)
        rnn_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        prediction = self.fc_out(torch.cat((output.squeeze(0), context.squeeze(0)), dim=1))
        return prediction, hidden, cell, attn_weights


### Seq2Seq Part

In [20]:
import random

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.src_pad_idx = src_pad_idx

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.embedding.num_embeddings

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        input = trg[0, :]  # <sos>
        for t in range(1, trg_len):
            output, hidden, cell, _ = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1
        return outputs


### Training

In [22]:
def train_model(model, dataloader, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    for src, trg in dataloader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

## Inference

In [23]:
def translate_sentence(model, sentence, src_vocab, trg_vocab, max_len=50):
    model.eval()
    tokens = ['<sos>'] + tokenize(sentence, eng_tokenizer) + ['<eos>']
    src_ids = [src_vocab.get(tok, src_vocab['<unk>']) for tok in tokens]
    src_tensor = torch.tensor(src_ids).unsqueeze(1).to(device)
    encoder_outputs, hidden, cell = model.encoder(src_tensor)

    trg_indexes = [trg_vocab['<sos>']]
    attentions = []
    for _ in range(max_len):
        trg_tensor = torch.tensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden, cell, attn = model.decoder(trg_tensor, hidden, cell, encoder_outputs)
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == trg_vocab['<eos>']:
            break
    trg_vocab_inv = {i: w for w, i in trg_vocab.items()}
    return [trg_vocab_inv[idx] for idx in trg_indexes[1:]]



In [24]:
import nltk

In [None]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naeem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

### Evaluation using Bleu Score

In [41]:
def calculate_bleu(pairs, model, src_vocab, tgt_vocab, max_len=50):
    targets = []
    predictions = []

    for src_text, tgt_text in pairs:
        pred_tokens = translate_sentence(model, src_text, src_vocab, tgt_vocab, max_len)
        tgt_tokens = tokenize(tgt_text, fre_tokenizer)

        predictions.append(pred_tokens)       
        targets.append([tgt_tokens])         

    smoothie = SmoothingFunction().method4

    bleu = corpus_bleu(targets, predictions, smoothing_function=smoothie)
    return bleu


### Setting the parametrs

In [29]:
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(tgt_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512

In [30]:
attn = Attention(HIDDEN_DIM, HIDDEN_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM, attn)
model = Seq2Seq(enc, dec, src_pad_idx=src_vocab['<pad>'], device=device).to(device)

In [31]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=src_vocab['<pad>'])

In [32]:
# Train for 5 epochs
for epoch in range(1, 6):
    loss = train_model(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch}: Loss = {loss:.4f}")


Epoch 1: Loss = 4.6257
Epoch 2: Loss = 3.6873
Epoch 3: Loss = 3.2235
Epoch 4: Loss = 2.7298
Epoch 5: Loss = 2.2759


In [42]:
# BLEU Evaluation
test_data = list(zip(data['english'][:100], data['french'][:100]))  # use small sample
bleu = calculate_bleu(test_data, model, src_vocab, tgt_vocab)
print(f"BLEU Score: {bleu*100:.2f}")

BLEU Score: 5.44


In [43]:
# Inference Example
print("Sample Inference:")
sample_text = "Where is the train station?"
translated = translate_sentence(model, sample_text, src_vocab, tgt_vocab)
print(f"Input: {sample_text}")
print(f"Output: {' '.join(translated)}")

Sample Inference:
Input: Where is the train station?
Output: où est l’ ? ? <eos>
