# English to French Translator using Encoder-Decoder Model

## Import Required Libraries

In [67]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

import spacy
import numpy
import random

from collections import Counter

In [None]:
# To download the spaCy tokenizer pipelines, uncomment the following lines
# !python -m spacy download en_core_web_sm
# !python -m spacy download fr_core_news_sm

In [68]:
# Report the installed library versions so the run environment is visible in the output.
print(torch.__version__)      # the recorded run of this notebook printed 2.2.2 (not 2.6.0)
print(torchtext.__version__) 

2.2.2
0.17.2


## Setting Up the Seed and Device

In [69]:
# Seed every random number generator the notebook can draw from, so that a fresh
# "Restart Kernel -> Run All" reproduces the same training run.
SEED = 42
random.seed(SEED)        # Seq2seq.forward picks teacher forcing with random.random()
numpy.random.seed(SEED)  # numpy is imported above; seeded for completeness
torch.manual_seed(SEED)  # weight initialisation of the embeddings / LSTMs / linear layer

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset

In [70]:
# Toy parallel corpus. The two tuples below are aligned position by position:
# the n-th English sentence is translated by the n-th French sentence.
data = list(zip(
    ("I love coding.", "The cat is on the mat.", "She reads a book."),
    ("J'aime coder.", "Le chat est sur le tapis.", "Elle lit un livre."),
))

print(data)

[('I love coding.', "J'aime coder."), ('The cat is on the mat.', 'Le chat est sur le tapis.'), ('She reads a book.', 'Elle lit un livre.')]


## Preprocessing the Data

### Tokenizers

In [71]:
# One spaCy-backed tokenizer per language, selected by spaCy pipeline name
# (English first, French second).
en_tokenizer, fr_tokenizer = (
    get_tokenizer("spacy", language=pipeline_name)
    for pipeline_name in ("en_core_web_sm", "fr_core_news_sm")
)

In [72]:
# Show what get_tokenizer handed back for each language (one object per line).
print(en_tokenizer, fr_tokenizer, sep="\n")

functools.partial(<function _spacy_tokenize at 0x000001FA4989C940>, spacy=<spacy.lang.en.English object at 0x000001FA094AA140>)
functools.partial(<function _spacy_tokenize at 0x000001FA4989C940>, spacy=<spacy.lang.fr.French object at 0x000001FA094A9EA0>)


### Making Vocabulary

In [73]:
# Build a vocabulary from raw sentences
def build_vocab(sentences, tokenizer):
    """Create a torchtext vocabulary covering every token in `sentences`.

    Args:
        sentences: Iterable of raw sentence strings.
        tokenizer: Callable that turns one sentence into a list of tokens.

    Returns:
        The vocabulary object produced by torchtext's `vocab` factory. It is
        built with the specials "<unk>", "<pad>", "<start>", "<end>", and any
        token that is not in the vocabulary resolves to the index of "<unk>".
    """
    # Tokens are counted in the order they are first encountered.
    token_counts = Counter(
        token for sentence in sentences for token in tokenizer(sentence)
    )

    vocabulary = vocab(token_counts, specials=["<unk>", "<pad>", "<start>", "<end>"])
    # Out-of-vocabulary lookups fall back to "<unk>" instead of raising.
    vocabulary.set_default_index(vocabulary["<unk>"])
    return vocabulary

### Separating English and French Sentences

In [74]:
# Transpose the (English, French) pairs into two parallel tuples, one per language.
eng_sentances, fre_sentances = (tuple(column) for column in zip(*data))

print("ENglisj Sentances are : ", eng_sentances)
print("French Sentances are : ", fre_sentances)

ENglisj Sentances are :  ('I love coding.', 'The cat is on the mat.', 'She reads a book.')
French Sentances are :  ("J'aime coder.", 'Le chat est sur le tapis.', 'Elle lit un livre.')


### Making English and French Vocabulary 

In [75]:
# Build one vocabulary per language from its own sentences and tokenizer ...
eng_vocab = build_vocab(eng_sentances, en_tokenizer)
fre_vocab = build_vocab(fre_sentances, fr_tokenizer)

# ... then show (at most) the first 50 index-to-token entries of each.
print("English Vocab : ", eng_vocab.get_itos()[:50])
print("French Vocab : ", fre_vocab.get_itos()[:50])

English Vocab :  ['<unk>', '<pad>', '<start>', '<end>', 'I', 'love', 'coding', '.', 'The', 'cat', 'is', 'on', 'the', 'mat', 'She', 'reads', 'a', 'book']
French Vocab :  ['<unk>', '<pad>', '<start>', '<end>', "J'", 'aime', 'coder', '.', 'Le', 'chat', 'est', 'sur', 'le', 'tapis', 'Elle', 'lit', 'un', 'livre']


In [30]:
# unk_idx = eng_vocab["<unk>"]
# print(unk_idx)
# tokens = en_tokenizer("some unknown sentence")
# print(tokens)
# indices = [eng_vocab[token] if token in eng_vocab.get_stoi() else unk_idx for token in tokens]
# print(indices)

### Converting To Indices

In [76]:
def convert_to_indices(sentance, tokenizer, vocab):
    """Encode one sentence as vocabulary indices wrapped in start/end markers.

    Args:
        sentance: Raw sentence string.
        tokenizer: Callable that splits the sentence into tokens.
        vocab: Mapping-like object supporting `vocab[token] -> index`; it must
            contain the "<start>" and "<end>" entries.

    Returns:
        A list: the "<start>" index, one index per token, then the "<end>" index.
    """
    tokens = tokenizer(sentance)
    indices = [vocab["<start>"]]
    indices.extend(vocab[token] for token in tokens)
    indices.append(vocab["<end>"])
    return indices

In [77]:
# Encode every sentence of each language with that language's tokenizer and vocabulary.
eng_indices = [
    convert_to_indices(english_text, en_tokenizer, eng_vocab)
    for english_text in eng_sentances
]
fre_indices = [
    convert_to_indices(french_text, fr_tokenizer, fre_vocab)
    for french_text in fre_sentances
]

print(eng_indices)
print(fre_indices)

[[2, 4, 5, 6, 7, 3], [2, 8, 9, 10, 11, 12, 13, 7, 3], [2, 14, 15, 16, 17, 7, 3]]
[[2, 4, 5, 6, 7, 3], [2, 8, 9, 10, 11, 12, 13, 7, 3], [2, 14, 15, 16, 17, 7, 3]]


### Padding for Equal length Sequences

In [78]:
def padding(sequences, max_length=None, pad_value=0):
    """Right-pad index sequences so they all reach the same length.

    Args:
        sequences: Iterable of index sequences (lists or tuples of ints).
        max_length: Target length. When None, the length of the longest
            sequence is used (0 if there are no sequences at all).
        pad_value: Index appended to short sequences. Defaults to 0 to stay
            compatible with existing callers. Note that in the vocabularies
            built by this notebook index 0 is "<unk>" and "<pad>" is index 1;
            pass `vocab["<pad>"]` to pad with the real padding token, and keep
            the loss function's `ignore_index` in sync with whatever is used.

    Returns:
        A `(padded, max_length)` tuple. `padded` is a list of new lists (the
        inputs are never modified). Sequences longer than `max_length` are
        returned at their full length; nothing is truncated.
    """
    sequences = list(sequences)  # allow generators; we iterate twice
    if max_length is None:
        # default=0 avoids the ValueError max() raises on an empty batch.
        max_length = max((len(seq) for seq in sequences), default=0)
    # A negative repeat count yields [], so over-long sequences pass through unchanged.
    padded = [list(seq) + [pad_value] * (max_length - len(seq)) for seq in sequences]
    return padded, max_length

In [79]:
# Pad each language's batch to the length of its own longest sentence.
eng_sequences, max_len_eng = padding(eng_indices)
fre_sequences, max_len_fre = padding(fre_indices)

print("eng_sequences:", eng_sequences)
print("fre_sequences:", fre_sequences)

eng_sequences: [[2, 4, 5, 6, 7, 3, 0, 0, 0], [2, 8, 9, 10, 11, 12, 13, 7, 3], [2, 14, 15, 16, 17, 7, 3, 0, 0]]
fre_sequences: [[2, 4, 5, 6, 7, 3, 0, 0, 0], [2, 8, 9, 10, 11, 12, 13, 7, 3], [2, 14, 15, 16, 17, 7, 3, 0, 0]]


### Converting these sequences to Tensors

In [80]:
# Turn the padded index lists into integer tensors created directly on `device`.
eng_data = torch.tensor(eng_sequences, dtype=torch.long, device=device)
print(eng_data)
print("shape of the English", eng_data.shape)

fre_data = torch.tensor(fre_sequences, dtype=torch.long, device=device)
print(fre_data)
print(" Shape of the french : ", fre_data.shape)

tensor([[ 2,  4,  5,  6,  7,  3,  0,  0,  0],
        [ 2,  8,  9, 10, 11, 12, 13,  7,  3],
        [ 2, 14, 15, 16, 17,  7,  3,  0,  0]])
shape of the English torch.Size([3, 9])
tensor([[ 2,  4,  5,  6,  7,  3,  0,  0,  0],
        [ 2,  8,  9, 10, 11, 12, 13,  7,  3],
        [ 2, 14, 15, 16, 17,  7,  3,  0,  0]])
 Shape of the french :  torch.Size([3, 9])


## Building the Model

### Encoder Part

In [89]:
class Encoder(nn.Module):
    """Embeds source token indices and summarises them with a stacked LSTM.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each token embedding.
        no_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, embedding_dim, no_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=input_dim, embedding_dim=embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=no_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the LSTM's final `(hidden, cell)` state for the index batch `x`."""
        embedded = self.embedding(x)
        # The per-step outputs are not needed; only the final state is passed on.
        _step_outputs, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell

### Decoder Part

In [90]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: Target vocabulary size (embedding rows and logit width).
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each token embedding.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, output_dim, hidden_dim, embedding_dim, num_layers):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(
            num_embeddings=output_dim, embedding_dim=embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        `x` holds one token index per batch element; a length-1 time axis is
        inserted before the LSTM and removed again before the linear layer.
        Returns `(prediction, hidden, cell)` where `prediction` is the output
        of the final linear layer and `hidden`/`cell` are the updated state.
        """
        one_step = x.unsqueeze(1)
        embedded = self.embedding(one_step)
        lstm_out, next_state = self.lstm(embedded, (hidden, cell))
        hidden, cell = next_state
        prediction = self.fc(lstm_out.squeeze(1))
        return prediction, hidden, cell

### Seq_to_Seq Model

In [91]:
class Seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module called as `encoder(src)` returning `(hidden, cell)`.
        decoder: Module called as `decoder(tokens, hidden, cell)` returning
            `(prediction, hidden, cell)`; it must expose `output_dim`.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tar, teacher_forcing=0.5):
        """Run the encoder once, then decode one target position at a time.

        Args:
            src: Batch of source index sequences, batch-first.
            tar: Batch of target index sequences, batch-first; column 0 is
                fed to the decoder as its first input.
            teacher_forcing: Probability, per decoding step, of feeding the
                ground-truth token instead of the decoder's own argmax.

        Returns:
            Tensor of shape (batch, target_length, decoder.output_dim).
            Position 0 along the time axis is never written and stays zero.
        """
        batch_size = src.shape[0]
        tar_len = tar.shape[1]
        # Allocate on the same device as the inputs rather than on a
        # notebook-level global, so the module works wherever it is placed.
        outputs = torch.zeros(
            batch_size, tar_len, self.decoder.output_dim, device=src.device
        )

        hidden, cell = self.encoder(src)

        decoder_input = tar[:, 0]
        for step in range(1, tar_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, step] = output
            # One coin flip per step decides between ground truth and prediction.
            use_teacher = random.random() < teacher_forcing
            decoder_input = tar[:, step] if use_teacher else output.argmax(1)
        return outputs
 

### Model Parameters

In [92]:
# Vocabulary sizes: source side feeds the encoder, target side sizes the decoder.
input_dim, output_dim = len(eng_vocab), len(fre_vocab)

# Network capacity: embedding width, LSTM hidden width, number of stacked LSTM layers.
embedded_dim, hidden_dim, no_layers = 256, 512, 2

In [93]:
# Instantiate both halves with explicit keyword arguments, then wrap them.
enc = Encoder(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    embedding_dim=embedded_dim,
    no_layers=no_layers,
).to(device)
dec = Decoder(
    output_dim=output_dim,
    hidden_dim=hidden_dim,
    embedding_dim=embedded_dim,
    num_layers=no_layers,
).to(device)
model = Seq2seq(encoder=enc, decoder=dec).to(device)

## Training the Model

In [94]:
# The target tensors are right-padded with index 0 by `padding()`, so 0 is the
# index the loss has to skip. Looking it up as `fre_vocab["pad"]` only yields 0
# by accident: "pad" is not a vocabulary entry (the special token is "<pad>"),
# so the lookup falls through to the vocabulary's default index.
# NOTE(review): index 0 is "<unk>" in these vocabularies and "<pad>" is index 1.
# To pad with the real "<pad>" token, change the fill value used when padding
# AND this constant together; changing only one stops padded positions from
# being masked out of the loss.
TARGET_PAD_INDEX = 0
criterion = nn.CrossEntropyLoss(ignore_index=TARGET_PAD_INDEX)
optimizer = optim.Adam(model.parameters())

In [95]:
def training(model, src_data, tar_data, optimizer, creiterion, epochs=10):
    """Run full-batch training and print the loss after every epoch.

    Args:
        model: Callable as `model(src_data, tar_data)`, returning scores with
            the class dimension last.
        src_data: Source batch passed straight to the model.
        tar_data: Target index batch, batch-first.
        optimizer: Optimizer over the model's parameters.
        creiterion: Loss callable taking `(flattened_scores, flattened_targets)`.
        epochs: Number of optimisation steps (one per epoch, whole batch each).

    Returns:
        None. Progress is reported with `print`.
    """
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        scores = model(src_data, tar_data)

        # Drop time step 0 (the start position) from both scores and targets,
        # then flatten batch and time so the loss sees one row per token.
        num_classes = scores.shape[-1]
        flat_scores = scores[:, 1:, :].reshape(-1, num_classes)
        flat_targets = tar_data[:, 1:].reshape(-1)

        loss = creiterion(flat_scores, flat_targets)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

In [96]:
# Train on the whole toy corpus for ten epochs.
training(model, eng_data, fre_data, optimizer, criterion, epochs=10)

Epoch 1/10, Loss: 2.8911
Epoch 2/10, Loss: 2.7885
Epoch 3/10, Loss: 2.6030
Epoch 4/10, Loss: 2.2839
Epoch 5/10, Loss: 1.9171
Epoch 6/10, Loss: 1.5600
Epoch 7/10, Loss: 1.2590
Epoch 8/10, Loss: 1.0053
Epoch 9/10, Loss: 0.7470
Epoch 10/10, Loss: 0.6053


## Making Inference

In [103]:
def translate_sentence(sentence, model, en_vocab, fr_vocab, en_tokenizer, device, max_len=50):
    """Greedily translate one source sentence with a trained model.

    Args:
        sentence: Raw source sentence.
        model: Object exposing `eval()`, `encoder(src)` and
            `decoder(tokens, hidden, cell)`.
        en_vocab: Source vocabulary (`[]` lookup plus `get_stoi()`).
        fr_vocab: Target vocabulary (`[]` lookup plus `get_itos()`).
        en_tokenizer: Callable splitting the sentence into tokens.
        device: Device on which the input tensors are created.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted target tokens joined by single spaces. The "<start>"
        and "<end>" markers are never part of the result. If decoding stops
        because `max_len` was reached, every token generated so far is kept.
    """
    model.eval()

    # Build the lookup table once; rebuilding it for every token is wasteful.
    stoi = en_vocab.get_stoi()
    unk_idx = en_vocab["<unk>"]
    src_indices = (
        [en_vocab["<start>"]]
        + [stoi.get(token, unk_idx) for token in en_tokenizer(sentence)]
        + [en_vocab["<end>"]]
    )
    # Add a leading batch dimension of size 1.
    src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(0)

    end_idx = fr_vocab["<end>"]
    previous_token = fr_vocab["<start>"]
    predicted = []
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)
        for _ in range(max_len):
            step_input = torch.tensor([previous_token], dtype=torch.long, device=device)
            output, hidden, cell = model.decoder(step_input, hidden, cell)
            previous_token = output.argmax(1).item()
            # Stop before storing "<end>", so no slicing is needed afterwards and
            # a sequence cut off by max_len does not lose its last real token.
            if previous_token == end_idx:
                break
            predicted.append(previous_token)

    itos = fr_vocab.get_itos()
    return " ".join(itos[index] for index in predicted)


In [104]:
# Smoke test: translate a single sentence and show it next to its input.
test_sentence = "I love coding."
translation = translate_sentence(
    sentence=test_sentence,
    model=model,
    en_vocab=eng_vocab,
    fr_vocab=fre_vocab,
    en_tokenizer=en_tokenizer,
    device=device,
)
print(f"Input: {test_sentence}")
print(f"Translation: {translation}")

Input: I love coding.
Translation: J' coder .


## Evaluation of the Model

In [106]:
# Compare the model's output with the reference for the first two training pairs.
for en_sent, fr_sent in data[:2]:
    translation = translate_sentence(
        en_sent, model, eng_vocab, fre_vocab, en_tokenizer, device
    )
    # One print call; sep="\n" puts each field on its own line.
    print(
        f"Input: {en_sent}",
        f"Reference: {fr_sent}",
        f"Predicted: {translation}\n",
        sep="\n",
    )

Input: I love coding.
Reference: J'aime coder.
Predicted: J' coder .

Input: The cat is on the mat.
Reference: Le chat est sur le tapis.
Predicted: Le chat est sur le tapis



In [107]:
from torchtext.data.metrics import bleu_score
import torch
from tqdm import tqdm

# Assumes translate_sentence is already defined as we fixed earlier
# Assumes test_data is a list of (src_sentence, trg_sentence) tuples

def evaluate_bleu(model, data, en_vocab, fr_vocab, en_tokenizer, fr_tokenizer, device):
    """Translate every source sentence in `data` and score the result with BLEU.

    Args:
        model: Trained Seq2Seq model.
        data: Iterable of (source_sentence, target_sentence) raw-string pairs.
        en_vocab: Source (English) vocabulary.
        fr_vocab: Target (French) vocabulary.
        en_tokenizer: Tokenizer callable for English.
        fr_tokenizer: Tokenizer callable for French.
        device: Device passed through to `translate_sentence`.

    Returns:
        The value returned by `bleu_score` for the whole corpus. The same
        value, multiplied by 100, is also printed.
    """
    model.eval()
    candidate_corpus = []
    reference_corpus = []

    for src_sentence, trg_sentence in tqdm(data, desc="Evaluating"):
        predicted_text = translate_sentence(
            src_sentence, model, en_vocab, fr_vocab, en_tokenizer, device
        )

        # The prediction is a space-joined string, so whitespace splitting
        # recovers its tokens; the reference goes through the French tokenizer.
        candidate_corpus.append(predicted_text.strip().split())
        # bleu_score expects a list of references per candidate; here there is one.
        reference_corpus.append([fr_tokenizer(trg_sentence.strip())])

    bleu = bleu_score(candidate_corpus, reference_corpus)
    print(f"BLEU score = {bleu * 100:.2f}")
    return bleu


In [108]:
# Held-out example pairs (English source, French reference) for the BLEU check.
test_data = [
    ("I love cats.", "J'aime les chats."),
    ("She is reading a book.", "Elle lit un livre."),
    ("We are going to the park.", "Nous allons au parc."),
]

bleu = evaluate_bleu(
    model=model,
    data=test_data,
    en_vocab=eng_vocab,
    fr_vocab=fre_vocab,
    en_tokenizer=en_tokenizer,
    fr_tokenizer=fr_tokenizer,
    device=device,
)


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 23.99it/s]


BLEU score = 0.00
