<a href="https://colab.research.google.com/github/IwohubMedia/Machine-Translation-AI-project/blob/main/Machine_Translation_with_Seq2Seq_%2B_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================
# Machine Translation (Seq2Seq with Attention) - Full Project
# ==========================

!pip install torchtext torch spacy -q
!python -m spacy download en_core_web_sm -q
!python -m spacy download de_core_news_sm -q

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.metrics import bleu_score

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --------------------------
# Dataset & Preprocessing
# --------------------------
# Multi30k splits for the ('en', 'de') language pair. The rest of this script
# unpacks every element as a (src, tgt) pair.
train_iter, valid_iter, test_iter = Multi30k(language_pair=('en', 'de'))

# spaCy-backed tokenizers: the English model is used for the source side and
# the German model for the target side throughout this script.
tokenizer_en = get_tokenizer("spacy", language="en_core_web_sm")
tokenizer_de = get_tokenizer("spacy", language="de_core_news_sm")

def yield_tokens(data_iter, tokenizer, index):
    """Yield one token list per sentence pair, for vocabulary building.

    Args:
        data_iter: Iterable of ``(src, tgt)`` sentence pairs (strings).
        tokenizer: Callable mapping a string to a list of tokens.
        index: ``0`` selects the source sentence of each pair; any other
            value selects the target sentence.

    Yields:
        The token list of the selected, lowercased sentence.
    """
    for src, tgt in data_iter:
        text = src if index == 0 else tgt
        # process_sentence and translate_sentence lowercase text before the
        # vocabulary lookup, so the vocabulary must be built from lowercased
        # text too; otherwise words seen only capitalised map to <unk>.
        yield tokenizer(text.lower())

# Build one vocabulary per side from the training split: English tokens for
# the source, German tokens for the target. Both register the same four
# special tokens (<unk>, <pad>, <bos>, <eos>).
vocab_src = build_vocab_from_iterator(yield_tokens(train_iter, tokenizer_en, 0), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab_tgt = build_vocab_from_iterator(yield_tokens(train_iter, tokenizer_de, 1), specials=["<unk>", "<pad>", "<bos>", "<eos>"])

# Out-of-vocabulary lookups resolve to the <unk> index.
vocab_src.set_default_index(vocab_src["<unk>"])
vocab_tgt.set_default_index(vocab_tgt["<unk>"])

def process_sentence(sentence, vocab, tokenizer):
    """Turn a raw sentence into a 1-D ``torch.long`` tensor of vocabulary ids.

    The sentence is lowercased and tokenized, then framed by the ``<bos>`` and
    ``<eos>`` markers; every token is looked up in ``vocab``.
    """
    words = tokenizer(sentence.lower())
    ids = [vocab["<bos>"]]
    ids.extend(vocab[word] for word in words)
    ids.append(vocab["<eos>"])
    return torch.tensor(ids, dtype=torch.long)

def collate_fn(batch):
    """Collate raw ``(src, tgt)`` sentence pairs into two padded id tensors.

    Each sentence is numericalized with ``process_sentence`` (source side with
    the English tokenizer/vocabulary, target side with the German ones), and
    each side is padded with its own ``<pad>`` index.

    Returns:
        ``(src_batch, tgt_batch)`` as produced by ``pad_sequence`` with its
        default layout.
    """
    encoded = [
        (
            process_sentence(src, vocab_src, tokenizer_en),
            process_sentence(tgt, vocab_tgt, tokenizer_de),
        )
        for src, tgt in batch
    ]
    src_tensors = [pair[0] for pair in encoded]
    tgt_tensors = [pair[1] for pair in encoded]

    pad = nn.utils.rnn.pad_sequence
    padded_src = pad(src_tensors, padding_value=vocab_src["<pad>"])
    padded_tgt = pad(tgt_tensors, padding_value=vocab_tgt["<pad>"])
    return padded_src, padded_tgt

from torch.utils.data import DataLoader
BATCH_SIZE = 32
# Each split is materialized with list(...) before being wrapped in a
# DataLoader. Only the training loader shuffles; all three use collate_fn to
# turn raw sentence pairs into padded tensors.
train_loader = DataLoader(list(train_iter), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(list(valid_iter), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(list(test_iter),  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# --------------------------
# Seq2Seq Model with Attention
# --------------------------
class Attention(nn.Module):
    """Additive attention over encoder outputs, queried by the decoder state.

    The last entry of ``hidden`` is concatenated with every encoder output,
    passed through a ``tanh``-activated linear layer, and projected to one
    score per source position; scores are normalised with a softmax.
    """

    def __init__(self, hid_dim):
        super().__init__()
        # Scores the concatenation [decoder state ; encoder output].
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        # Collapses each energy vector to a single scalar score.
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights, one row per batch element.

        ``encoder_outputs`` is indexed as (source position, batch, features);
        the result has shape (batch, source positions) and each row sums to 1.
        """
        num_positions = encoder_outputs.size(0)
        enc_batch_first = encoder_outputs.transpose(0, 1)
        # Broadcast the last layer's state across all source positions.
        query = hidden[-1].unsqueeze(1).expand(-1, num_positions, -1)
        features = torch.cat([query, enc_batch_first], dim=2)
        energy = torch.tanh(self.attn(features))
        scores = self.v(energy).squeeze(2)
        return scores.softmax(dim=1)

class Encoder(nn.Module):
    """Embedding + multi-layer LSTM encoder for the source sentence."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` token ids.

        Returns:
            ``(outputs, hidden, cell)``: the LSTM's per-step outputs and its
            final hidden and cell states.
        """
        token_vectors = self.embedding(src)
        # Dropout is applied to the embeddings before they enter the LSTM.
        rnn_in = self.dropout(token_vectors)
        outputs, final_state = self.rnn(rnn_in)
        hidden, cell = final_state
        return outputs, hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Each call consumes one target token per batch element, builds an
    attention-weighted context from ``encoder_outputs``, advances the LSTM by
    one step, and predicts scores over the target vocabulary.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The LSTM input is the token embedding concatenated with the context.
        self.rnn = nn.LSTM(
            input_size=hid_dim + emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        # The output layer sees LSTM output, context and embedding together.
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Token ids for the current step, one per batch element
                (a leading step axis of size 1 is added internally).
            hidden: LSTM hidden state from the previous step.
            cell: LSTM cell state from the previous step.
            encoder_outputs: Encoder outputs, indexed as
                (source position, batch, features).

        Returns:
            ``(prediction, hidden, cell)``: vocabulary scores for this step
            and the updated LSTM state.
        """
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        # Attention weights -> (batch, 1, src_len) so bmm yields one context
        # vector per batch element.
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        enc_batch_first = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(weights, enc_batch_first).permute(1, 0, 2)

        rnn_in = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(rnn_in, (hidden, cell))

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        return self.fc_out(features), hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps and collect the per-step scores.

        Args:
            src: Source batch passed to the encoder.
            trg: Target batch indexed as (step, batch); row 0 seeds the decoder.
            teacher_forcing_ratio: Probability, drawn independently at every
                step, of feeding the ground-truth token instead of the
                decoder's own argmax as the next input.

        Returns:
            Tensor of shape (trg_len, batch, target vocab size). Row 0 is
            never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        step_input = trg[0, :]
        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell, encoder_outputs)
            outputs[t] = step_scores
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            step_input = trg[t] if use_teacher else step_scores.argmax(1)
        return outputs

# --------------------------
# Training & Evaluation
# --------------------------
# Vocabulary sizes determine the embedding tables and the output layer width.
INPUT_DIM = len(vocab_src)
OUTPUT_DIM = len(vocab_tgt)
# Embedding sizes, LSTM hidden size, LSTM depth and dropout rates.
ENC_EMB_DIM, DEC_EMB_DIM, HID_DIM = 256, 256, 512
N_LAYERS, ENC_DROPOUT, DEC_DROPOUT = 2, 0.5, 0.5

# Assemble the model; the decoder owns the attention module.
attn = Attention(HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

# Adam with its default settings; padded target positions are excluded from
# the loss via ignore_index.
optimizer = optim.Adam(model.parameters())
PAD_IDX = vocab_tgt["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

def train_epoch(model, iterator, optimizer, criterion, clip=1):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Seq2seq model called as ``model(src, tgt)``.
        iterator: Iterable of ``(src, tgt)`` batches; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to flattened scores and flattened targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for src, tgt in iterator:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        optimizer.zero_grad()
        scores = model(src, tgt)

        # Step 0 is skipped on both sides before flattening for the loss.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = tgt[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    Runs in eval mode with gradients disabled and with teacher forcing turned
    off (ratio ``0``), so the decoder is always fed its own predictions.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in iterator:
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            scores = model(src, tgt, 0)  # no teacher forcing

            # Step 0 is skipped on both sides before flattening for the loss.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = tgt[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()
    return running_loss / len(iterator)

N_EPOCHS = 5
# One training pass followed by one validation pass per epoch; both mean
# losses are printed so they can be compared epoch by epoch.
for epoch in range(N_EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    valid_loss = evaluate(model, valid_loader, criterion)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss:.3f}")

# --------------------------
# Translation & BLEU Score
# --------------------------
def translate_sentence(sentence, model, vocab_src, vocab_tgt, tokenizer, max_len=50):
    """Greedily translate one raw source sentence into target tokens.

    Args:
        sentence: Raw source sentence; it is lowercased before tokenization.
        model: Trained model exposing ``encoder`` and ``decoder``.
        vocab_src: Source vocabulary (token -> index).
        vocab_tgt: Target vocabulary (token -> index, with ``lookup_token``).
        tokenizer: Source-language tokenizer.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted target tokens, without ``<bos>`` and without ``<eos>``.
        If no ``<eos>`` is produced within ``max_len`` steps, all ``max_len``
        predicted tokens are returned.
    """
    model.eval()
    tokens = ["<bos>"] + tokenizer(sentence.lower()) + ["<eos>"]
    src_indexes = [vocab_src[token] for token in tokens]
    # Shape (src_len, 1): a batch containing just this sentence.
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(DEVICE)

    eos_idx = vocab_tgt["<eos>"]
    trg_indexes = [vocab_tgt["<bos>"]]
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor)
        for _ in range(max_len):
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(DEVICE)
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell, encoder_outputs)
            pred_token = output.argmax(1).item()
            # Stop before recording <eos>. Unconditionally slicing off the
            # last token would drop a real word whenever decoding ends by
            # hitting max_len instead of <eos>.
            if pred_token == eos_idx:
                break
            trg_indexes.append(pred_token)

    # Skip the leading <bos>.
    return [vocab_tgt.lookup_token(i) for i in trg_indexes[1:]]

# Smoke test: translate one English sentence with the trained model.
print("Translation Example:", translate_sentence("I love machine translation.", model, vocab_src, vocab_tgt, tokenizer_en))

def calculate_bleu(data, model, vocab_src, vocab_tgt, tokenizer, max_len=50, tgt_tokenizer=None):
    """Compute corpus BLEU of the model's greedy translations over ``data``.

    Args:
        data: Iterable of ``(src, tgt)`` pairs. ``src`` is a raw source
            sentence. ``tgt`` is either a raw target sentence (string) or a
            tensor of target-vocabulary indices.
        model: Trained model passed to ``translate_sentence``.
        vocab_src: Source vocabulary.
        vocab_tgt: Target vocabulary.
        tokenizer: Source-language tokenizer.
        max_len: Maximum decoding length per sentence.
        tgt_tokenizer: Tokenizer for raw-string references. Defaults to the
            module-level ``tokenizer_de``.

    Returns:
        The value returned by ``bleu_score`` for the collected predictions
        and single-reference targets.
    """
    special_ids = {vocab_tgt[tok] for tok in ("<pad>", "<bos>", "<eos>")}
    trgs, pred_trgs = [], []
    for src, tgt in data:
        pred_trgs.append(translate_sentence(src, model, vocab_src, vocab_tgt, tokenizer, max_len))
        if isinstance(tgt, str):
            # The dataset yields raw strings, which have no .tolist().
            # Tokenize the reference and lowercase it, because the model is
            # trained on lowercased text and emits lowercased tokens.
            ref_tokenizer = tokenizer_de if tgt_tokenizer is None else tgt_tokenizer
            tgt_tokens = ref_tokenizer(tgt.lower())
        else:
            # Index tensors are mapped back to tokens, dropping special symbols.
            tgt_tokens = [vocab_tgt.lookup_token(idx) for idx in tgt.tolist() if idx not in special_ids]
        # bleu_score takes a list of references per candidate; there is one here.
        trgs.append([tgt_tokens])
    return bleu_score(pred_trgs, trgs)

# Score the test split (materialized as a list of pairs) and print BLEU
# scaled by 100.
bleu = calculate_bleu(list(test_iter), model, vocab_src, vocab_tgt, tokenizer_en)
print(f"BLEU Score = {bleu*100:.2f}")

# --------------------------
# Save Model
# --------------------------
# Persist only the learned parameters (state dict), not the model object.
torch.save(model.state_dict(), "mt_model.pt")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package

OSError: /usr/local/lib/python3.12/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch6detail10class_baseC2ERKSsS3_SsRKSt9type_infoS6_