# Twi-to-English Neural Machine Translation with Attention

## Overview
This tutorial demonstrates how to build a neural machine translation model that translates from Twi (a Ghanaian language) to English using PyTorch. The model uses a **Sequence-to-Sequence (Seq2Seq) architecture** with an **attention mechanism**.

### Key Components:
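1. **Encoder** – embeds the Twi tokens and runs an LSTM over them, producing one output vector per source word.
2. **Attention Mechanism** – scores every encoder output against the current decoder state so the decoder can focus on the most relevant source words.
3. **Decoder** – an LSTM that generates the English sentence one word at a time from the previous word and the attention context.
4. **Teacher Forcing** – a training technique: at each decoding step the ground-truth previous word is fed to the decoder with some probability (0.5 by default here) instead of the model's own prediction.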

## Part 1: Data Preparation and Tokenization

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random

# Parallel toy corpus: each tuple is (Twi sentence, English translation).
# Both public lists are derived from it, so position i in one list always
# lines up with position i in the other.
_SENTENCE_PAIRS = [
    ("Me din de John", "My name is John"),
    ("Ɛte sɛn?", "How are you?"),
    ("Meda wo ase", "Thank you"),
    ("Me pɛ sɛ me kɔ fie", "I want to go home"),
    ("Ɛhɔ yɛ fɛ", "It is beautiful there"),
    ("Me dɔ wo", "I love you"),
    ("Aduane no yɛ dɛ", "The food is delicious"),
    ("Nnipa no reba", "The people are coming"),
    ("Mepɛ sɛ mesua Twi", "I want to learn Twi"),
    ("Ɛnnɛ yɛ Memeneda", "Today is Saturday"),
]

twi_sentences = [twi for twi, _ in _SENTENCE_PAIRS]
english_sentences = [eng for _, eng in _SENTENCE_PAIRS]

In [None]:
def tokenize(sentences):
    """Build word<->index lookup tables from whitespace-split sentences.

    Each sentence is lower-cased and split on whitespace, so punctuation
    stays attached to its word. The special tokens ``<pad>``, ``<sos>``,
    ``<eos>`` and ``<unk>`` are always included. Indices follow the sorted
    order of the tokens, which makes the mapping deterministic.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A ``(word2idx, idx2word)`` tuple of dicts that are inverses of
        each other.
    """
    special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
    words = {
        token
        for text in sentences
        for token in text.lower().split()
    }
    ordered = sorted(words.union(special_tokens))
    word2idx = dict(zip(ordered, range(len(ordered))))
    idx2word = dict(enumerate(ordered))
    return word2idx, idx2word

# Build one vocabulary per language; source and target indices are unrelated.
twi_word2idx, twi_idx2word = tokenize(twi_sentences)
eng_word2idx, eng_idx2word = tokenize(english_sentences)

# Vocabulary sizes, special tokens included.
twi_vocab_size, eng_vocab_size = len(twi_word2idx), len(eng_word2idx)

print(f"Twi vocabulary size: {twi_vocab_size}")
print(f"English vocabulary size: {eng_vocab_size}")

## Part 2: Dataset Class and Data Loading

In [None]:
def sentence_to_indices(sentence, word2idx):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is split on whitespace and each word is lower-cased before
    lookup. Words missing from ``word2idx`` map to the ``<unk>`` index.

    Args:
        sentence: The sentence string to encode.
        word2idx: Mapping from token to integer index; must contain
            ``'<unk>'`` whenever the sentence has at least one word.

    Returns:
        A list with one index per whitespace-separated word.
    """
    indices = []
    for raw_word in sentence.split():
        # Resolve the fallback per word, so an empty sentence never touches
        # the '<unk>' entry.
        fallback = word2idx['<unk>']
        indices.append(word2idx.get(raw_word.lower(), fallback))
    return indices

class TranslationDataset(Dataset):
    """Parallel Twi/English corpus stored as lists of vocabulary indices.

    Source (Twi) sentences are encoded as-is. Target (English) sentences are
    wrapped in ``<sos>`` ... ``<eos>`` markers. Sentences are paired
    positionally with ``zip``, so surplus sentences on either side are
    ignored.

    Args:
        twi_sentences: Source sentences.
        eng_sentences: Target sentences, aligned with ``twi_sentences``.
        twi_word2idx: Source token -> index mapping.
        eng_word2idx: Target token -> index mapping; must contain ``<sos>``
            and ``<eos>``.
    """

    def __init__(self, twi_sentences, eng_sentences, twi_word2idx, eng_word2idx):
        self.twi_data = []
        self.eng_data = []
        for source_text, target_text in zip(twi_sentences, eng_sentences):
            source_ids = sentence_to_indices(source_text, twi_word2idx)
            # Wrap the target in explicit start/end markers.
            target_ids = [
                eng_word2idx['<sos>'],
                *sentence_to_indices(target_text, eng_word2idx),
                eng_word2idx['<eos>'],
            ]
            self.twi_data.append(source_ids)
            self.eng_data.append(target_ids)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.twi_data)

    def __getitem__(self, idx):
        """Return the ``(twi_indices, eng_indices)`` pair at position ``idx``."""
        source_ids = self.twi_data[idx]
        target_ids = self.eng_data[idx]
        return source_ids, target_ids

In [None]:
def collate_fn(batch):
    """Pad a batch of (Twi, English) index lists into rectangular tensors.

    Each side is right-padded to the longest sequence in the batch using the
    ``<pad>`` index of the module-level ``twi_word2idx`` / ``eng_word2idx``
    vocabularies.

    Args:
        batch: Sequence of ``(twi_indices, eng_indices)`` pairs.

    Returns:
        A 4-tuple of ``torch.LongTensor``: padded Twi batch, padded English
        batch, original Twi lengths, original English lengths.
    """
    def _pad_right(sequences, width, fill):
        # Extend every sequence with `fill` until it is `width` long.
        return [seq + [fill] * (width - len(seq)) for seq in sequences]

    twi_batch, eng_batch = [], []
    for twi_ids, eng_ids in batch:
        twi_batch.append(twi_ids)
        eng_batch.append(eng_ids)

    twi_lengths = [len(ids) for ids in twi_batch]
    eng_lengths = [len(ids) for ids in eng_batch]
    longest_twi = max(twi_lengths)
    longest_eng = max(eng_lengths)

    twi_padded = _pad_right(twi_batch, longest_twi, twi_word2idx['<pad>'])
    eng_padded = _pad_right(eng_batch, longest_eng, eng_word2idx['<pad>'])

    return (
        torch.LongTensor(twi_padded),
        torch.LongTensor(eng_padded),
        torch.LongTensor(twi_lengths),
        torch.LongTensor(eng_lengths),
    )

# Wrap the toy corpus in a Dataset and serve it one shuffled sentence pair
# at a time; collate_fn turns each batch into padded tensors.
dataset = TranslationDataset(
    twi_sentences=twi_sentences,
    eng_sentences=english_sentences,
    twi_word2idx=twi_word2idx,
    eng_word2idx=eng_word2idx,
)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

## Part 3: Model Architecture

In [None]:
class Encoder(nn.Module):
    """Embed source tokens and encode them with a single-layer LSTM.

    Args:
        input_size: Number of entries in the source vocabulary.
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden state.
    """

    def __init__(self, input_size, embedding_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            batch_first=True,
        )

    def forward(self, src):
        """Encode a batch of source index sequences.

        Args:
            src: Integer tensor of token indices; the LSTM is batch-first,
                so a batched input is expected as (batch, seq_len).

        Returns:
            ``(outputs, hidden, cell)``: the per-step LSTM outputs and the
            final hidden and cell states.
        """
        token_vectors = self.embedding(src)
        outputs, final_state = self.lstm(token_vectors)
        hidden, cell = final_state
        return outputs, hidden, cell

In [None]:
class Attention(nn.Module):
    """Additive attention over encoder outputs.

    Each encoder output is concatenated with the decoder state, passed
    through a linear layer with ``tanh``, and projected to a scalar score.
    The scores are normalised with a softmax over the source positions.

    Args:
        hidden_size: Dimension of both the decoder state and the encoder
            outputs.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(in_features=hidden_size * 2, out_features=hidden_size)
        self.v = nn.Linear(in_features=hidden_size, out_features=1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, seq_len).

        Args:
            hidden: Decoder state; the concatenation below only lines up
                when its leading dimension is 1, i.e. (1, batch, hidden).
            encoder_outputs: Encoder outputs, (batch, seq_len, hidden).

        Returns:
            Tensor of weights that sum to 1 along dimension 1.
        """
        num_steps = encoder_outputs.size(1)
        # Copy the state once per source position, then move batch first.
        tiled_state = hidden.repeat(num_steps, 1, 1).transpose(0, 1)
        paired = torch.cat((tiled_state, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, dim=1)

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        output_size: Number of entries in the target vocabulary.
        embedding_size: Dimension of each target-token embedding.
        hidden_size: Dimension of the LSTM hidden state (and of the
            encoder outputs that form the context vector).
        attention: Callable ``(hidden, encoder_outputs) -> weights`` that
            returns one weight per source position.
    """

    def __init__(self, output_size, embedding_size, hidden_size, attention):
        super().__init__()
        self.output_size = output_size
        self.attention = attention
        self.embedding = nn.Embedding(
            num_embeddings=output_size,
            embedding_dim=embedding_size,
        )
        # The LSTM sees the token embedding concatenated with the context.
        self.lstm = nn.LSTM(
            input_size=hidden_size + embedding_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # The projection sees the LSTM output concatenated with the context.
        self.fc = nn.Linear(in_features=hidden_size * 2, out_features=output_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            x: Previous target token indices, one per batch element.
            hidden: Current LSTM hidden state.
            cell: Current LSTM cell state.
            encoder_outputs: Encoder outputs, (batch, seq_len, hidden).

        Returns:
            ``(prediction, hidden, cell)``: vocabulary scores for the next
            token and the updated LSTM state.
        """
        step_tokens = x.unsqueeze(1)
        embedded = self.embedding(step_tokens)

        # Weighted sum of the encoder outputs -> (batch, 1, hidden).
        weights = self.attention(hidden, encoder_outputs)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs)

        lstm_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))

        features = torch.cat((output.squeeze(1), context.squeeze(1)), dim=1)
        return self.fc(features), hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module called as ``encoder(src)`` returning
            ``(encoder_outputs, hidden, cell)``.
        decoder: Module called as ``decoder(token, hidden, cell,
            encoder_outputs)`` returning ``(scores, hidden, cell)`` and
            exposing ``output_size``.
        device: Device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps and collect the vocabulary scores.

        Args:
            src: Source token indices passed straight to the encoder.
            trg: Target token indices, (batch, trg_len); column 0 is the
                first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step, of
                feeding the ground-truth token instead of the model's own
                top prediction as the next input.

        Returns:
            Tensor of shape (batch, trg_len, vocab) with the decoder scores;
            position 0 is never written and stays all zeros.
        """
        batch_size, trg_len = trg.shape
        vocab_size = self.decoder.output_size
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        encoder_outputs, hidden, cell = self.encoder(src)
        decoder_input = trg[:, 0]

        for step in range(1, trg_len):
            scores, hidden, cell = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            outputs[:, step, :] = scores
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[:, step]
            else:
                decoder_input = scores.argmax(1)
        return outputs

## Part 4: Training Setup and Loop

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Hyperparameters for the toy model.
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 64
LEARNING_RATE = 0.001
NUM_EPOCHS = 100

# Assemble the model. The attention module is handed to the decoder, which
# keeps it as a sub-module.
encoder = Encoder(
    input_size=twi_vocab_size,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
).to(device)
attention = Attention(hidden_size=HIDDEN_SIZE).to(device)
decoder = Decoder(
    output_size=eng_vocab_size,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    attention=attention,
).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device)

# Padding positions in the target contribute nothing to the loss.
pad_index = eng_word2idx['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

print(f"Model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

In [None]:
# Train for NUM_EPOCHS passes over the data, reporting the mean batch loss
# every 10th epoch.
for epoch in range(NUM_EPOCHS):
    model.train()
    batch_losses = []
    for src, trg, _src_len, _trg_len in dataloader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        scores = model(src, trg)
        vocab_dim = scores.shape[-1]

        # Skip position 0: that target slot holds the token handed to the
        # decoder as its first input, not something it has to predict.
        loss = criterion(
            scores[:, 1:].reshape(-1, vocab_dim),
            trg[:, 1:].reshape(-1),
        )
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    epoch_loss = sum(batch_losses)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss / len(dataloader):.4f}")

## Part 5: Saving Model and Translation

In [None]:
# Bundle the trained weights with both vocabularies so the checkpoint file
# carries everything needed to map words to indices and back.
checkpoint = {
    'encoder': encoder.state_dict(),
    'decoder': decoder.state_dict(),
    'twi_word2idx': twi_word2idx,
    'twi_idx2word': twi_idx2word,
    'eng_word2idx': eng_word2idx,
    'eng_idx2word': eng_idx2word,
}
torch.save(checkpoint, 'twi_to_english_model.pt')

In [None]:
def translate_sentence(model, sentence, twi_word2idx, twi_idx2word, eng_word2idx, eng_idx2word, device, max_length=50):
    """Greedily translate one Twi sentence into English.

    The sentence is split on whitespace and lower-cased word by word;
    unknown words map to ``<unk>``. Decoding starts from ``<sos>`` and picks
    the highest-scoring token at every step until ``<eos>`` is produced or
    ``max_length`` tokens have been generated. The whole forward pass runs
    under ``torch.no_grad()``, so neither the encoder nor the decoder
    builds an autograd graph.

    Args:
        model: Object exposing ``eval()``, ``encoder`` and ``decoder``.
        sentence: Twi sentence to translate.
        twi_word2idx: Source token -> index mapping (needs ``<unk>``).
        twi_idx2word: Unused; kept so existing call sites keep working.
        eng_word2idx: Target token -> index mapping (needs ``<sos>``,
            ``<eos>`` and ``<pad>``).
        eng_idx2word: Target index -> token mapping.
        device: Device on which the input tensors are created.
        max_length: Maximum number of tokens to generate.

    Returns:
        The translation as a space-joined string with ``<sos>``, ``<eos>``
        and ``<pad>`` removed. An empty or whitespace-only sentence yields
        ``''``.

    Note:
        The model is left in evaluation mode.
    """
    model.eval()

    tokens = sentence.split()
    if not tokens:
        # Nothing to translate; avoid feeding a zero-length sequence to the
        # encoder.
        return ''

    unk_idx = twi_word2idx['<unk>']
    indices = [twi_word2idx.get(token.lower(), unk_idx) for token in tokens]

    sos_idx = eng_word2idx['<sos>']
    eos_idx = eng_word2idx['<eos>']
    # Built once, instead of once per generated token in the final filter.
    special_indices = {sos_idx, eos_idx, eng_word2idx['<pad>']}

    trg_idx = [sos_idx]
    with torch.no_grad():
        src_tensor = torch.LongTensor(indices).unsqueeze(0).to(device)
        encoder_outputs, hidden, cell = model.encoder(src_tensor)
        for _ in range(max_length):
            # Feed back only the most recent token; the LSTM state carries
            # the history.
            trg_tensor = torch.LongTensor([trg_idx[-1]]).to(device)
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell, encoder_outputs)
            pred_token = output.argmax(1).item()
            trg_idx.append(pred_token)
            if pred_token == eos_idx:
                break

    return ' '.join(eng_idx2word[idx] for idx in trg_idx if idx not in special_indices)

## Part 6: Testing the Model

In [None]:
print("Testing the model:")
print("=" * 60)

# Translate every training sentence and print it next to its reference.
# These are the sentences the model was trained on, so this checks
# memorisation rather than generalisation.
for twi_sent, eng_sent in zip(twi_sentences, english_sentences):
    translation = translate_sentence(model, twi_sent, twi_word2idx, twi_idx2word, eng_word2idx, eng_idx2word, device)
    print(f"Twi: {twi_sent}")
    print(f"Expected: {eng_sent}")
    print(f"Translated: {translation}")
    print("-" * 50)