# Seq2Seq Translation with LSTM  
### English to French Machine Translation

### Authors:
* Hu·ª≥nh Anh Nh·ª±t
* Nguy·ªÖn Ti·∫øn Minh

**Bao g·ªìm:**
- Chu·∫©n b·ªã d·ªØ li·ªáu (raw ‚Üí processed)
- Encoder‚ÄìDecoder LSTM
- Training loop (teacher forcing)
- Inference (translate function)
- Evaluation (BLEU score)
- 5 v√≠ d·ª• d·ªãch + ph√¢n t√≠ch l·ªói

---

### Import th∆∞ vi·ªán 

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import sentence_bleu
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import random
from processed_data import (en_tokenizer, fr_tokenizer,build_vocab, load_parallel, ParallelDataset, make_collate_fn, encode_sentence_en, encode_sentence_fr, save_vocab, load_vocab, save_dataset_pytorch, load_dataset_pytorch)

# 1. Load dataset

### T·∫°o dataset m·ªõi t·ª´ raw

In [7]:
# train_pairs = load_parallel('./data/raw/train.en', './data/raw/train.fr')
# val_pairs = load_parallel('./data/raw/val.en', './data/raw/val.fr')
# test_pairs = load_parallel('./data/raw/test_2016_flickr.en', './data/raw/test_2016_flickr.fr')

# vocab_en = build_vocab(train_pairs, lang='en', max_tokens=10000, min_freq=2)
# vocab_fr = build_vocab(train_pairs, lang='fr', max_tokens=10000, min_freq=2)

# save_vocab(vocab_en, './data/processed/vocab_en.pkl')
# save_vocab(vocab_fr, './data/processed/vocab_fr.pkl')

# train_ds = ParallelDataset(train_pairs, vocab_en, vocab_fr)
# val_ds = ParallelDataset(val_pairs, vocab_en, vocab_fr)
# test_ds = ParallelDataset(test_pairs, vocab_en, vocab_fr)

# save_dataset_pytorch(train_ds, './data/processed/train_ds.pt')
# save_dataset_pytorch(val_ds, './data/processed/val_ds.pt')
# save_dataset_pytorch(test_ds, './data/processed/test_ds.pt')

### Load dataset ƒë√£ qua x·ª≠ l√Ω

In [4]:
vocab_en = load_vocab('./data/processed/vocab_en.pkl')
vocab_fr = load_vocab('./data/processed/vocab_fr.pkl')

train_ds = load_dataset_pytorch('./data/processed/train_dataset.pt')
valid_ds = load_dataset_pytorch('./data/processed/valid_dataset.pt')
test_ds = load_dataset_pytorch('./data/processed/test_dataset.pt')

collate_fn = make_collate_fn(vocab_en, vocab_fr)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)

# 2. LSTM Model

In [5]:
class LSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.W_i = nn.Linear(input_size, hidden_size, bias=True)
        self.U_i = nn.Linear(hidden_size, hidden_size, bias=True)
        self.W_f = nn.Linear(input_size, hidden_size, bias=True)
        self.U_f = nn.Linear(hidden_size, hidden_size, bias=True)
        self.W_o = nn.Linear(input_size, hidden_size, bias=True)
        self.U_o = nn.Linear(hidden_size, hidden_size, bias=True)
        self.W_c = nn.Linear(input_size, hidden_size, bias=True)
        self.U_c = nn.Linear(hidden_size, hidden_size,  bias=True)

    def forward(self, x, h_prev, c_prev):
        i_t = torch.sigmoid(self.W_i(x) + self.U_i(h_prev))
        f_t = torch.sigmoid(self.W_f(x) + self.U_f(h_prev))
        o_t = torch.sigmoid(self.W_o(x) + self.U_o(h_prev))
        c_tilde_t = torch.tanh(self.W_c(x) + self.U_c(h_prev))
        c_t = f_t * c_prev + i_t * c_tilde_t
        h_t = o_t * torch.tanh(c_t)
        return h_t, c_t

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cells = nn.ModuleList([LSTMCell(input_size if i == 0 else hidden_size, hidden_size) for i in range(num_layers)])

    def forward(self, x, h_0=None, c_0=None):
        batch_size, seq_len, _ = x.size()
        if h_0 is None:
            h_0 = [torch.zeros(batch_size, self.hidden_size, device=x.device) for _ in range(self.num_layers)]
        if c_0 is None:
            c_0 = [torch.zeros(batch_size, self.hidden_size, device=x.device) for _ in range(self.num_layers)]

        h_n = []
        c_n = []
        outputs = []

        for t in range(seq_len):
            x_t = x[:, t, :]
            for layer in range(self.num_layers):
                h_prev = h_0[layer]
                c_prev = c_0[layer]
                h_t, c_t = self.cells[layer](x_t, h_prev, c_prev)
                h_0[layer] = h_t
                c_0[layer] = c_t
                x_t = h_t
            outputs.append(h_t.unsqueeze(1))

        outputs = torch.cat(outputs, dim=1)
        h_n = h_0
        c_n = c_0

        return outputs, (h_n, c_n)

class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers=2):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embed_size)
        self.lstm = LSTM(embed_size, hidden_size, num_layers)
        self.hidden_size = hidden_size
    
    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (h_n, c_n) = self.lstm(embedded)
        return outputs, (h_n, c_n)

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers=2):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embed_size)
        self.lstm = LSTM(embed_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x, h_0, c_0):
        embedded = self.embedding(x)
        outputs, (h_n, c_n) = self.lstm(embedded, h_0, c_0)
        logits = self.fc(outputs)
        return logits, (h_n, c_n)
    
    def forward_step(self, y_prev, h, c):
        # y_prev: (batch,)
        y_prev = y_prev.unsqueeze(1)         # (batch, 1)
        embedded = self.embedding(y_prev)    # (batch, 1, embed)
        
        outputs, (h, c) = self.lstm(embedded, h, c)
        logits = self.fc(outputs[:, -1, :])  # l·∫•y token cu·ªëi

        return logits, h, c
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, sos_id, eos_id):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.sos_id = sos_id
        self.eos_id = eos_id
    
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.size()
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=src.device)

        # encoder produce h, c
        encoder_outputs, (h, c) = self.encoder(src)

        # start token
        y_prev = torch.full((batch_size,), self.sos_id, device=src.device)

        for t in range(tgt_len):
            probs, h, c = self.decoder.forward_step(y_prev, h, c)
            outputs[:, t, :] = probs

            # teacher forcing
            use_tf = torch.rand(1).item() < teacher_forcing_ratio
            y_prev = tgt[:, t] if use_tf else probs.argmax(dim=-1)

        return outputs



In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # S·ª≠ d·ª•ng GPU ƒë·ªÉ train m√¥ h√¨nh LSTM

encoder = Encoder(input_size=len(vocab_en), embed_size=256, hidden_size=512, num_layers=2)
decoder = Decoder(output_size=len(vocab_fr), embed_size=256, hidden_size=512, num_layers=2)
seq2seq_model = Seq2Seq(encoder, decoder, sos_id=vocab_fr['<sos>'], eos_id=vocab_fr['<eos>']).to(device)

# 3. Model Training

In [17]:
pad_idx = vocab_fr.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(seq2seq_model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, patience=1, factor=0.5
)


In [18]:
def train_model(model, train_loader, valid_loader, num_epochs=20, teacher_forcing_ratio=0.5):
    best_val_loss = float("inf")
    patience = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0

        for src, tgt, src_lens, tgt_lens in train_loader:
            
            src = src.to(device)
            tgt = tgt.to(device)

            optimizer.zero_grad()

            outputs = model(src, tgt, teacher_forcing_ratio)   # (B, T, V)
            outputs = outputs[:, :-1, :].reshape(-1, outputs.size(-1))
            tgt_gold = tgt[:, 1:].reshape(-1)

            loss = criterion(outputs, tgt_gold)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            train_loss += loss.item()

        # ---- VALIDATION ----
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for src, tgt, _, _ in valid_loader:
                
                src = src.to(device)
                tgt = tgt.to(device)

                outputs = model(src, tgt, 0)
                outputs = outputs[:, :-1, :].reshape(-1, outputs.size(-1))
                tgt_gold = tgt[:, 1:].reshape(-1)

                loss = criterion(outputs, tgt_gold)
                val_loss += loss.item()

        scheduler.step(val_loss)

        print(f"Epoch {epoch+1} | Train: {train_loss:.4f} | Val: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience = 0
            torch.save(model.state_dict(), "best_model.pt")
            print("  ‚Üí Saved best model")
        else:
            patience += 1
            if patience >= 3:
                print("Early stopping triggered!")
                break

In [19]:
train_model(seq2seq_model, train_loader, valid_loader, num_epochs=20, teacher_forcing_ratio=0.5)

Epoch 1 | Train: 2224.9224 | Val: 74.3829
  ‚Üí Saved best model
Epoch 2 | Train: 1874.9713 | Val: 66.5534
  ‚Üí Saved best model
Epoch 3 | Train: 1684.9115 | Val: 62.3529
  ‚Üí Saved best model
Epoch 4 | Train: 1541.7229 | Val: 59.6397
  ‚Üí Saved best model
Epoch 5 | Train: 1424.7440 | Val: 56.5635
  ‚Üí Saved best model
Epoch 6 | Train: 1317.3354 | Val: 54.7719
  ‚Üí Saved best model
Epoch 7 | Train: 1216.8709 | Val: 53.9466
  ‚Üí Saved best model
Epoch 8 | Train: 1124.5261 | Val: 52.9172
  ‚Üí Saved best model
Epoch 9 | Train: 1046.1415 | Val: 52.8430
  ‚Üí Saved best model
Epoch 10 | Train: 972.2116 | Val: 52.6479
  ‚Üí Saved best model
Epoch 11 | Train: 904.9045 | Val: 51.9730
  ‚Üí Saved best model
Epoch 12 | Train: 844.6209 | Val: 52.4782
Epoch 13 | Train: 782.2043 | Val: 52.7849
Epoch 14 | Train: 679.1109 | Val: 53.1211
Early stopping triggered!


# 4. D·ªãch t·∫≠p test

In [20]:
def translate(sentence, model, vocab_en, vocab_fr, max_len=50):
    model.eval()

    # tokenize
    tokens = list(en_tokenizer(sentence))
    ids = [vocab_en.stoi.get(tok, vocab_en.stoi["<unk>"]) for tok in tokens]
    src = torch.tensor(ids).unsqueeze(0).to(next(model.parameters()).device)

    # encode
    _, (h, c) = model.encoder(src)

    # decode t·ª´ng b∆∞·ªõc
    y_prev = torch.tensor([vocab_fr.stoi["<sos>"]], device=src.device)
    result_ids = []

    for _ in range(max_len):
        probs, h, c = model.decoder.forward_step(y_prev, h, c)
        y_prev = probs.argmax(dim=-1)

        token_id = y_prev.item()
        if token_id == vocab_fr.stoi["<eos>"]:
            break
        
        result_ids.append(token_id)

    # convert id -> word b·∫±ng vocab_fr.itos
    words = [vocab_fr.itos[i] for i in result_ids]
    return " ".join(words)

In [22]:
def decode_tensor(tensor_ids, vocab):
    # vocab.itos l√† list: index -> token
    tokens = []
    for idx in tensor_ids.tolist():
        if idx < len(vocab.itos):
            tokens.append(vocab.itos[idx])
    # b·ªè pad/sos/eos
    tokens = [t for t in tokens if t not in ["<pad>", "<sos>", "<eos>"]]
    return " ".join(tokens)

def show_examples(test_ds, model, vocab_en, vocab_fr, n=5):
    for i in range(n):
        src_tensor, tgt_tensor = test_ds[i]

        eng = decode_tensor(src_tensor, vocab_en)
        fr  = decode_tensor(tgt_tensor, vocab_fr)
        pred = translate(eng, model, vocab_en, vocab_fr)

        print(f"\n[Example {i+1}]")
        print(f"EN:   {eng}")
        print(f"FR:   {fr}")
        print(f"PRED: {pred}")


In [23]:
seq2seq_model.load_state_dict(torch.load("best_model.pt", map_location=device))
seq2seq_model.eval()
show_examples(test_ds, seq2seq_model, vocab_en, vocab_fr, n=5)




[Example 1]
EN:   A man in an orange hat starring at something .
FR:   Un homme avec un chapeau orange regardant quelque chose .
PRED: Un homme avec un chapeau orange √† quelque chose

[Example 2]
EN:   A Boston Terrier is running on lush green grass in front of a white fence .
FR:   Un terrier de Boston court sur l' herbe verdoyante devant une cl√¥ture blanche .
PRED: Un nouveau - de est allong√© sur sur herbe verte sur une verte verte .

[Example 3]
EN:   A girl in karate uniform breaking a stick with a front kick .
FR:   Une fille en tenue de karat√© brisant un b√¢ton avec un coup de pied .
PRED: Une fille en tenue de bain un un un avec un un de de .

[Example 4]
EN:   Five people wearing winter jackets and helmets stand in the snow , with <unk> in the background .
FR:   Cinq personnes avec des vestes d' hiver et des casques sont debout dans la neige , avec des <unk> en arri√®re-plan .
PRED: Cinq personnes portant des casques de des et des des , sur la dans rue , avec des et et des

# 5. ƒê√°nh gi√° m√¥ h√¨nh

In [34]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch

def compute_bleu(model, test_ds, vocab_en, vocab_fr, max_samples=500):
    model.eval()
    smoothie = SmoothingFunction().method1

    inv_fr = {i: tok for tok, i in vocab_fr.stoi.items()}

    def tensor_to_sentence(tensor_ids):
        words = []
        for idx in tensor_ids:
            token = inv_fr.get(idx.item(), "<unk>")
            if token in ["<sos>", "<pad>"]:
                continue
            if token == "<eos>":
                break
            words.append(token)
        return words

    total_bleu = 0
    count = min(max_samples, len(test_ds))

    for i in range(count):
        src_tensor, tgt_tensor = test_ds[i]

        # convert tgt_tensor ‚Üí list token words
        tgt_tokens = tensor_to_sentence(tgt_tensor)

        # translate predicted
        inv_en = {i: tok for tok, i in vocab_en.stoi.items()}

        src_words = []
        for tok in src_tensor:
            word = inv_en.get(tok.item(), "<unk>")
            if word in ["<pad>", "<sos>", "<eos>"]:
                continue
            src_words.append(word)

        src_sentence = " ".join(src_words)

        pred_text = translate(src_sentence, model, vocab_en, vocab_fr)

        pred_tokens = pred_text.split()

        # compute BLEU (unigram + bigram)
        bleu = sentence_bleu(
            [tgt_tokens], pred_tokens,
            smoothing_function=smoothie,
            weights=(0.5, 0.5, 0, 0)
        )
        total_bleu += bleu

    bleu_score = total_bleu / count
    print(f"\nüîµ BLEU score = {bleu_score:.4f} on {count} samples")
    return bleu_score


In [35]:
bleu = compute_bleu(seq2seq_model, test_ds, vocab_en, vocab_fr, max_samples=500)



üîµ BLEU score = 0.3610 on 500 samples
