# LSTM English-to-French

## I. Build the vocab before training the model

### Step 1: Prepare the dataset & tokenize the data

In [1]:
from torchtext.data.utils import get_tokenizer

# Tokenizers
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
fr_tokenizer = get_tokenizer("spacy", language="fr_core_news_sm")

SPECIAL_TOKENS = ["<unk>", "<pad>", "<bos>", "<eos>"]
MAX_VOCAB = 10_004
UNK_TOKEN = "<unk>"

# --- Paths ---
train_file_en = "./Data/train.en"
train_file_fr = "./Data/train.fr"

# --- Function ---
def get_tokens(path, tokenizer):
    all_tokens = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            all_tokens.append(tokenizer(line.strip()))
    return all_tokens

# --- Load tokens ---
enToken = get_tokens(train_file_en, en_tokenizer)
frToken = get_tokens(train_file_fr, fr_tokenizer)




### Step 2: Build the vocabulary

In [2]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

# ==== Build vocab function ====

def yield_tokens(token_list):
    for tokens in token_list:
        yield tokens

# ==== English vocab ====

en_vocab = build_vocab_from_iterator(
    yield_tokens(enToken),
    specials=SPECIAL_TOKENS,
    max_tokens=MAX_VOCAB
)

en_vocab.set_default_index(en_vocab[UNK_TOKEN])

# ==== French vocab ====

fr_vocab = build_vocab_from_iterator(
    yield_tokens(frToken),
    specials=SPECIAL_TOKENS,
    max_tokens=MAX_VOCAB
)

fr_vocab.set_default_index(fr_vocab[UNK_TOKEN])

# ==== Check vocab size ====
print("English vocab size:", len(en_vocab))
print("French vocab size:", len(fr_vocab))
print("Two: ",en_vocab(['Two']))
print("Young: ",en_vocab(['young']))



English vocab size: 10004
French vocab size: 10004
Two:  [19]
Young:  [25]


Finally, we have built two vocabularies for the two languages. Each vocabulary contains the 10,000 most frequent words in the dataset, along with four special tokens: <`unk`>, <`pad`>, <`sos`>, and <`eos`>.

## II. Padding & Tracking

### Step 1: Sync the lenght of batch with pad_sequence and pack padded sequence with collate_fn() function

In [3]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torch
def collate_fn(batch, src_vocab, tgt_vocab, pad_token='<pad>', sos_token='<sos>', eos_token='<eos>'):
    """
    batch: list of tuples (src_sentence, tgt_sentence)
    src_vocab, tgt_vocab: vocab objects
    """
    src_pad_idx = src_vocab[pad_token]
    tgt_pad_idx = tgt_vocab[pad_token]
    sos_idx = tgt_vocab[sos_token]
    eos_idx = tgt_vocab[eos_token]

    # tách source và target
    src_sentences, tgt_sentences = zip(*batch)

    # chuyển token -> index
    src_indices = [torch.tensor([src_vocab[token] for token in s]) for s in src_sentences]
    # thêm <sos> và <eos> cho target
    tgt_indices = [
        torch.tensor([sos_idx] + [tgt_vocab[token] for token in t] + [eos_idx])
        for t in tgt_sentences
    ]

    # độ dài gốc của source
    src_lengths = torch.tensor([len(seq) for seq in src_indices])

    # pad sequences
    src_padded = pad_sequence(src_indices, batch_first=True, padding_value=src_pad_idx)
    tgt_padded = pad_sequence(tgt_indices, batch_first=True, padding_value=tgt_pad_idx)

    return src_padded, src_lengths, tgt_padded


### Step 2: Initiliazel Dataloader to control for trainning model

In [4]:
from torch.utils.data import DataLoader
from functools import partial
dataset = list(zip(enToken, frToken))  # enToken = source, frToken = target


loader = DataLoader(
    dataset,  # dataset là list zip(enToken, frToken)
    batch_size=128,
    shuffle=False,
    collate_fn=partial(collate_fn, src_vocab=en_vocab, tgt_vocab=fr_vocab)
)


# III. Build the model LSTM

## Encoder class building

In [5]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=en_vocab["<pad>"])
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)

    def forward(self, src_batch, src_lengths):
        # src_batch: (batch, seq_len)
        embedded = self.embedding(src_batch)
        # pack sequences
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=True
        )
        packed_out, (h, c) = self.lstm(packed)
        # nếu cần output unpacked:
        # out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        return h, c


In [6]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, teacher_forcing_ratio=1.0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=fr_vocab["<pad>"])
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.teacher_forcing_ratio = teacher_forcing_ratio  # 0.0 ~ 1.0

    def forward(self, target_ids, h0, c0):
        """
        target_ids: (batch, seq_len)
        h0, c0: hidden state from encoder
        """
        batch_size, seq_len = target_ids.size()
        vocab_size = self.fc.out_features

        # Tensor để lưu logits step-by-step
        outputs = torch.zeros(batch_size, seq_len, vocab_size, device=target_ids.device)

        # Input đầu tiên: <bos>
        input_token = target_ids[:, 0]  # giả sử target đã có <bos> ở đầu

        h, c = h0, c0

        for t in range(1, seq_len):
            embedded = self.embedding(input_token).unsqueeze(1)  # (batch, 1, embed_dim)
            out, (h, c) = self.lstm(embedded, (h, c))  # out: (batch, 1, hidden_size)
            logits = self.fc(out.squeeze(1))  # (batch, vocab_size)
            outputs[:, t, :] = logits

            # teacher forcing: dùng token thật làm input cho step tiếp theo
            use_teacher_forcing = True if torch.rand(1).item() < self.teacher_forcing_ratio else False
            input_token = target_ids[:, t] if use_teacher_forcing else logits.argmax(1)

        return outputs  # (batch, seq_len, vocab_size)


In [7]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src_batch, src_lengths, tgt_batch):
        h, c = self.encoder(src_batch, src_lengths)
        logits = self.decoder(tgt_batch, h, c)
        return logits
    
device = "cuda" if torch.cuda.is_available() else "cpu"

encoder = Encoder(vocab_size=len(en_vocab), embed_dim=256, hidden_size=512).to(device)
decoder = Decoder(vocab_size=len(fr_vocab), embed_dim=256, hidden_size=512).to(device)

model = Seq2Seq(encoder, decoder).to(device)

pad_idx = fr_vocab["<pad>"]  # hoặc fr_vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
patience = 3
counter = 0
best_loss = float('inf')

for epoch in range(20):
    model.train()
    total_loss = 0

    for src_batch, src_lengths, tgt_batch in loader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        src_lengths = src_lengths.to(device)

        lengths_sorted, sorted_idx = torch.sort(src_lengths, descending=True)
        src_sorted = src_batch[sorted_idx]
        tgt_sorted = tgt_batch[sorted_idx]

        optimizer.zero_grad()
        logits = model(src_sorted, lengths_sorted, tgt_sorted)
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_sorted.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss = {total_loss:.4f}")

    # save best model
    if total_loss < best_loss:
        best_loss = total_loss
        torch.save(model.state_dict(), "best_seq2seq.pt")
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    # checkpoint mỗi epoch
    torch.save(model.state_dict(), f"seq2seq_epoch{epoch+1}.pt")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\Lenovo\anaconda3\envs\nlp\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\Lenovo\anaconda3\envs\nlp\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\Lenovo\anaconda3\envs\nlp\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\Lenovo\anaconda3\envs\nlp\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  Fil

KeyboardInterrupt: 

In [None]:
# tạo model và load weights
encoder = Encoder(vocab_size=len(en_vocab), embed_dim=256, hidden_size=512).to(device)
decoder = Decoder(vocab_size=len(fr_vocab), embed_dim=256, hidden_size=512).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# load best model
model.load_state_dict(torch.load("best_seq2seq.pt", map_location=device))
model.eval()  # chuyển sang evaluation mode
def translate(sentence: str) -> str:
    model.eval()  # chế độ inference

    # 1️⃣ Tokenize câu tiếng Anh (giả sử bạn có tokenizer)
    # Ví dụ đơn giản: split theo space
    tokens = sentence.strip().split()

    # 2️⃣ Chuyển tokens -> indices
    stoi = en_vocab.get_stoi()
    src_indices = [stoi[token] if token in stoi else stoi["<unk>"] for token in tokens]

    # 3️⃣ Chuyển sang tensor, batch_size=1
    src_tensor = torch.tensor([src_indices], dtype=torch.long).to(device)
    src_lengths = torch.tensor([len(src_indices)]).to(device)

    # 4️⃣ Encoder
    with torch.no_grad():
        h, c = model.encoder(src_tensor, src_lengths)

        # 5️⃣ Greedy decoding với <sos> làm token khởi đầu
        sos_idx = fr_vocab["<sos>"]
        eos_idx = fr_vocab["<eos>"]

        output_indices = [sos_idx]
        input_idx = torch.tensor([[sos_idx]], dtype=torch.long).to(device)

        for _ in range(50):  # max length
            logits = model.decoder(input_idx, h, c)  # shape (1, 1, vocab_size)
            next_token = logits.argmax(-1)[:, -1].item()
            if next_token == eos_idx:
                break
            output_indices.append(next_token)
            input_idx = torch.tensor([[next_token]], dtype=torch.long).to(device)

    # 6️⃣ Convert indices -> tokens
    itos = {v:k for k,v in fr_vocab.get_stoi().items()}
    output_tokens = [itos[idx] for idx in output_indices[1:]]  # bỏ <sos>

    # 7️⃣ Join thành câu tiếng Pháp
    translated_french_sentence = " ".join(output_tokens)
    return translated_french_sentence

# Ví dụ sử dụng:
sentence_en = "Two"
print(translate(sentence_en))


<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
