# LSTM English-to-French

## I. Build the vocab before training the model

### Step 1: Prepare the dataset & tokenize the data

In [6]:
from torchtext.data.utils import get_tokenizer

# spaCy pipeline name used to tokenize each language
_SPACY_MODELS = {"en": "en_core_web_sm", "fr": "fr_core_news_sm"}

en_tokenizer = get_tokenizer("spacy", language=_SPACY_MODELS["en"])
fr_tokenizer = get_tokenizer("spacy", language=_SPACY_MODELS["fr"])

# --- Vocabulary settings ---
UNK_TOKEN = "<unk>"
# Reserved tokens handed to the vocabulary builder as `specials`
SPECIAL_TOKENS = [UNK_TOKEN, "<pad>", "<bos>", "<eos>"]
# Upper bound on vocabulary size: 10,000 regular tokens plus the special ones
MAX_VOCAB = 10_000 + len(SPECIAL_TOKENS)

# --- Paths ---
train_file_en = "./Data/train.en"
train_file_fr = "./Data/train.fr"

# --- Function ---
def get_tokens(path, tokenizer):
    """Tokenize a text file line by line.

    Args:
        path: Location of a UTF-8 encoded text file.
        tokenizer: Callable applied to each line after its surrounding
            whitespace has been stripped.

    Returns:
        A list with one entry per line of the file (blank lines included),
        each entry being whatever ``tokenizer`` returned for that line.
    """
    with open(path, encoding="utf-8") as handle:
        return [tokenizer(raw.strip()) for raw in handle]

# --- Tokenize both sides of the training corpus ---
enToken = get_tokens(path=train_file_en, tokenizer=en_tokenizer)
frToken = get_tokens(path=train_file_fr, tokenizer=fr_tokenizer)


### Step 2: Build the vocabulary

In [7]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

# ==== Build vocab function ====

def yield_tokens(token_list):
    """Yield the tokenized sentences of ``token_list`` one at a time, in order."""
    yield from token_list

# ==== Shared vocabulary construction ====

def _make_vocab(token_lists):
    """Build a vocabulary from tokenized sentences.

    The vocabulary is built with ``SPECIAL_TOKENS`` as specials and capped at
    ``MAX_VOCAB`` entries; its default index is set to the index of
    ``UNK_TOKEN``.
    """
    vocab = build_vocab_from_iterator(
        yield_tokens(token_lists),
        specials=SPECIAL_TOKENS,
        max_tokens=MAX_VOCAB,
    )
    vocab.set_default_index(vocab[UNK_TOKEN])
    return vocab


# ==== English / French vocabularies ====
en_vocab = _make_vocab(enToken)
fr_vocab = _make_vocab(frToken)

# ==== Sanity checks: vocabulary sizes and two sample lookups ====
print("English vocab size:", len(en_vocab))
print("French vocab size:", len(fr_vocab))
print("Two: ",en_vocab(['Two']))
print("Young: ",en_vocab(['young']))

English vocab size: 10004
French vocab size: 10004
Two:  [19]
Young:  [25]


Finally, we have built two vocabularies, one for each language. Each vocabulary contains the 10,000 most frequent tokens in the dataset, along with four special tokens: `<unk>`, `<pad>`, `<bos>`, and `<eos>`.

## II. Padding & Tracking

### Step 1: Pad every sentence in a batch to the same length with `pad_sequence` inside `collate_fn()`, and return the original lengths needed later by `pack_padded_sequence`

In [8]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

import torch
from torch.nn.utils.rnn import pad_sequence
def collate_fn(sentences, vocab, pad_token='<pad>'):
    """Convert a batch of tokenized sentences into one padded index tensor.

    Args:
        sentences: Batch of token sequences.
        vocab: Object indexed as ``vocab[token]`` to obtain a token's integer
            id; it must contain ``pad_token``.
        pad_token: Token whose id fills the positions past the end of each
            sentence.

    Returns:
        A ``(padded, lengths)`` tuple. ``padded`` is an int64 tensor of shape
        (batch, longest sentence); ``lengths`` is an int64 tensor holding the
        unpadded token count of each sentence.
    """
    pad_idx = vocab[pad_token]

    # Pin the dtype: torch.tensor([]) defaults to float32, so an empty
    # sentence would otherwise produce a float tensor and (when it comes
    # first in the batch) turn the whole padded batch into floats.
    batch_indices = [
        torch.tensor([vocab[token] for token in sentence], dtype=torch.long)
        for sentence in sentences
    ]

    lengths = torch.tensor([len(seq) for seq in batch_indices], dtype=torch.long)

    src_padded = pad_sequence(batch_indices, batch_first=True, padding_value=pad_idx)
    return src_padded, lengths


### Step 2: Initialize the DataLoader that feeds batches to the model during training

In [9]:
from torch.utils.data import DataLoader
from functools import partial

loader = DataLoader(
    enToken,
    batch_size=128,
    # IMPORTANT: keep the corpus order (presumably so batches stay aligned
    # with the French sentences — confirm)
    shuffle=False,
    collate_fn=partial(collate_fn, vocab=en_vocab),
)

# Single pass over the loader: report each batch's padded shape and pack it.
# (Iterating the loader twice would collate every batch twice.)
for batch_idx, (padded_batch, lengths) in enumerate(loader):
    print(f"\n=== Batch {batch_idx} ===")
    print("Padded batch shape:", padded_batch.shape)
    # print("Lengths:", lengths)

    # `lengths` holds the original (unpadded) length of every sentence in the
    # batch. With enforce_sorted=True the sequences must be ordered
    # longest-first, so sort the lengths and reorder the rows to match.
    lengths_sorted, sorted_idx = torch.sort(lengths, descending=True)
    padded_sorted = padded_batch[sorted_idx]

    packed_input = pack_padded_sequence(
        padded_sorted, lengths_sorted, batch_first=True, enforce_sorted=True
    )




=== Batch 0 ===
Padded batch shape: torch.Size([128, 22])

=== Batch 1 ===
Padded batch shape: torch.Size([128, 35])

=== Batch 2 ===
Padded batch shape: torch.Size([128, 34])

=== Batch 3 ===
Padded batch shape: torch.Size([128, 26])

=== Batch 4 ===
Padded batch shape: torch.Size([128, 29])

=== Batch 5 ===
Padded batch shape: torch.Size([128, 23])

=== Batch 6 ===
Padded batch shape: torch.Size([128, 25])

=== Batch 7 ===
Padded batch shape: torch.Size([128, 22])

=== Batch 8 ===
Padded batch shape: torch.Size([128, 24])

=== Batch 9 ===
Padded batch shape: torch.Size([128, 26])

=== Batch 10 ===
Padded batch shape: torch.Size([128, 26])

=== Batch 11 ===
Padded batch shape: torch.Size([128, 31])

=== Batch 12 ===
Padded batch shape: torch.Size([128, 33])

=== Batch 13 ===
Padded batch shape: torch.Size([128, 27])

=== Batch 14 ===
Padded batch shape: torch.Size([128, 26])

=== Batch 15 ===
Padded batch shape: torch.Size([128, 27])

=== Batch 16 ===
Padded batch shape: torch.Size([

## III. Build the LSTM model

### Building the encoder class

In [None]:
class LSTM:
    """Skeleton of a hand-written LSTM; no method is implemented yet.

    Every method is currently a placeholder that returns ``None``.

    The helpers that were declared without ``self`` are static methods so
    that they can be called both on the class and on an instance; as plain
    functions they raised ``TypeError`` when called on an instance.
    """

    def __init__(self, input_size, hidden_size):
        """Placeholder constructor; the arguments are not stored yet."""

    @staticmethod
    def sigmoid(x):
        """Placeholder for the sigmoid activation (not implemented)."""

    @staticmethod
    def tanh(x):
        """Placeholder for the tanh activation (not implemented)."""

    # NOTE(review): the gate helpers will presumably need access to the
    # instance's parameters once implemented; turn them into regular methods
    # taking `self` at that point.
    @staticmethod
    def inputGate(x):
        """Placeholder for the input gate (not implemented)."""

    @staticmethod
    def stateGate():
        """Placeholder for the state gate (not implemented)."""

    def forward(self, x, h, c):
        """Placeholder for the forward step taking ``x``, ``h`` and ``c``."""

