# LSTM English-to-French

## I. Build the vocab before training the model

### Step 1: Prepare the dataset & tokenize the data

In [None]:
from torchtext.data.utils import get_tokenizer

# Tokenizers: spaCy-backed, one pipeline per language.
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
fr_tokenizer = get_tokenizer("spacy", language="fr_core_news_sm")

# Special tokens passed to both vocabularies as `specials`.
SPECIAL_TOKENS = ["<unk>", "<pad>", "<bos>", "<eos>"]
# Vocabulary budget: 10,000 regular tokens plus the special tokens (= 10,004).
# Derived from SPECIAL_TOKENS so the budget stays right if that list changes.
MAX_VOCAB = 10_000 + len(SPECIAL_TOKENS)
# Token whose index is used as the fallback for out-of-vocabulary lookups.
UNK_TOKEN = "<unk>"

def get_tokens(path, tokenizer):
    """Tokenize a UTF-8 text file line by line.

    path: file to read.
    tokenizer: callable applied to each line after surrounding whitespace
        has been stripped.

    Returns a list with one tokenizer result per line, in file order. Blank
    lines are kept (as whatever the tokenizer returns for ""), so entry i of
    the result always corresponds to line i of the file.
    """
    with open(path, encoding="utf-8") as corpus:
        return [tokenizer(raw_line.strip()) for raw_line in corpus]

# Tokenize the training files; entry i of each list holds the tokens of line i of its file.
enToken = get_tokens("./Data/train.en", en_tokenizer)
frToken = get_tokens("./Data/train.fr", fr_tokenizer)

### Step 2: Build the vocabulary

In [14]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

# ==== Build vocab function ====

def yield_tokens(token_list):
    """Lazily yield each element of `token_list` (one token list per sentence), in order."""
    yield from token_list

# ==== Shared vocab recipe (used for both languages) ====

def build_vocab(token_lists):
    """Build a torchtext vocabulary from tokenized sentences.

    token_lists: iterable of token lists, one per sentence.

    Returns a vocab limited to MAX_VOCAB entries, with SPECIAL_TOKENS passed
    as `specials`, whose lookups of unknown tokens fall back to the index of
    UNK_TOKEN.
    """
    vocab = build_vocab_from_iterator(
        yield_tokens(token_lists),
        specials=SPECIAL_TOKENS,
        max_tokens=MAX_VOCAB,
    )
    # Route out-of-vocabulary tokens to <unk> instead of failing the lookup.
    vocab.set_default_index(vocab[UNK_TOKEN])
    return vocab

# ==== English vocab ====
en_vocab = build_vocab(enToken)

# ==== French vocab ====
fr_vocab = build_vocab(frToken)

# ==== Check vocab size ====
print("English vocab size:", len(en_vocab))
print("French vocab size:", len(fr_vocab))


English vocab size: 10004
French vocab size: 10004


We have now built one vocabulary per language. Each vocabulary contains the 10,000 most frequent tokens in the training data, along with four special tokens: `<unk>`, `<pad>`, `<bos>`, and `<eos>`.

## II. Padding & Tracking

### Step 1: Pad each batch to a common length with `pad_sequence`

In [16]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(sentences, vocab, pad_token='<pad>'):
    """
    Convert a batch of tokenized sentences into one padded index tensor.

    sentences: list of sentences, each an iterable of tokens. Raw strings
        should be tokenized before calling; a plain string would be iterated
        character by character.
    vocab: token-to-index mapping supporting vocab[token], e.g. a torchtext
        vocab object
    pad_token: token whose index fills the positions after a sentence ends

    Returns (src_padded, lengths):
        src_padded: int64 tensor of shape (batch, longest sentence), padded
            with the index of pad_token
        lengths: list of the unpadded length of each sentence, in batch order
    """
    pad_idx = vocab[pad_token]

    # 1. Map tokens to indices. The dtype is pinned because torch.tensor([])
    #    would otherwise infer float32 for an empty sentence, and pad_sequence
    #    takes the output dtype from the first sequence in the batch.
    batch_indices = [
        torch.tensor([vocab[token] for token in sentence], dtype=torch.long)
        for sentence in sentences
    ]

    # 2. Record the true lengths before padding
    lengths = [len(seq) for seq in batch_indices]

    # 3. Pad every sequence to the length of the longest one
    src_padded = pad_sequence(batch_indices, batch_first=True, padding_value=pad_idx)

    return src_padded, lengths

# collate_fn(,en_vocab)

In [None]:


# packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
