In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader
import spacy
import os
import random
from collections import Counter
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [6]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [None]:
# !pip install torch


# ============================================================================
# 1. CHU·∫®N B·ªä D·ªÆ LI·ªÜU
# ============================================================================

In [None]:
class Vocabulary:
    """X√¢y d·ª±ng t·ª´ ƒëi·ªÉn cho m·ªôt ng√¥n ng·ªØ"""
    def __init__(self, max_vocab_size=10000):
        self.max_vocab_size = max_vocab_size
        self.word2idx = {}
        self.idx2word = {}
        self.word_freq = Counter()

        # Token ƒë·∫∑c bi·ªát
        self.PAD_TOKEN = '<pad>'
        self.UNK_TOKEN = '<unk>'
        self.SOS_TOKEN = '<sos>'
        self.EOS_TOKEN = '<eos>'

        self.pad_idx = 0
        self.unk_idx = 1
        self.sos_idx = 2
        self.eos_idx = 3

    def build_vocab(self, sentences):
        """X√¢y d·ª±ng t·ª´ ƒëi·ªÉn t·ª´ danh s√°ch c√¢u"""
        # ƒê·∫øm t·∫ßn su·∫•t t·ª´
        for sentence in sentences:
            self.word_freq.update(sentence)

        # L·∫•y top t·ª´ ph·ªï bi·∫øn
        most_common = self.word_freq.most_common(self.max_vocab_size - 4)

        # Th√™m token ƒë·∫∑c bi·ªát
        self.word2idx = {
            self.PAD_TOKEN: self.pad_idx,
            self.UNK_TOKEN: self.unk_idx,
            self.SOS_TOKEN: self.sos_idx,
            self.EOS_TOKEN: self.eos_idx
        }

        # Th√™m c√°c t·ª´ ph·ªï bi·∫øn
        for idx, (word, freq) in enumerate(most_common, start=4):
            self.word2idx[word] = idx

        # T·∫°o √°nh x·∫° ng∆∞·ª£c
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

    def encode(self, sentence):
        """Chuy·ªÉn c√¢u th√†nh list index"""
        return [self.word2idx.get(word, self.unk_idx) for word in sentence]

    def decode(self, indices):
        """Chuy·ªÉn list index th√†nh c√¢u"""
        return [self.idx2word.get(idx, self.UNK_TOKEN) for idx in indices]

    def __len__(self):
        return len(self.word2idx)


class TranslationDataset(Dataset):
    """Dataset cho d·ªãch m√°y"""
    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src = self.src_sentences[idx]
        tgt = self.tgt_sentences[idx]

        # Encode th√†nh indices
        src_indices = self.src_vocab.encode(src) + [self.src_vocab.eos_idx]
        tgt_indices = [self.tgt_vocab.sos_idx] + self.tgt_vocab.encode(tgt) + [self.tgt_vocab.eos_idx]

        return torch.LongTensor(src_indices), torch.LongTensor(tgt_indices)


def collate_fn(batch):
    """Collate function ƒë·ªÉ x·ª≠ l√Ω padding v√† sorting"""
    src_batch, tgt_batch = zip(*batch)

    # S·∫Øp x·∫øp theo ƒë·ªô d√†i gi·∫£m d·∫ßn (b·∫Øt bu·ªôc cho pack_padded_sequence)
    src_lengths = torch.LongTensor([len(s) for s in src_batch])
    sorted_indices = src_lengths.argsort(descending=True)

    src_batch = [src_batch[i] for i in sorted_indices]
    tgt_batch = [tgt_batch[i] for i in sorted_indices]
    src_lengths = src_lengths[sorted_indices]

    # Padding
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=0)

    return src_padded, src_lengths, tgt_padded

# ============================================================================
# 2. M√î H√åNH ENCODER-DECODER
# ============================================================================

In [None]:


class Encoder(nn.Module):
    """LSTM Encoder"""
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                           batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_lengths):
        """
        Args:
            src: (batch_size, src_len)
            src_lengths: (batch_size,)
        Returns:
            hidden: (num_layers, batch_size, hidden_size)
            cell: (num_layers, batch_size, hidden_size)
        """
        embedded = self.dropout(self.embedding(src))  # (batch, src_len, emb_dim)

        # Pack sequence
        packed = pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=True)

        # LSTM
        packed_output, (hidden, cell) = self.lstm(packed)

        # hidden: (num_layers, batch, hidden_size)
        # cell: (num_layers, batch, hidden_size)
        return hidden, cell


class Decoder(nn.Module):
    """LSTM Decoder"""
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                           batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc_out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden, cell):
        """
        Args:
            input_token: (batch_size, 1)
            hidden: (num_layers, batch_size, hidden_size)
            cell: (num_layers, batch_size, hidden_size)
        Returns:
            prediction: (batch_size, vocab_size)
            hidden: (num_layers, batch_size, hidden_size)
            cell: (num_layers, batch_size, hidden_size)
        """
        embedded = self.dropout(self.embedding(input_token))  # (batch, 1, emb_dim)

        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # output: (batch, 1, hidden_size)

        prediction = self.fc_out(output.squeeze(1))  # (batch, vocab_size)

        return prediction, hidden, cell


# Thay th·∫ø class Seq2Seq.forward b·∫±ng ƒëo·∫°n n√†y
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_lengths, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.vocab_size

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        # Encoder
        hidden, cell = self.encoder(src, src_lengths)

        # N·∫øu d√πng full teacher forcing -> vectorized decoding (1 pass LSTM)
        if teacher_forcing_ratio >= 1.0 - 1e-9:
            # Input to decoder LSTM are the target tokens excluding the last token
            # (we use targets as inputs when teacher forcing)
            # tgt_in: shape (batch, tgt_len-1)
            tgt_in = tgt[:, :-1]  # exclude final <eos> if present
            embedded = self.decoder.embedding(tgt_in)  # (batch, seq_len, emb_dim)
            embedded = self.decoder.dropout(embedded)

            # Run decoder LSTM once for the whole sequence
            decoder_outputs, (hidden, cell) = self.decoder.lstm(embedded, (hidden, cell))
            # decoder_outputs: (batch, seq_len, hidden_size)
            # Map to vocab
            pred = self.decoder.fc_out(decoder_outputs)  # (batch, seq_len, vocab_size)

            # Place predictions into outputs aligned so that outputs[:, t, :] predicts token at tgt[:, t]
            outputs[:, 1:tgt_len, :] = pred  # note: pred corresponds to steps 1..tgt_len-1
            # (outputs[:,0,:] stays zeros or you can set it to prediction for <sos> if you want)
            return outputs

        # Fallback: mixed or no teacher forcing -> step-by-step (c≈©)
        decoder_input = tgt[:, 0].unsqueeze(1)  # <sos>
        for t in range(1, tgt_len):
            prediction, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t, :] = prediction
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = prediction.argmax(1).unsqueeze(1)
            decoder_input = tgt[:, t].unsqueeze(1) if teacher_force else top1

        return outputs


# ============================================================================
# 3. HU·∫§N LUY·ªÜN
# ============================================================================


In [7]:
def train_epoch(model, dataloader, optimizer, criterion, clip, device):
    """Hu·∫•n luy·ªán 1 epoch"""
    model.train()
    epoch_loss = 0

    for src, src_lengths, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()

        # Forward
        output = model(src, src_lengths, tgt, teacher_forcing_ratio=0.5)

        # output: (batch, tgt_len, vocab_size)
        # tgt: (batch, tgt_len)

        # B·ªè <sos> token v√† flatten
        output = output[:, 1:, :].reshape(-1, output.shape[-1])
        tgt = tgt[:, 1:].reshape(-1)

        # T√≠nh loss
        loss = criterion(output, tgt)

        # Backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
    """ƒê√°nh gi√° tr√™n t·∫≠p validation"""
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, src_lengths, tgt in dataloader:
            src = src.to(device)
            tgt = tgt.to(device)

            # Forward (kh√¥ng teacher forcing)
            output = model(src, src_lengths, tgt, teacher_forcing_ratio=0)

            output = output[:, 1:, :].reshape(-1, output.shape[-1])
            tgt = tgt[:, 1:].reshape(-1)

            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# ============================================================================
# 4. D·ª∞ ƒêO√ÅN (INFERENCE)
# ============================================================================


In [8]:
def translate_sentence(model, sentence, src_vocab, tgt_vocab, src_tokenizer, device, max_len=50):
    """
    D·ªãch m·ªôt c√¢u t·ª´ ti·∫øng Anh sang ti·∫øng Ph√°p

    Args:
        model: m√¥ h√¨nh ƒë√£ train
        sentence: c√¢u ti·∫øng Anh (string)
        src_vocab, tgt_vocab: t·ª´ ƒëi·ªÉn
        src_tokenizer: tokenizer cho ti·∫øng Anh
        device: cuda/cpu
        max_len: ƒë·ªô d√†i t·ªëi ƒëa c√¢u d·ªãch

    Returns:
        translated_sentence: c√¢u ti·∫øng Ph√°p (string)
    """
    model.eval()

    # Tokenize
    tokens = src_tokenizer(sentence.lower())
    tokens = [token.text for token in tokens]

    # Encode
    indices = src_vocab.encode(tokens) + [src_vocab.eos_idx]
    src_tensor = torch.LongTensor(indices).unsqueeze(0).to(device)  # (1, src_len)
    src_lengths = torch.LongTensor([len(indices)])

    with torch.no_grad():
        # Encoder
        hidden, cell = model.encoder(src_tensor, src_lengths)

        # Decoder
        decoder_input = torch.LongTensor([tgt_vocab.sos_idx]).unsqueeze(0).to(device)
        translated_indices = []

        for _ in range(max_len):
            prediction, hidden, cell = model.decoder(decoder_input, hidden, cell)
            predicted_token = prediction.argmax(1).item()

            if predicted_token == tgt_vocab.eos_idx:
                break

            translated_indices.append(predicted_token)
            decoder_input = torch.LongTensor([predicted_token]).unsqueeze(0).to(device)

    # Decode
    translated_tokens = tgt_vocab.decode(translated_indices)
    translated_sentence = ' '.join(translated_tokens)

    return translated_sentence


def calculate_bleu(model, test_data, src_vocab, tgt_vocab, src_tokenizer, device):
    """T√≠nh BLEU score tr√™n t·∫≠p test"""
    bleu_scores = []
    smoothing = SmoothingFunction().method1

    for src_sent, tgt_sent in test_data:
        src_text = ' '.join(src_sent)
        translated = translate_sentence(model, src_text, src_vocab, tgt_vocab, src_tokenizer, device)

        reference = [tgt_sent]
        candidate = translated.split()

        score = sentence_bleu(reference, candidate, smoothing_function=smoothing)
        bleu_scores.append(score)

    return np.mean(bleu_scores)


In [None]:
# ---------------------
# üìÅ ƒê∆∞·ªùng d·∫´n file d·ªØ li·ªáu
# ---------------------
en_path = r"D:\HK7NAM4\LTSM\English_French_Machine_Translation_LSTM\dataset\fr-en\europarl-v7.fr-en.en"
fr_path = r"D:\HK7NAM4\LTSM\English_French_Machine_Translation_LSTM\dataset\fr-en\europarl-v7.fr-en.fr"

# ---------------------
# üìÑ H√†m ƒë·ªçc file
# ---------------------
def load_lines(path):
    lines = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if line:
                lines.append(line)
    return lines

# ---------------------
# üîπ ƒê·ªçc d·ªØ li·ªáu g·ªëc
# ---------------------
print("üîπ ƒêang ƒë·ªçc d·ªØ li·ªáu g·ªëc...")
en_lines = load_lines(en_path)
fr_lines = load_lines(fr_path)
print(f"T·ªïng s·ªë c√¢u song ng·ªØ: {len(en_lines):,}")

üîπ ƒêang ƒë·ªçc d·ªØ li·ªáu g·ªëc...
T·ªïng s·ªë c√¢u song ng·ªØ: 2,005,688


1Ô∏è‚É£ D√≤ng tr·ªëng ho·∫∑c whitespace

line.strip() lo·∫°i b·ªè kho·∫£ng tr·∫Øng, v√† if line: b·ªè c√°c d√≤ng r·ªóng.

N·∫øu file dataset c√≥ nhi·ªÅu d√≤ng tr·ªëng, ch√∫ng s·∫Ω b·ªã lo·∫°i b·ªè.

2Ô∏è‚É£ D√≤ng kh√¥ng kh·ªõp gi·ªØa 2 file

File EN v√† file FR ph·∫£i c√πng s·ªë d√≤ng, m·ªói d√≤ng t∆∞∆°ng ·ª©ng l√† c·∫∑p c√¢u song ng·ªØ.

N·∫øu m·ªôt file c√≥ √≠t d√≤ng h∆°n, b·∫°n ph·∫£i c·∫Øt b·∫±ng nhau b·∫±ng min(len(en_lines), len(fr_lines)).

Nh·ªØng d√≤ng th·ª´a ·ªü file d√†i h∆°n s·∫Ω b·ªã lo·∫°i b·ªè.

3Ô∏è‚É£ L·ªói encoding / k√Ω t·ª± ƒë·∫∑c bi·ªát

B·∫°n d√πng errors='ignore', nghƒ©a l√† nh·ªØng d√≤ng c√≥ k√Ω t·ª± kh√¥ng ƒë·ªçc ƒë∆∞·ª£c s·∫Ω b·ªã b·ªè.

Europarl c√≥ nhi·ªÅu k√Ω t·ª± ƒë·∫∑c bi·ªát ho·∫∑c m√£ l·ªói, n√™n nhi·ªÅu d√≤ng b·ªã m·∫•t.

4Ô∏è‚É£ Ch·ªâ gi·ªØ nh·ªØng d√≤ng kh√¥ng r·ªóng v√† h·ª£p l·ªá

Khi train_test_split ch·∫°y, ch·ªâ nh·ªØng c·∫∑p d√≤ng t·ªìn t·∫°i ·ªü c·∫£ 2 file m·ªõi ƒë∆∞·ª£c d√πng ‚Üí s·ªë l∆∞·ª£ng gi·∫£m m·∫°nh.

In [17]:
# Hyperparameters
EMBEDDING_DIM = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 2
DROPOUT = 0.5
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 10
CLIP = 1
PATIENCE = 3  # cho early stopping

In [None]:
# Ch·ªçn s·ªë c√¢u t·ªëi ƒëa b·∫±ng ƒë·ªô d√†i nh·ªè h∆°n
min_len = min(len(en_lines), len(fr_lines))
en_lines = en_lines[:min_len]
fr_lines = fr_lines[:min_len]

print(f"S·ªë c√¢u sau khi ƒë·ªìng b·ªô: {len(en_lines):,}")


NameError: name 'en_lines' is not defined

In [None]:
# ---------------------
# ‚úÇÔ∏è Chia train / val / test
# ---------------------
from sklearn.model_selection import train_test_split

train_en, temp_en, train_fr, temp_fr = train_test_split(
    en_lines, fr_lines, test_size=0.1, random_state=42
)
val_en, test_en, val_fr, test_fr = train_test_split(
    temp_en, temp_fr, test_size=0.5, random_state=42
)

print(f"Train set: {len(train_en):,} c√¢u")
print(f"Validation set: {len(val_en):,} c√¢u")
print(f"Test set: {len(test_en):,} c√¢u")

Train set: 1,804,311 c√¢u
Validation set: 100,240 c√¢u
Test set: 100,240 c√¢u


In [None]:
# ---------------------
# üî† Tokenizer (SpaCy)
# ---------------------
import spacy

try:
    en_tokenizer = spacy.load("en_core_web_sm", disable=["parser", "ner", "tagger"])
except Exception:
    print("‚ö†Ô∏è en_core_web_sm not found ‚Üí d√πng spacy.blank('en')")
    en_tokenizer = spacy.blank("en")

try:
    fr_tokenizer = spacy.load("fr_core_news_sm", disable=["parser", "ner", "tagger"])
except Exception:
    print("‚ö†Ô∏è fr_core_news_sm not found ‚Üí d√πng spacy.blank('fr')")
    fr_tokenizer = spacy.blank("fr")

print("‚úÖ Tokenizers loaded successfully!")


‚ö†Ô∏è en_core_web_sm not found ‚Üí d√πng spacy.blank('en')
‚ö†Ô∏è fr_core_news_sm not found ‚Üí d√πng spacy.blank('fr')
‚úÖ Tokenizers loaded successfully!


In [None]:
from tqdm import tqdm

def tokenize_lines_fast(lines, tokenizer, n_process=4, batch_size=1000):
    """
    Tokenize nhanh b·∫±ng spaCy.pipe (ƒëa lu·ªìng, √≠t t·ªën RAM)
    """
    tokenized = []
    for doc in tqdm(
        tokenizer.pipe(lines, n_process=n_process, batch_size=batch_size),
        total=len(lines),
        desc="üîπ Tokenizing"
    ):
        toks = [t.text.lower() for t in doc if t.text.strip()]
        tokenized.append(toks)
    return tokenized

print("‚úÖ Defined fast tokenization function.")


‚úÖ Defined fast tokenization function.


In [None]:
print("üîπ Tokenizing train set...")
train_en_tok = tokenize_lines_fast(train_en, en_tokenizer)
train_fr_tok = tokenize_lines_fast(train_fr, fr_tokenizer)

print("üîπ Tokenizing validation set...")
val_en_tok = tokenize_lines_fast(val_en, en_tokenizer)
val_fr_tok = tokenize_lines_fast(val_fr, fr_tokenizer)

print("Tokenizing test set...")
test_en_tok = tokenize_lines_fast(test_en, en_tokenizer)
test_fr_tok = tokenize_lines_fast(test_fr, fr_tokenizer)

print("Done tokenizing all datasets!")


üîπ Tokenizing train set...


üîπ Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1804311/1804311 [19:25<00:00, 1548.24it/s]
üîπ Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1804311/1804311 [23:45<00:00, 1266.00it/s]


üîπ Tokenizing validation set...


üîπ Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100240/100240 [01:29<00:00, 1122.63it/s]
üîπ Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100240/100240 [01:47<00:00, 929.23it/s] 


Tokenizing test set...


üîπ Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100240/100240 [01:28<00:00, 1127.70it/s]
üîπ Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100240/100240 [01:43<00:00, 972.57it/s] 

Done tokenizing all datasets!





In [None]:
import pickle

with open("tokenized_data.pkl", "wb") as f:
    pickle.dump({
        "train_en_tok": train_en_tok,
        "train_fr_tok": train_fr_tok,
        "val_en_tok": val_en_tok,
        "val_fr_tok": val_fr_tok,
        "test_en_tok": test_en_tok,
        "test_fr_tok": test_fr_tok
    }, f)

print("‚úÖ ƒê√£ l∆∞u tokenized dataset th√†nh tokenized_data.pkl")

‚úÖ ƒê√£ l∆∞u tokenized dataset th√†nh tokenized_data.pkl


In [10]:
import pickle

with open("D:/HK7NAM4/LTSM/tokenized_data.pkl", "rb") as f:
    data = pickle.load(f)

train_en_tok = data["train_en_tok"]
train_fr_tok = data["train_fr_tok"]
val_en_tok = data["val_en_tok"]
val_fr_tok = data["val_fr_tok"]
test_en_tok = data["test_en_tok"]
test_fr_tok = data["test_fr_tok"]

print("‚úÖ Loaded tokenized dataset from Drive!")


‚úÖ Loaded tokenized dataset from Drive!


In [18]:
# --------------------------
# üìâ Subset ƒë·ªÉ train nhanh h∆°n
# --------------------------
subset_size = 50_000  # ho·∫∑c 300_000 n·∫øu GPU b·∫°n ·ªïn

train_en_tok = train_en_tok[:subset_size]
train_fr_tok = train_fr_tok[:subset_size]

# (val v√† test c√≥ th·ªÉ gi·ªØ nguy√™n, v√¨ ch√∫ng nh·ªè)
print(f"‚úÖ ƒê√£ r√∫t g·ªçn t·∫≠p train xu·ªëng {len(train_en_tok):,} c√¢u.")


‚úÖ ƒê√£ r√∫t g·ªçn t·∫≠p train xu·ªëng 50,000 c√¢u.


In [19]:
print("Building vocabularies...")

src_vocab = Vocabulary(max_vocab_size=10000)
src_vocab.build_vocab(train_en_tok)

tgt_vocab = Vocabulary(max_vocab_size=10000)
tgt_vocab.build_vocab(train_fr_tok)

print(f"English vocab size: {len(src_vocab)}")
print(f"French vocab size: {len(tgt_vocab)}")


Building vocabularies...
English vocab size: 10000
French vocab size: 10000


In [20]:
from torch.utils.data import DataLoader
from tqdm import tqdm

print("üîπ Creating datasets...")

train_dataset = TranslationDataset(train_en_tok, train_fr_tok, src_vocab, tgt_vocab)
val_dataset = TranslationDataset(val_en_tok, val_fr_tok, src_vocab, tgt_vocab)
test_data = list(zip(test_en_tok, test_fr_tok))  # cho BLEU

# hi·ªÉn th·ªã qu√° tr√¨nh kh·ªüi t·∫°o dataloader
print("üîπ Building dataloaders...")
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"‚úÖ Train examples: {len(train_dataset):,}")
print(f"‚úÖ Val examples: {len(val_dataset):,}")
print(f"‚úÖ Test examples: {len(test_data):,}")

print("üîπ Example tokenized (src ‚Üí tgt):")
print(train_en_tok[0][:15], "‚Üí", train_fr_tok[0][:15])


üîπ Creating datasets...
üîπ Building dataloaders...
‚úÖ Train examples: 50,000
‚úÖ Val examples: 100,240
‚úÖ Test examples: 100,240
üîπ Example tokenized (src ‚Üí tgt):
['i', 'read', 'them', 'both', 'with', 'great', 'interest', '.'] ‚Üí ['je', 'suis', 'fondamentalement', 'r√©ceptif', '√†', 'toutes', 'les', 'tendances', 'lib√©ralisatrices', 'qui', 'rendent', 'possibles', 'de', 'nouvelles', 'formes']


In [21]:
encoder = Encoder(len(src_vocab), EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
decoder = Decoder(len(tgt_vocab), EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_idx)

print("‚úÖ Model initialized and ready for training.")


‚úÖ Model initialized and ready for training.


In [22]:
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0

    # leave=True ƒë·ªÉ thanh ti·∫øn tr√¨nh v·∫´n hi·ªán sau khi xong
    loop = tqdm(dataloader, desc="üß† Training", leave=True)
    for src, src_lengths, tgt in loop:  # Unpack src_lengths
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        # Forward (c√≥ teacher forcing)
        output = model(src, src_lengths, tgt, teacher_forcing_ratio=0.5)

        # output: (batch, tgt_len, vocab_size)
        # tgt: (batch, tgt_len)
        output = output[:, 1:, :].reshape(-1, output.shape[-1])
        tgt = tgt[:, 1:].reshape(-1)

        # Loss
        loss = criterion(output, tgt)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        loop.set_postfix(loss=f"{loss.item():.3f}")

    return epoch_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        loop = tqdm(dataloader, desc="üîç Evaluating", leave=True)
        for src, src_lengths, tgt in loop:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, src_lengths, tgt, teacher_forcing_ratio=0)
            output = output[:, 1:, :].reshape(-1, output.shape[-1])
            tgt = tgt[:, 1:].reshape(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
            loop.set_postfix(loss=f"{loss.item():.3f}")
    return epoch_loss / len(dataloader)


# === MAIN TRAIN LOOP ===
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(NUM_EPOCHS):
    print(f"\nüåç Epoch {epoch+1}/{NUM_EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, criterion, CLIP, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    # ‚úÖ K·∫øt qu·∫£ g·ªçn ƒë·∫πp gi·ªëng code c≈©
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

    # L∆∞u best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print("üíæ ‚Üí Best model saved!")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("‚õî Early stopping!")
            break



üåç Epoch 1/10


üß† Training:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 417/782 [1:24:25<1:13:53, 12.15s/it, loss=5.681]


KeyboardInterrupt: 

In [None]:
model.load_state_dict(torch.load('best_model.pth'))
bleu_score = calculate_bleu(model, test_data, src_vocab, tgt_vocab, en_tokenizer, device)
print(f"‚úÖ BLEU Score on test set: {bleu_score:.4f}")


# ============================================================================
# 5. MAIN - S·ª¨ D·ª§NG
# ============================================================================

In [None]:
# # Removed the main() function definition and moved the code outside
# # This makes the variables (model, vocabularies, tokenizer, device, etc.) available globally.

# # C·∫•u h√¨nh
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Hyperparameters
# EMBEDDING_DIM = 256
# HIDDEN_SIZE = 512
# NUM_LAYERS = 2
# DROPOUT = 0.5
# LEARNING_RATE = 0.001
# BATCH_SIZE = 64
# NUM_EPOCHS = 30
# CLIP = 1

# # -- ƒê∆∞·ªùng d·∫´n t·ªõi th∆∞ m·ª•c d·ªØ li·ªáu (t∆∞∆°ng ƒë·ªëi v·ªõi notebook) --
# data_dir = '/content/drive/MyDrive/Dataset'

# def load_lines(path):
#     """ƒê·ªçc file v√† tr·∫£ v·ªÅ danh s√°ch d√≤ng (kh√¥ng tokenized)"""
#     lines = []
#     with open(path, 'r', encoding='utf-8', errors='ignore') as f:
#         for line in f:
#             line = line.strip()
#             if line:
#                 lines.append(line)
#     return lines

# def tokenize_lines(lines, tokenizer):
#     """Tokenize danh s√°ch c√¢u tr·∫£ v·ªÅ list of token lists"""
#     tokenized = []
#     for line in lines:
#         toks = [t.text for t in tokenizer(line.lower()) if t.text.strip()]
#         tokenized.append(toks)
#     return tokenized

# # T·∫≠p tin theo c·∫•u tr√∫c workspace: dataset/<split>/(file)
# train_en_path = os.path.join(data_dir, 'train.en')
# train_fr_path = os.path.join(data_dir, 'train.fr')
# val_en_path = os.path.join(data_dir, 'val.en')
# val_fr_path = os.path.join(data_dir, 'val.fr')
# test_en_path = os.path.join(data_dir, 'test_2018_flickr.en')
# test_fr_path = os.path.join(data_dir, 'test_2018_flickr.fr')


# # Load raw lines
# train_en_lines = load_lines(train_en_path)
# train_fr_lines = load_lines(train_fr_path)
# val_en_lines = load_lines(val_en_path)
# val_fr_lines = load_lines(val_fr_path)
# test_en_lines = load_lines(test_en_path)
# test_fr_lines = load_lines(test_fr_path)

# # Tokenizers: th·ª≠ load spacy models, fallback sang spacy.blank n·∫øu model ch∆∞a c√†i
# try:
#     en_tokenizer = spacy.load('en_core_web_sm')
# except Exception:
#     # N·∫øu model en_core_web_sm ch∆∞a c√†i, d√πng blank tokenizer (ƒë∆°n gi·∫£n)
#     print("Warning: en_core_web_sm not found. Using spacy.blank('en'). Please install with !python -m spacy download en_core_web_sm")
#     en_tokenizer = spacy.blank('en')

# try:
#     fr_tokenizer = spacy.load('fr_core_news_sm')
# except Exception:
#     print("Warning: fr_core_news_sm not found. Using spacy.blank('fr'). Please install with !python -m spacy download fr_core_news_sm")
#     fr_tokenizer = spacy.blank('fr')


# # Tokenize t·∫•t c·∫£
# print('Tokenizing...')
# train_en_tok = tokenize_lines(train_en_lines, en_tokenizer)
# train_fr_tok = tokenize_lines(train_fr_lines, fr_tokenizer)
# val_en_tok = tokenize_lines(val_en_lines, en_tokenizer)
# val_fr_tok = tokenize_lines(val_fr_lines, fr_tokenizer)
# test_en_tok = tokenize_lines(test_en_lines, en_tokenizer)
# test_fr_tok = tokenize_lines(test_fr_lines, fr_tokenizer)

# # Build vocabularies (t·ª´ train set)
# print('Building vocabularies...')
# src_vocab = Vocabulary(max_vocab_size=10000)
# src_vocab.build_vocab(train_en_tok)
# tgt_vocab = Vocabulary(max_vocab_size=10000)
# tgt_vocab.build_vocab(train_fr_tok)

# # T·∫°o Dataset v√† DataLoader
# train_dataset = TranslationDataset(train_en_tok, train_fr_tok, src_vocab, tgt_vocab)
# val_dataset = TranslationDataset(val_en_tok, val_fr_tok, src_vocab, tgt_vocab)

# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
# val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# # test_data d√πng cho calculate_bleu (d·∫°ng list of (src_tokens, tgt_tokens))
# test_data = list(zip(test_en_tok, test_fr_tok))

# # Sanity prints
# print(f'Train examples: {len(train_dataset)}')
# print(f'Val examples: {len(val_dataset)}')
# print(f'Test examples: {len(test_data)}')
# print('Example tokenized (src -> tgt):')
# print(train_en_tok[0][:20])
# print(train_fr_tok[0][:20])

# # Kh·ªüi t·∫°o m√¥ h√¨nh (d√πng k√≠ch th∆∞·ªõc t·ª´ vocab ƒë√£ t·∫°o)
# encoder = Encoder(len(src_vocab), EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
# decoder = Decoder(len(tgt_vocab), EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
# model = Seq2Seq(encoder, decoder, device).to(device)

# # Optimizer v√† loss
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# # Use ignore_index for PAD_TOKEN in CrossEntropyLoss
# criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_idx)


# # (Ph·∫ßn training v·∫´n nh∆∞ c≈©)
# best_val_loss = float('inf')
# patience = 3
# patience_counter = 0

# for epoch in range(NUM_EPOCHS):
#     train_loss = train_epoch(model, train_loader, optimizer, criterion, CLIP, device)
#     val_loss = evaluate(model, val_loader, criterion, device)

#     print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
#     print(f'Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         patience_counter = 0
#         torch.save(model.state_dict(), 'best_model.pth')
#     else:
#         patience_counter += 1
#         if patience_counter >= patience:
#             print("Early stopping!")
#             break

# bleu_score = calculate_bleu(model, test_data, src_vocab, tgt_vocab, en_tokenizer, device)
# print(f'BLEU Score: {bleu_score:.4f}')



Tokenizing...
Building vocabularies...
Train examples: 29000
Val examples: 1014
Test examples: 1071
Example tokenized (src -> tgt):
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['deux', 'jeunes', 'hommes', 'blancs', 'sont', 'dehors', 'pr√®s', 'de', 'buissons', '.']
Epoch 1/30
Train Loss: 4.895 | Val Loss: 4.676
Epoch 2/30
Train Loss: 4.033 | Val Loss: 4.382
Epoch 3/30
Train Loss: 3.604 | Val Loss: 4.088
Epoch 4/30
Train Loss: 3.323 | Val Loss: 3.995
Epoch 5/30
Train Loss: 3.071 | Val Loss: 3.863
Epoch 6/30
Train Loss: 2.888 | Val Loss: 3.773
Epoch 7/30
Train Loss: 2.741 | Val Loss: 3.715
Epoch 8/30
Train Loss: 2.598 | Val Loss: 3.671
Epoch 9/30
Train Loss: 2.459 | Val Loss: 3.652
Epoch 10/30
Train Loss: 2.353 | Val Loss: 3.601
Epoch 11/30
Train Loss: 2.279 | Val Loss: 3.559
Epoch 12/30
Train Loss: 2.193 | Val Loss: 3.476
Epoch 13/30
Train Loss: 2.104 | Val Loss: 3.552
Epoch 14/30
Train Loss: 2.012 | Val Loss: 3.587
Epoch 15/30
Train Loss: 1.9

In [None]:
sentence = " A man is riding a horse."
translation = translate_sentence(model, sentence, src_vocab, tgt_vocab, en_tokenizer, device)
print(f'English: {sentence}')
print(f'French: {translation}')

English:  A man is riding a horse.
French: un homme est sur cheval cheval .
