In [1]:
# =========================
# --- CELL 1: SETUP ---
# =========================
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm
import warnings

# C√†i ƒë·∫∑t th∆∞ vi·ªán tokenizers n·∫øu ch∆∞a c√≥
try:
    from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
except ImportError:
    !pip -q install tokenizers
    from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

warnings.filterwarnings("ignore")

# --- Fix seed ---
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random` module, PyTorch's default generator and the
    generators of all CUDA devices with the same value.

    Args:
        seed: Integer seed applied to all three generators.
    """
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("‚úÖ Device:", device)


‚úÖ Device: cuda


In [None]:


# --- CELL 2: DATA READING FUNCTION ---
def read_parallel_files(src_filename, tgt_filename):
    """Read a line-aligned parallel corpus and return (source, target) pairs.

    Each file name is looked up under "./", "/kaggle/input/" and
    "/kaggle/working/"; an absolute file name resolves to itself because
    `os.path.join` discards the base directory in that case.

    Args:
        src_filename: Name or path of the source-language file (UTF-8, one sentence per line).
        tgt_filename: Name or path of the target-language file (UTF-8, one sentence per line).

    Returns:
        A list of `(source_sentence, target_sentence)` tuples with surrounding
        whitespace stripped. Pairs in which either side is empty are dropped.
        An empty list is returned when either file cannot be found.
    """
    possible_paths = ["./", "/kaggle/input/", "/kaggle/working/"]

    def _locate(filename):
        # When the file exists under several base directories the LAST match
        # wins (this mirrors the historical behaviour of this function).
        found = None
        for base in possible_paths:
            candidate = os.path.join(base, filename)
            if os.path.exists(candidate):
                found = candidate
        return found

    src_path, tgt_path = _locate(src_filename), _locate(tgt_filename)

    if not src_path or not tgt_path:
        print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y file {src_filename} ho·∫∑c {tgt_filename}. B·ªè qua.")
        return []

    print(f"üìñ ƒêang ƒë·ªçc: {src_path} v√† {tgt_path}")
    with open(src_path, 'r', encoding='utf-8') as f_src, \
         open(tgt_path, 'r', encoding='utf-8') as f_tgt:
        # Iterate the file objects instead of read().splitlines(): str.splitlines()
        # also breaks on U+2028, U+0085, form feed, etc., so one such character
        # inside a sentence would shift every following pair out of alignment.
        # File iteration only splits on real newlines.
        src_lines = [line.strip() for line in f_src]
        tgt_lines = [line.strip() for line in f_tgt]

    if len(src_lines) != len(tgt_lines):
        # The corpus is still usable, but a mismatch usually means the two
        # files are not aligned, so make it visible instead of truncating silently.
        print(
            f"WARNING: line-count mismatch ({len(src_lines)} source vs "
            f"{len(tgt_lines)} target lines); extra trailing lines are ignored."
        )

    # zip() stops at the shorter file; keep only pairs where both sides are non-empty.
    return [(src, tgt) for src, tgt in zip(src_lines, tgt_lines) if src and tgt]

In [21]:
# --- CELL 3: LOAD DATA ---
# Loads the train / test / validation splits as lists of (en, vi) sentence pairs.
print("\n--- ƒêANG T·∫¢I D·ªÆ LI·ªÜU ---")
# Make sure these file names match the files you uploaded.
train_pairs = read_parallel_files("/kaggle/input/maindata/train.en", "/kaggle/input/maindata/train.vi")
test_pairs = read_parallel_files("/kaggle/input/maindata/tst2013.en", "/kaggle/input/maindata/tst2013.vi")
val_pairs = read_parallel_files("/kaggle/input/maindata/tst2012.en", "/kaggle/input/maindata/tst2012.vi")
print(f"‚úÖ Train size: {len(train_pairs)}")
print(f"‚úÖ Test size: {len(test_pairs)}")

# Show one example pair as a quick sanity check (skipped when nothing was loaded).
if len(train_pairs) > 0:
    print(f"üîé V√≠ d·ª• m·∫´u: {train_pairs[0]}")


--- ƒêANG T·∫¢I D·ªÆ LI·ªÜU ---
üìñ ƒêang ƒë·ªçc: /kaggle/input/maindata/train.en v√† /kaggle/input/maindata/train.vi
üìñ ƒêang ƒë·ªçc: /kaggle/input/maindata/tst2013.en v√† /kaggle/input/maindata/tst2013.vi
üìñ ƒêang ƒë·ªçc: /kaggle/input/maindata/tst2012.en v√† /kaggle/input/maindata/tst2012.vi
‚úÖ Train size: 133166
‚úÖ Test size: 1268
üîé V√≠ d·ª• m·∫´u: ('Rachel Pike : The science behind a climate headline', 'Khoa h·ªçc ƒë·∫±ng sau m·ªôt ti√™u ƒë·ªÅ v·ªÅ kh√≠ h·∫≠u')


In [22]:
# --- CELL 4: TRAIN TOKENIZERS ---
print("\n--- HU·∫§N LUY·ªÜN TOKENIZER ---")

def train_bpe_tokenizer(texts, vocab_size=8000):
    """Train a byte-level BPE tokenizer on an iterable of sentences.

    Args:
        texts: Iterable of strings used as the training corpus.
        vocab_size: Target vocabulary size handed to the BPE trainer.

    Returns:
        The trained `Tokenizer`, configured with a ByteLevel pre-tokenizer
        (`add_prefix_space=True`) and a ByteLevel decoder, and trained with the
        special tokens "[PAD]", "[START]", "[END]" and "[UNK]".
    """
    bpe_trainer = trainers.BpeTrainer(
        special_tokens=["[PAD]", "[START]", "[END]", "[UNK]"],
        vocab_size=vocab_size,
        show_progress=False,
    )

    tok = Tokenizer(models.BPE())
    tok.decoder = decoders.ByteLevel()
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tok.train_from_iterator(texts, trainer=bpe_trainer)
    return tok

# Collect the text used to train the tokenizers (train + validation sentences).
all_src_text = [p[0] for p in train_pairs + val_pairs]
all_tgt_text = [p[1] for p in train_pairs + val_pairs]

if not all_src_text: # Dummy data in case the files could not be loaded
    all_src_text = ["Hello world"]
    all_tgt_text = ["Xin ch√†o"]

# One tokenizer per language, each with its own 10k-entry vocabulary.
en_tokenizer = train_bpe_tokenizer(all_src_text, vocab_size=10000)
vi_tokenizer = train_bpe_tokenizer(all_tgt_text, vocab_size=10000)

# Look up the ids of the special tokens.
# NOTE(review): PAD_ID comes from the English tokenizer but collate_fn also uses
# it to pad Vietnamese ids; this assumes both tokenizers assign "[PAD]" the
# same id -- confirm.
PAD_ID = en_tokenizer.token_to_id("[PAD]")
START_ID = vi_tokenizer.token_to_id("[START]")
END_ID = vi_tokenizer.token_to_id("[END]")

print("‚úÖ Tokenizer ƒë√£ s·∫µn s√†ng.")


--- HU·∫§N LUY·ªÜN TOKENIZER ---
‚úÖ Tokenizer ƒë√£ s·∫µn s√†ng.


In [23]:
# --- CELL 5: DATASET CLASS ---
class EnViDataset(Dataset):
    """Map-style dataset that serves sentence pairs from an in-memory sequence.

    Items are returned exactly as stored; tokenization and padding are left to
    the DataLoader's collate function.
    """

    def __init__(self, pairs):
        # Stored by reference (no copy), so `pairs` must support len() and indexing.
        self.pairs = pairs

    def __getitem__(self, idx):
        return self.pairs[idx]

    def __len__(self):
        return len(self.pairs)

def collate_fn(batch):
    """Tokenize and pad a batch of (English, Vietnamese) sentence pairs.

    Args:
        batch: Sequence of `(en_sentence, vi_sentence)` string tuples.

    Returns:
        A tuple `(padded_en, padded_vi)` of int64 tensors:
        * `padded_en` -- source token ids, right-padded with PAD_ID to the
          longest source sequence in the batch.
        * `padded_vi` -- target token ids wrapped as START_ID ... END_ID, then
          right-padded with PAD_ID to the longest target sequence in the batch.
    """
    en_batch, vi_batch = zip(*batch)

    # Encode both sides with encode_batch (the target side used to be encoded
    # one sentence at a time, which was inconsistent with the source side).
    en_ids = [enc.ids for enc in en_tokenizer.encode_batch(list(en_batch))]
    # START / END are added by hand because the tokenizer has no post-processor.
    vi_ids = [
        [START_ID] + enc.ids + [END_ID]
        for enc in vi_tokenizer.encode_batch(list(vi_batch))
    ]

    def _pad(sequences):
        # Right-pad every sequence to the longest one in this batch.
        width = max(len(seq) for seq in sequences)
        return torch.tensor(
            [seq + [PAD_ID] * (width - len(seq)) for seq in sequences],
            dtype=torch.long,
        )

    return _pad(en_ids), _pad(vi_ids)

In [30]:
# --- CELL 6: DATALOADERS ---
# Number of sentence pairs per batch, shared by the training and validation loaders.
BATCH_SIZE = 32

# The training loader shuffles; the validation loader keeps the stored order.
# collate_fn tokenizes and pads each batch when it is drawn.
train_loader = DataLoader(EnViDataset(train_pairs), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(EnViDataset(val_pairs), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"DataLoaders created. Batch size: {BATCH_SIZE}")

DataLoaders created. Batch size: 32


In [25]:
# --- CELL 7: MODEL COMPONENTS ---
# --- Rotary Positional Embeddings ---
def rotate_half(x):
    """Split the last dimension into two chunks (a, b) and return concat(-b, a)."""
    first, second = torch.chunk(x, 2, dim=-1)
    return torch.cat([second.neg(), first], dim=-1)

def apply_rotary_pos_emb(x, cos, sin):
    """Apply a rotary position embedding: `x * cos + rotate_half(x) * sin`.

    `cos` and `sin` must broadcast against `x`.
    """
    rotated = rotate_half(x)
    return x * cos + rotated * sin

class RotaryPositionalEncoding(nn.Module):
    """Precomputed cos/sin tables for rotary position embeddings.

    The tables are built once for positions `0 .. max_seq_len - 1` and stored
    as the buffers `cos` and `sin`, each of shape
    `(1, max_seq_len, 1, head_dim)`.

    Args:
        head_dim: Size of one attention head (the last dimension of q / k).
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, head_dim, max_seq_len=2048):
        super().__init__()
        # One inverse frequency per pair of channels: 1 / 10000^(2i / head_dim).
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
        t = torch.arange(max_seq_len).float()
        freqs = torch.outer(t, inv_freq)
        # Duplicate the angles so the table matches the (first half, second half)
        # layout used by rotate_half.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos", emb.cos()[None, :, None, :])
        self.register_buffer("sin", emb.sin()[None, :, None, :])

    def forward(self, x, seq_len):
        """Return the `(cos, sin)` tables for the first `seq_len` positions.

        Args:
            x: Unused; kept for interface compatibility.
            seq_len: Number of positions required.

        Returns:
            Tuple `(cos, sin)`, each of shape `(1, seq_len, 1, head_dim)`.

        Raises:
            ValueError: If `seq_len` exceeds the precomputed table. Slicing
                would otherwise return a silently shortened table that only
                fails later as an opaque broadcasting error.
        """
        max_len = self.cos.shape[1]
        if seq_len > max_len:
            raise ValueError(
                f"seq_len={seq_len} exceeds the precomputed rotary table (max_seq_len={max_len})"
            )
        return self.cos[:, :seq_len, :, :], self.sin[:, :seq_len, :, :]

# --- Feed Forward (SwiGLU) ---
class SwiGLU(nn.Module):
    """Gated feed-forward layer: `w3(silu(w1(x)) * w2(x))`.

    Args:
        hidden_dim: Input and output feature size.
        intermediate_dim: Width of the two parallel inner projections.
    """

    def __init__(self, hidden_dim, intermediate_dim):
        super().__init__()
        self.w1 = nn.Linear(hidden_dim, intermediate_dim)  # gate branch (goes through SiLU)
        self.w2 = nn.Linear(hidden_dim, intermediate_dim)  # linear branch multiplied by the gate
        self.w3 = nn.Linear(intermediate_dim, hidden_dim)  # projection back to hidden_dim

    def forward(self, x):
        gate = F.silu(self.w1(x))
        value = self.w2(x)
        return self.w3(gate * value)

# --- GQA Attention ---
class GQA(nn.Module):
    """Grouped-query attention, usable as self-attention or cross-attention.

    Queries use `num_heads` heads while keys and values use `num_kv_heads`
    heads; each key/value head is shared by `num_heads // num_kv_heads`
    consecutive query heads.

    Args:
        hidden_dim: Model width; must be divisible by `num_heads`.
        num_heads: Number of query heads; must be divisible by `num_kv_heads`.
        num_kv_heads: Number of key/value heads.
        dropout: Attention dropout probability, applied in training mode only.

    Raises:
        ValueError: If the head configuration is not divisible as described.
            Such configurations could be constructed before, but then failed
            inside `view` / `reshape` on the first forward pass.
    """

    def __init__(self, hidden_dim, num_heads, num_kv_heads, dropout=0.1):
        super().__init__()
        if hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
            )
        if num_heads % num_kv_heads != 0:
            raise ValueError(
                f"num_heads ({num_heads}) must be divisible by num_kv_heads ({num_kv_heads})"
            )

        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = hidden_dim // num_heads
        self.num_groups = num_heads // num_kv_heads

        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, num_kv_heads * self.head_dim)
        self.v_proj = nn.Linear(hidden_dim, num_kv_heads * self.head_dim)
        self.o_proj = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = dropout

    def _share_kv_heads(self, t, batch, kv_seq_len):
        """Repeat each key/value head `num_groups` times to match the query heads."""
        if self.num_groups <= 1:
            return t
        # (batch, kv_heads, seq, dim) -> (batch, kv_heads, groups, seq, dim) -> (batch, heads, seq, dim)
        return (
            t[:, :, None, :, :]
            .expand(batch, self.num_kv_heads, self.num_groups, kv_seq_len, self.head_dim)
            .reshape(batch, self.num_heads, kv_seq_len, self.head_dim)
        )

    def forward(self, x, enc_out=None, mask=None, rope_cos=None, rope_sin=None):
        """Attend from `x` to itself, or to `enc_out` when it is given.

        Args:
            x: Query input of shape `(batch, seq_len, hidden_dim)`.
            enc_out: Optional key/value input of shape
                `(batch, kv_seq_len, hidden_dim)`; switches to cross-attention.
            mask: Optional attention mask forwarded to
                `F.scaled_dot_product_attention` as `attn_mask`.
            rope_cos, rope_sin: Optional rotary tables. They are applied to
                queries and keys in self-attention only and ignored when
                `enc_out` is given.

        Returns:
            Tensor of shape `(batch, seq_len, hidden_dim)`.
        """
        batch, seq_len, _ = x.shape
        kv_input = enc_out if enc_out is not None else x
        kv_seq_len = kv_input.shape[1]

        q = self.q_proj(x).view(batch, seq_len, self.num_heads, self.head_dim)
        k = self.k_proj(kv_input).view(batch, kv_seq_len, self.num_kv_heads, self.head_dim)
        v = self.v_proj(kv_input).view(batch, kv_seq_len, self.num_kv_heads, self.head_dim)

        # Rotary embeddings need q and k to share one position axis, which only
        # holds for self-attention.
        if rope_cos is not None and enc_out is None:
            q = apply_rotary_pos_emb(q, rope_cos, rope_sin)
            k = apply_rotary_pos_emb(k, rope_cos, rope_sin)

        # (batch, seq, heads, dim) -> (batch, heads, seq, dim)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        k = self._share_kv_heads(k, batch, kv_seq_len)
        v = self._share_kv_heads(v, batch, kv_seq_len)

        out = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=mask,
            dropout_p=self.dropout if self.training else 0.0,
        )
        return self.o_proj(out.transpose(1, 2).reshape(batch, seq_len, -1))

In [26]:
# --- CELL 8: TRANSFORMER MODEL ---
class TransformerBlock(nn.Module):
    """Pre-norm Transformer layer with residual connections.

    Sub-layers, in order: RMSNorm + self-attention, then (decoder only)
    RMSNorm + cross-attention over the encoder output, then RMSNorm + SwiGLU
    feed-forward with an inner width of `4 * hidden_dim`.

    Args:
        hidden_dim: Model width.
        num_heads: Number of query heads of the attention layers.
        num_kv_heads: Number of key/value heads of the attention layers.
        dropout: Dropout probability passed to the attention layers.
        is_decoder: When True, the cross-attention sub-layer is created and used.
    """

    def __init__(self, hidden_dim, num_heads, num_kv_heads, dropout=0.1, is_decoder=False):
        super().__init__()
        self.is_decoder = is_decoder

        self.norm1 = nn.RMSNorm(hidden_dim)
        self.attn = GQA(hidden_dim, num_heads, num_kv_heads, dropout)
        if is_decoder:
            self.norm2 = nn.RMSNorm(hidden_dim)
            self.cross_attn = GQA(hidden_dim, num_heads, num_kv_heads, dropout)
        self.norm_ffn = nn.RMSNorm(hidden_dim)
        self.ffn = SwiGLU(hidden_dim, hidden_dim * 4)

    def forward(self, x, enc_out=None, mask=None, cross_mask=None, rope_cos=None, rope_sin=None):
        """Run the layer on `x` and return a tensor of the same shape.

        `mask` and the rotary tables are used by self-attention; `enc_out` and
        `cross_mask` are only read when the block was built with `is_decoder=True`.
        """
        self_attn_out = self.attn(self.norm1(x), mask=mask, rope_cos=rope_cos, rope_sin=rope_sin)
        x = x + self_attn_out

        if self.is_decoder:
            cross_attn_out = self.cross_attn(self.norm2(x), enc_out=enc_out, mask=cross_mask)
            x = x + cross_attn_out

        return x + self.ffn(self.norm_ffn(x))

class Transformer(nn.Module):
    """Encoder-decoder Transformer built from `TransformerBlock` layers.

    Source and target ids go through separate embeddings; one shared rotary
    table supplies the position information for both stacks. The decoder
    output is RMS-normalised and projected to target-vocabulary logits.

    Args:
        src_vocab: Size of the source vocabulary.
        tgt_vocab: Size of the target vocabulary.
        hidden_dim: Model width.
        num_layers: Number of encoder layers and of decoder layers.
        num_heads: Number of query heads per attention layer.
        num_kv_heads: Number of key/value heads per attention layer.
    """

    def __init__(self, src_vocab, tgt_vocab, hidden_dim=256, num_layers=4, num_heads=8, num_kv_heads=4):
        super().__init__()
        head_dim = hidden_dim // num_heads

        self.src_emb = nn.Embedding(src_vocab, hidden_dim)
        self.tgt_emb = nn.Embedding(tgt_vocab, hidden_dim)
        self.rope = RotaryPositionalEncoding(head_dim)

        encoder_layers = [
            TransformerBlock(hidden_dim, num_heads, num_kv_heads)
            for _ in range(num_layers)
        ]
        self.encoders = nn.ModuleList(encoder_layers)

        decoder_layers = [
            TransformerBlock(hidden_dim, num_heads, num_kv_heads, is_decoder=True)
            for _ in range(num_layers)
        ]
        self.decoders = nn.ModuleList(decoder_layers)

        self.final_norm = nn.RMSNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Return target-vocabulary logits for every position of `tgt`.

        Args:
            src: Source token ids.
            tgt: Target (decoder input) token ids.
            src_mask: Mask used by encoder self-attention and decoder cross-attention.
            tgt_mask: Mask used by decoder self-attention.
        """
        # ---- encoder ----
        memory = self.src_emb(src)
        enc_cos, enc_sin = self.rope(memory, memory.shape[1])
        for block in self.encoders:
            memory = block(memory, mask=src_mask, rope_cos=enc_cos, rope_sin=enc_sin)

        # ---- decoder ----
        hidden = self.tgt_emb(tgt)
        dec_cos, dec_sin = self.rope(hidden, hidden.shape[1])
        for block in self.decoders:
            hidden = block(
                hidden,
                enc_out=memory,
                mask=tgt_mask,
                cross_mask=src_mask,
                rope_cos=dec_cos,
                rope_sin=dec_sin,
            )

        return self.fc_out(self.final_norm(hidden))

In [27]:
# --- CELL 9: INIT TRAINING ---
def create_masks(src, tgt, pad_id=None):
    """Build additive attention masks for a batch.

    Args:
        src: Source token ids of shape `(batch, src_len)`.
        tgt: Target (decoder input) token ids of shape `(batch, tgt_len)`.
        pad_id: Id of the padding token; defaults to the notebook-level `PAD_ID`.

    Returns:
        Tuple `(src_mask, tgt_mask)` of float tensors:
        * `src_mask` -- shape `(batch, 1, 1, src_len)`, `-1e9` at padding
          positions and 0 elsewhere.
        * `tgt_mask` -- shape `(batch, 1, tgt_len, tgt_len)`, `-inf` above the
          diagonal (future positions) plus `-1e9` on padded key positions.
    """
    if pad_id is None:
        pad_id = PAD_ID

    src_mask = (src == pad_id).unsqueeze(1).unsqueeze(2).float() * -1e9

    _, seq_len = tgt.shape
    # Allocate the causal mask on the tensor's own device rather than on the
    # notebook-level `device`, so the function also works for tensors that
    # live elsewhere.
    causal = torch.triu(
        torch.full((seq_len, seq_len), float('-inf'), device=tgt.device),
        diagonal=1,
    )
    tgt_pad = (tgt == pad_id).unsqueeze(1).unsqueeze(2).float() * -1e9
    return src_mask, causal + tgt_pad

# Build the model with vocabulary sizes taken from the trained tokenizers and
# move it to the selected device.
model = Transformer(
    src_vocab=en_tokenizer.get_vocab_size(),
    tgt_vocab=vi_tokenizer.get_vocab_size(),
    hidden_dim=256, num_layers=4, num_heads=8, num_kv_heads=4
).to(device)

# Targets equal to PAD_ID are ignored by the loss; label smoothing is 0.1.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.0001)

print("Model Initialized.")

Model Initialized.


In [28]:
import math
import torch

class EarlyStopping:
    """Track a validation metric and signal when training should stop.

    Args:
        patience: Number of consecutive non-improving epochs tolerated before
            `step` returns True.
        min_delta: Minimum change over the best value that counts as an improvement.
        mode: "min" when a smaller metric is better (e.g. validation loss);
            any other value means larger is better.
    """

    def __init__(self, patience=3, min_delta=1e-4, mode="min"):
        self.patience, self.min_delta, self.mode = patience, min_delta, mode

        self.best = None           # best metric value seen so far
        self.num_bad_epochs = 0    # consecutive epochs without improvement

    def _is_improvement(self, current):
        """Return True when `current` beats the best value by more than `min_delta`."""
        if self.best is None:
            # The very first value always counts as an improvement.
            return True
        if self.mode == "min":
            return current < (self.best - self.min_delta)
        return current > (self.best + self.min_delta)

    def step(self, current):
        """Record one epoch's metric; return True if training should STOP."""
        if not self._is_improvement(current):
            self.num_bad_epochs += 1
            return self.num_bad_epochs >= self.patience

        self.best = current
        self.num_bad_epochs = 0
        return False


def save_checkpoint(path, model, optimizer, epoch, val_loss):
    """Write a training checkpoint to `path` with `torch.save`.

    The saved dictionary has the keys "epoch", "model_state_dict",
    "optimizer_state_dict" and "val_loss".
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        val_loss=val_loss,
    )
    torch.save(checkpoint, path)


In [35]:
# import gc, torch
# gc.collect()
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()

In [36]:
# EPOCHS = 50
# early = EarlyStopping(patience=3, min_delta=1e-4, mode="min")
# best_path = "best_transformer_en_vi.pt"

# print("\n--- B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN ---")

# for epoch in range(EPOCHS):
#     model.train()
#     train_loss = 0
#     pbar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")

#     for src, tgt in pbar:
#         src, tgt = src.to(device), tgt.to(device)
#         tgt_input, tgt_real = tgt[:, :-1], tgt[:, 1:]
#         src_mask, tgt_mask = create_masks(src, tgt_input)

#         optimizer.zero_grad()
#         output = model(src, tgt_input, src_mask, tgt_mask)
#         loss = criterion(output.reshape(-1, output.shape[-1]), tgt_real.reshape(-1))
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()

#         train_loss += loss.item()
#         pbar.set_postfix(loss=f"{loss.item():.4f}")

#     # Validation
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for src, tgt in val_loader:
#             src, tgt = src.to(device), tgt.to(device)
#             tgt_input, tgt_real = tgt[:, :-1], tgt[:, 1:]
#             src_mask, tgt_mask = create_masks(src, tgt_input)
#             output = model(src, tgt_input, src_mask, tgt_mask)
#             val_loss += criterion(output.reshape(-1, output.shape[-1]), tgt_real.reshape(-1)).item()

#     avg_train = train_loss / len(train_loader)
#     avg_val = val_loss / len(val_loader)
#     print(f"Epoch {epoch+1} | Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f}")

#     # save best + early stopping
#     if early.best is None or avg_val < early.best - early.min_delta:
#         save_checkpoint(best_path, model, optimizer, epoch+1, avg_val)
#         print(f"  ‚Ü≥ Saved BEST to {best_path} (val_loss={avg_val:.4f})")

#     if early.step(avg_val):
#         print(f"üõë Early stopping at epoch {epoch+1}. Best val_loss = {early.best:.4f}")
#         break

# # (tu·ª≥ ch·ªçn) v·∫´n l∆∞u last state
# torch.save(model.state_dict(), "last_transformer_en_vi.pth")



--- B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN ---


Epoch 1/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:58<00:00, 13.95it/s, loss=3.8881]


Epoch 1 | Train Loss: 4.1989 | Val Loss: 3.7663
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.7663)


Epoch 2/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [05:00<00:00, 13.87it/s, loss=3.4853]


Epoch 2 | Train Loss: 3.4529 | Val Loss: 3.5373
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.5373)


Epoch 3/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:59<00:00, 13.92it/s, loss=3.0734]


Epoch 3 | Train Loss: 3.1974 | Val Loss: 3.4424
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.4424)


Epoch 4/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:58<00:00, 13.96it/s, loss=2.9062]


Epoch 4 | Train Loss: 3.0443 | Val Loss: 3.3789
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.3789)


Epoch 5/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:58<00:00, 13.96it/s, loss=3.2277]


Epoch 5 | Train Loss: 2.9291 | Val Loss: 3.3524
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.3524)


Epoch 6/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:59<00:00, 13.92it/s, loss=2.9170]


Epoch 6 | Train Loss: 2.8357 | Val Loss: 3.3452
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.3452)


Epoch 7/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:57<00:00, 13.98it/s, loss=3.0811]


Epoch 7 | Train Loss: 2.7559 | Val Loss: 3.3381
  ‚Ü≥ Saved BEST to best_transformer_en_vi.pt (val_loss=3.3381)


Epoch 8/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:57<00:00, 13.97it/s, loss=2.8564]


Epoch 8 | Train Loss: 2.6860 | Val Loss: 3.3418


Epoch 9/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:59<00:00, 13.91it/s, loss=2.7402]


Epoch 9 | Train Loss: 2.6237 | Val Loss: 3.3579


Epoch 10/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4162/4162 [04:59<00:00, 13.91it/s, loss=2.5059]


Epoch 10 | Train Loss: 2.5656 | Val Loss: 3.3702
üõë Early stopping at epoch 10. Best val_loss = 3.3381


In [37]:
import torch
import torch.nn.functional as F

@torch.no_grad()
def beam_search_decode(
    model,
    src_ids: torch.Tensor,          # shape: (1, src_len)
    src_mask: torch.Tensor,         # shape: (1, 1, 1, src_len), or anything the model broadcasts
    beam_size: int = 5,
    max_len: int = 100,
    len_penalty: float = 0.6,       # > 0 counteracts the bias toward very short outputs
    temperature: float = 1.0,
    start_id=None,
    end_id=None,
):
    """Beam-search decode one source sentence.

    Works with any model called as `model(src, tgt, src_mask, tgt_mask)` that
    returns logits of shape `(batch, tgt_len, vocab)`.

    Args:
        model: The translation model; it is switched to eval mode.
        src_ids: Source token ids for a single sentence, shape `(1, src_len)`.
        src_mask: Source mask passed to the model unchanged (its batch
            dimension of 1 is left to broadcast over the beams).
        beam_size: Number of hypotheses kept after every step.
        max_len: Maximum number of decoding steps.
        len_penalty: Exponent of the length normalisation
            `((5 + length) / 6) ** len_penalty`, where `length` is the number
            of generated tokens (START excluded).
        temperature: Logits are divided by this value (clamped to >= 1e-8).
        start_id: Id of the start token; defaults to the notebook-level `START_ID`.
        end_id: Id of the end token; defaults to the notebook-level `END_ID`.

    Returns:
        The best hypothesis as a list of token ids `[start, ..., end]`; the end
        token may be missing if no hypothesis finished within `max_len` steps.
    """
    if start_id is None:
        start_id = START_ID
    if end_id is None:
        end_id = END_ID

    device = src_ids.device
    model.eval()

    def normed(hyp):
        # Length-normalised score; length counts generated tokens (START excluded).
        tok_ids, score = hyp
        length = max(1, len(tok_ids) - 1)
        return score / (((5 + length) / 6) ** len_penalty)

    def is_done(hyp):
        return hyp[0][-1] == end_id

    # Each hypothesis is (token_ids, summed log-probability).
    beams = [([start_id], 0.0)]
    finished = []

    for _ in range(max_len):
        # Retire hypotheses that already emitted END; only the rest are expanded.
        finished.extend(h for h in beams if is_done(h))
        beams = [h for h in beams if not is_done(h)]
        if not beams:
            break

        # All live hypotheses were extended in the same step, so they share one
        # length and can be scored with a single batched forward pass instead of
        # one model call per beam.
        tgt = torch.tensor([tok_ids for tok_ids, _ in beams], device=device)  # (n_live, tlen)
        tlen = tgt.size(1)
        tgt_mask = torch.triu(
            torch.full((tlen, tlen), float('-inf'), device=device),
            diagonal=1
        )

        out = model(src_ids.expand(len(beams), -1), tgt, src_mask, tgt_mask)  # (n_live, tlen, vocab)
        logits = out[:, -1, :] / max(temperature, 1e-8)                       # (n_live, vocab)
        log_probs = F.log_softmax(logits, dim=-1)

        # topk raises if k exceeds the vocabulary size, so clamp it.
        k = min(beam_size, log_probs.size(-1))
        topk_log_probs, topk_ids = torch.topk(log_probs, k=k, dim=-1)

        candidates = []
        for (tok_ids, score), lps, wids in zip(beams, topk_log_probs.tolist(), topk_ids.tolist()):
            for lp, wid in zip(lps, wids):
                candidates.append((tok_ids + [wid], score + lp))

        if not candidates:
            break

        # Keep the beam_size best candidates by length-normalised score.
        candidates.sort(key=normed, reverse=True)
        beams = candidates[:beam_size]

        if all(is_done(h) for h in beams):
            break

    # Hypotheses that emitted END on the very last step have not been retired
    # yet; include them so a complete hypothesis is never discarded.
    finished.extend(h for h in beams if is_done(h))

    if finished:
        return max(finished, key=normed)[0]
    return beams[0][0]


def translate_beam(sentence: str, beam_size=8, max_len=100, len_penalty=0.6):
    """Translate one English sentence to Vietnamese with beam search.

    Uses the notebook-level `model`, `en_tokenizer`, `vi_tokenizer` and `device`.

    Args:
        sentence: Source sentence.
        beam_size: Beam width passed to `beam_search_decode`.
        max_len: Maximum number of decoding steps.
        len_penalty: Length-normalisation exponent.

    Returns:
        The decoded translation without surrounding whitespace, or an empty
        string when `sentence` encodes to no tokens.
    """
    model.eval()
    with torch.no_grad():
        src_token_ids = en_tokenizer.encode(sentence).ids
        if not src_token_ids:
            # Nothing to translate; a zero-length source would leave the
            # decoder with no positions to attend to.
            return ""

        src = torch.tensor([src_token_ids]).to(device)
        src_mask = (src == PAD_ID).unsqueeze(1).unsqueeze(2).float() * -1e9

        best_ids = beam_search_decode(
            model=model,
            src_ids=src,
            src_mask=src_mask,
            beam_size=beam_size,
            max_len=max_len,
            len_penalty=len_penalty,
        )

        # Drop START and, if present, cut at END.
        if END_ID in best_ids:
            best_ids = best_ids[1:best_ids.index(END_ID)]
        else:
            best_ids = best_ids[1:]

        # The tokenizers are trained with add_prefix_space=True, so the decoded
        # text starts with a space; strip it.
        return vi_tokenizer.decode(best_ids).strip()


In [38]:

# Translate five randomly chosen test pairs and print source, reference and
# model output side by side (nothing is printed when test_pairs is empty).
for i in range(5):
    if len(test_pairs) > 0:
        idx = random.randint(0, len(test_pairs)-1)
        en_txt, vi_txt = test_pairs[idx]
        print(f"üîπ Input:  {en_txt}\nüî∏ Target: {vi_txt}\nüöÄ Model:  {translate_beam(en_txt)}\n{'-'*50}")

üîπ Input:  And he said that he needed those guns because of the trauma he &apos;d experienced as a young boy .
üî∏ Target: V√† anh ta n√≥i r·∫±ng anh ta c·∫ßn nh·ªØng c√¢y s√∫ng n√†y b·ªüi v√¨ nh·ªØng t·ªïn th∆∞∆°ng m√† anh ƒë√£ tr·∫£i qua trong qu√° kh·ª© khi l√† m·ªôt ƒë·ª©a tr·∫ª .
üöÄ Model:   V√† √¥ng n√≥i r·∫±ng √¥ng c·∫ßn nh·ªØng s√∫ng ƒë√≥ v√¨ nh·ªØng ch·∫•n th∆∞∆°ng m√† √¥ng ƒë√£ tr·∫£i qua khi c√≤n l√† m·ªôt c·∫≠u b√© .
--------------------------------------------------
üîπ Input:  Am I South Korean or North Korean ?
üî∏ Target: T√¥i l√† ng∆∞·ªùi Nam Tri·ªÅu Ti√™n hay B·∫Øc Tri·ªÅu Ti√™n ?
üöÄ Model:   T√¥i l√† ng∆∞·ªùi Nam H√†n hay B·∫Øc H√†n hay B·∫Øc H√†n ?
--------------------------------------------------
üîπ Input:  And so just as the womb entirely envelopes the embryo , which grows within it , the divine matrix of compassion nourishes the entire existence .
üî∏ Target: V√† b·ªüi v√¨ t·ª≠ cung bao b·ªçc ho√†n to√†n ph√¥i thai ƒëang ph√°t tri·ªÉn trong l√≤ng n√≥

In [None]:
# Manually test a single custom sentence.
custom_sentence = "I really like this model."
print(f"\nüöÄ Custom Test Input: {custom_sentence}")
print(f"‚úÖ Model Translation: {translate_beam(custom_sentence)}")

In [39]:
!pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [41]:
import sacrebleu
import random
# Progress bar. Imported under an alias on purpose: `from tqdm import tqdm`
# would rebind the name `tqdm`, which the setup cell imports as a MODULE and
# the training loop uses as `tqdm.tqdm(...)`.
from tqdm import tqdm as progress_bar

def calculate_bleu(data_pairs, num_samples=100, seed=None):
    """Compute the corpus-level BLEU score of the model on sentence pairs.

    Args:
        data_pairs: List of `(english, vietnamese_reference)` tuples.
        num_samples: Number of pairs to draw at random for evaluation. All
            pairs are used when this is None or not smaller than
            `len(data_pairs)`.
        seed: Optional seed for drawing the evaluation subset. When None, the
            global `random` state is used (the previous behaviour).

    Returns:
        The BLEU score as a float; 0.0 when there is nothing to evaluate.
    """
    # Pick a random subset to evaluate (or everything).
    if num_samples is not None and num_samples < len(data_pairs):
        sampler = random if seed is None else random.Random(seed)
        samples = sampler.sample(data_pairs, num_samples)
    else:
        samples = data_pairs

    # Report the number of pairs actually evaluated, not the number requested.
    print(f"--- üìä ƒêANG T√çNH ƒêI·ªÇM BLEU TR√äN {len(samples)} M·∫™U ---")

    if not samples:
        return 0.0

    preds = []  # model translations
    refs = []   # reference translations

    for en_txt, vi_txt in progress_bar(samples):
        preds.append(translate_beam(en_txt))
        refs.append(vi_txt)

    # sacrebleu expects a list of reference streams, because every input may
    # have several references; here there is exactly one.
    bleu = sacrebleu.corpus_bleu(preds, [refs])

    return bleu.score

# --- RUN THE EVALUATION ---
# Raise num_samples up to len(test_pairs) for a more reliable score (it will take longer).
score = calculate_bleu(test_pairs, num_samples=100)

print(f"\nüåü ƒêI·ªÇM BLEU C·ª¶A MODEL: {score:.2f}")

--- üìä ƒêANG T√çNH ƒêI·ªÇM BLEU TR√äN 100 M·∫™U ---


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [09:54<00:00,  5.94s/it]


üåü ƒêI·ªÇM BLEU C·ª¶A MODEL: 28.21





Bài toán phụ


In [None]:
# 1) Path of the checkpoint to resume from (placeholder -- replace with your dataset name).
CKPT_PATH = "/kaggle/input/YOUR_CKPT_DATASET/best_transformer_en_vi.pt"

# 2) Point this at whichever Kaggle Input dataset contains the new train.en / train.vi.
DATA_DIR = "/kaggle/input/YOUR_NEW_TRAIN_DATASET"

SRC_PATH = os.path.join(DATA_DIR, "train.en")
TGT_PATH = os.path.join(DATA_DIR, "train.vi")

print("CKPT_PATH:", CKPT_PATH)
print("SRC_PATH:", SRC_PATH)
print("TGT_PATH:", TGT_PATH)

In [None]:
# =========================
# --- CELL 3: LOAD DATA ---
# =========================
print("\n--- LOAD DATA M·ªöI (train.en / train.vi) ---")

# Use read_parallel_files, the reader defined earlier in this notebook. The
# previous call to read_parallel_data(..., max_lines=None) referred to a name
# that is not defined above this cell and would raise NameError on a fresh
# top-to-bottom run; reading every line matches max_lines=None.
all_pairs = read_parallel_files(SRC_PATH, TGT_PATH)
print("T·ªïng s·ªë c·∫∑p c√¢u:", len(all_pairs))

# Shuffle in place, then split 90% train / 10% validation.
random.shuffle(all_pairs)
split_idx = int(0.9 * len(all_pairs))
train_pairs = all_pairs[:split_idx]
val_pairs   = all_pairs[split_idx:]

print("Train pairs:", len(train_pairs))
print("Val pairs  :", len(val_pairs))


In [None]:
# ==========================================
# --- CELL 10: LOAD BEST CHECKPOINT + RESUME ---
# ==========================================
print("\n--- LOAD best_transformer_en_vi.pt & TRAIN TI·∫æP ---")

ckpt = torch.load(CKPT_PATH, map_location=device)

# The checkpoint uses the format written by save_checkpoint:
# {"epoch":..., "model_state_dict":..., "optimizer_state_dict":..., "val_loss":...}
model.load_state_dict(ckpt["model_state_dict"])

# Restore the optimizer state to continue training from the saved state (recommended).
if "optimizer_state_dict" in ckpt:
    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
    # Make sure every optimizer state tensor lives on the right device.
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)

# Fall back to epoch 0 / no previous best when the keys are missing.
start_epoch = ckpt.get("epoch", 0)
best_prev_val = ckpt.get("val_loss", None)

print(f"‚úÖ Loaded checkpoint from: {CKPT_PATH}")
print(f"   start_epoch = {start_epoch}")
print(f"   prev_best_val_loss = {best_prev_val}")


In [None]:
# --- CELL 1: SETUP ---
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm
import warnings

# C√†i ƒë·∫∑t th∆∞ vi·ªán tokenizers n·∫øu ch∆∞a c√≥
try:
    import tokenizers
except ImportError:
    os.system('pip install tokenizers')
    import tokenizers

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# C·∫•u h√¨nh thi·∫øt b·ªã v√† Random Seed
warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üîπ ƒêang s·ª≠ d·ª•ng thi·∫øt b·ªã: {device}")

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)