In [1]:
# =========================
# --- CELL 1: SETUP ---
# =========================
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm
import warnings

# C√†i ƒë·∫∑t th∆∞ vi·ªán tokenizers n·∫øu ch∆∞a c√≥
try:
    from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
except ImportError:
    !pip -q install tokenizers
    from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

warnings.filterwarnings("ignore")

# --- Fix seed ---
def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("‚úÖ Device:", device)

# --- PATHS (ANH S·ª¨A 2 D√íNG N√ÄY) ---
# 1) checkpoint best_transformer_en_vi.pt n·∫±m trong Kaggle Input dataset n√†o th√¨ tr·ªè v√†o ƒë√≥
CKPT_PATH = "/kaggle/input/modeltrained/best_transformer_en_vi (3).pt"

# 2) data m·ªõi train.en/train.vi n·∫±m trong Kaggle Input dataset n√†o th√¨ tr·ªè v√†o ƒë√≥
DATA_DIR = "/kaggle/input/databaitoanphu"

SRC_PATH = os.path.join(DATA_DIR, "train.en.txt")
TGT_PATH = os.path.join(DATA_DIR, "train.vi.txt")

print("CKPT_PATH:", CKPT_PATH)
print("SRC_PATH:", SRC_PATH)
print("TGT_PATH:", TGT_PATH)


‚úÖ Device: cuda
CKPT_PATH: /kaggle/input/modeltrained/best_transformer_en_vi (3).pt
SRC_PATH: /kaggle/input/databaitoanphu/train.en.txt
TGT_PATH: /kaggle/input/databaitoanphu/train.vi.txt


In [2]:
# ==================================
# --- CELL 2: DATA READING FUNCTION ---
# ==================================
def read_parallel_data(src_path, tgt_path, max_lines=None):
    pairs = []
    with open(src_path, "r", encoding="utf-8") as fsrc, open(tgt_path, "r", encoding="utf-8") as ftgt:
        for i, (s, t) in enumerate(zip(fsrc, ftgt)):
            if max_lines and i >= max_lines:
                break
            s, t = s.strip(), t.strip()
            if s and t:
                pairs.append((s, t))
    return pairs


In [3]:
# =========================
# --- CELL 3: LOAD DATA ---
# =========================
print("\n--- LOAD DATA M·ªöI (train.en / train.vi) ---")

all_pairs = read_parallel_data(SRC_PATH, TGT_PATH, max_lines=None)
print("T·ªïng s·ªë c·∫∑p c√¢u:", len(all_pairs))

# Shuffle + split 90% train, 10% val
random.shuffle(all_pairs)
split_idx = int(0.9 * len(all_pairs))
train_pairs = all_pairs[:split_idx]
val_pairs   = all_pairs[split_idx:]

print("Train pairs:", len(train_pairs))
print("Val pairs  :", len(val_pairs))



--- LOAD DATA M·ªöI (train.en / train.vi) ---
T·ªïng s·ªë c·∫∑p c√¢u: 500000
Train pairs: 450000
Val pairs  : 50000


In [4]:
# ==============================
# --- CELL 4: TRAIN TOKENIZERS ---
# ==============================
print("\n--- HU·∫§N LUY·ªÜN TOKENIZER ---")

def train_bpe_tokenizer(texts, vocab_size=8000):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[START]", "[END]", "[UNK]"],
        show_progress=False
    )
    tokenizer.train_from_iterator(texts, trainer=trainer)
    return tokenizer

all_src_text = [p[0] for p in train_pairs + val_pairs]
all_tgt_text = [p[1] for p in train_pairs + val_pairs]

if not all_src_text:  # Dummy data n·∫øu ch∆∞a load ƒë∆∞·ª£c file
    all_src_text = ["Hello world"]
    all_tgt_text = ["Xin ch√†o"]

en_tokenizer = train_bpe_tokenizer(all_src_text, vocab_size=10000)
vi_tokenizer = train_bpe_tokenizer(all_tgt_text, vocab_size=10000)

# L·∫•y ID c√°c token ƒë·∫∑c bi·ªát
PAD_ID = en_tokenizer.token_to_id("[PAD]")
START_ID = vi_tokenizer.token_to_id("[START]")
END_ID = vi_tokenizer.token_to_id("[END]")

print("‚úÖ Tokenizer ƒë√£ s·∫µn s√†ng.")
print("PAD_ID:", PAD_ID, "START_ID:", START_ID, "END_ID:", END_ID)
print("SRC vocab:", en_tokenizer.get_vocab_size(), "TGT vocab:", vi_tokenizer.get_vocab_size())



--- HU·∫§N LUY·ªÜN TOKENIZER ---
‚úÖ Tokenizer ƒë√£ s·∫µn s√†ng.
PAD_ID: 0 START_ID: 1 END_ID: 2
SRC vocab: 10000 TGT vocab: 10000


In [5]:
# ===========================
# --- CELL 5: DATASET CLASS ---
# ===========================
class TranslationDataset(Dataset):
    def __init__(self, pairs, src_tokenizer, tgt_tokenizer, max_len=64):
        self.pairs = pairs
        self.src_tok = src_tokenizer
        self.tgt_tok = tgt_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]

        src_ids = self.src_tok.encode(src).ids[:self.max_len]
        tgt_ids = self.tgt_tok.encode(tgt).ids[:self.max_len]

        # Th√™m START/END cho tgt
        tgt_ids = [START_ID] + tgt_ids + [END_ID]

        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long)

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=PAD_ID)
    tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=PAD_ID)
    return src_batch, tgt_batch


In [6]:
# ============================
# --- CELL 6: DATALOADERS ---
# ============================
BATCH_SIZE = 64

train_ds = TranslationDataset(train_pairs, en_tokenizer, vi_tokenizer, max_len=64)
val_ds   = TranslationDataset(val_pairs, en_tokenizer, vi_tokenizer, max_len=64)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("‚úÖ DataLoader ready.")
print("Train batches:", len(train_loader), "Val batches:", len(val_loader))


‚úÖ DataLoader ready.
Train batches: 7031 Val batches: 782


In [7]:
# ===================================
# --- CELL 7: MODEL COMPONENTS ---
# ===================================
# --- Rotary Positional Embeddings ---
def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(x, cos, sin):
    return (x * cos) + (rotate_half(x) * sin)

class RotaryPositionalEncoding(nn.Module):
    def __init__(self, head_dim, max_seq_len=2048):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
        t = torch.arange(max_seq_len).float()
        freqs = torch.outer(t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos", emb.cos()[None, None, :, :])
        self.register_buffer("sin", emb.sin()[None, None, :, :])

    def forward(self, x, seq_len):
        return self.cos[:, :, :seq_len, :], self.sin[:, :, :seq_len, :]

# --- SwiGLU ---
class SwiGLU(nn.Module):
    def __init__(self, hidden_dim, ffn_dim):
        super().__init__()
        self.w1 = nn.Linear(hidden_dim, ffn_dim)
        self.w2 = nn.Linear(hidden_dim, ffn_dim)
        self.w3 = nn.Linear(ffn_dim, hidden_dim)

    def forward(self, x):
        return self.w3(F.silu(self.w1(x)) * self.w2(x))

# --- GQA Attention ---
class GQA(nn.Module):
    def __init__(self, hidden_dim, num_heads, num_kv_heads, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = hidden_dim // num_heads
        self.num_groups = num_heads // num_kv_heads
        
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, num_kv_heads * self.head_dim)
        self.v_proj = nn.Linear(hidden_dim, num_kv_heads * self.head_dim)
        self.o_proj = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = dropout

    def forward(self, x, enc_out=None, mask=None, rope_cos=None, rope_sin=None):
        batch, seq_len, _ = x.shape
        kv_input = enc_out if enc_out is not None else x

        q = self.q_proj(x).view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(kv_input).view(batch, -1, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(kv_input).view(batch, -1, self.num_kv_heads, self.head_dim).transpose(1, 2)

        if rope_cos is not None:
            q = apply_rotary_pos_emb(q, rope_cos, rope_sin)
            k = apply_rotary_pos_emb(k, rope_cos, rope_sin)

        k = k.repeat_interleave(self.num_groups, dim=1)
        v = v.repeat_interleave(self.num_groups, dim=1)

        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            attn_scores = attn_scores + mask

        attn = F.softmax(attn_scores, dim=-1)
        attn = F.dropout(attn, p=self.dropout, training=self.training)

        out = attn @ v
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.o_proj(out)


In [8]:
# ==================================
# --- CELL 8: TRANSFORMER MODEL ---
# ==================================
class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, num_kv_heads, dropout=0.1, is_decoder=False):
        super().__init__()
        self.norm1 = nn.RMSNorm(hidden_dim)
        self.attn = GQA(hidden_dim, num_heads, num_kv_heads, dropout)
        self.is_decoder = is_decoder
        if is_decoder:
            self.norm2 = nn.RMSNorm(hidden_dim)
            self.cross_attn = GQA(hidden_dim, num_heads, num_kv_heads, dropout)
        self.norm_ffn = nn.RMSNorm(hidden_dim)
        self.ffn = SwiGLU(hidden_dim, hidden_dim * 4)

    def forward(self, x, enc_out=None, mask=None, cross_mask=None, rope_cos=None, rope_sin=None):
        x = x + self.attn(self.norm1(x), mask=mask, rope_cos=rope_cos, rope_sin=rope_sin)
        if self.is_decoder:
            x = x + self.cross_attn(self.norm2(x), enc_out=enc_out, mask=cross_mask)
        x = x + self.ffn(self.norm_ffn(x))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, hidden_dim=256, num_layers=4, num_heads=8, num_kv_heads=4):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, hidden_dim)
        self.tgt_emb = nn.Embedding(tgt_vocab, hidden_dim)
        self.rope = RotaryPositionalEncoding(hidden_dim // num_heads)
        
        self.encoders = nn.ModuleList([
            TransformerBlock(hidden_dim, num_heads, num_kv_heads, is_decoder=False)
            for _ in range(num_layers)
        ])
        self.decoders = nn.ModuleList([
            TransformerBlock(hidden_dim, num_heads, num_kv_heads, is_decoder=True)
            for _ in range(num_layers)
        ])
        self.final_norm = nn.RMSNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        x = self.src_emb(src)
        rope_cos, rope_sin = self.rope(x, x.shape[1])
        for layer in self.encoders:
            x = layer(x, mask=src_mask, rope_cos=rope_cos, rope_sin=rope_sin)
        enc_out = x
        
        x = self.tgt_emb(tgt)
        rope_cos_tgt, rope_sin_tgt = self.rope(x, x.shape[1])
        for layer in self.decoders:
            x = layer(x, enc_out=enc_out, mask=tgt_mask, cross_mask=src_mask, rope_cos=rope_cos_tgt, rope_sin=rope_sin_tgt)
        return self.fc_out(self.final_norm(x))


In [9]:
# ============================
# --- CELL 9: INIT TRAINING ---
# ============================
def create_masks(src, tgt):
    src_mask = (src == PAD_ID).unsqueeze(1).unsqueeze(2).float() * -1e9
    batch, seq_len = tgt.shape
    causal = torch.triu(torch.full((seq_len, seq_len), float('-inf'), device=device), diagonal=1)
    tgt_pad = (tgt == PAD_ID).unsqueeze(1).unsqueeze(2).float() * -1e9
    return src_mask, causal + tgt_pad

model = Transformer(
    src_vocab=en_tokenizer.get_vocab_size(),
    tgt_vocab=vi_tokenizer.get_vocab_size(),
    hidden_dim=256, num_layers=4, num_heads=8, num_kv_heads=4
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.0001)

print("Model Initialized.")


Model Initialized.


In [10]:
# ==========================================
# --- CELL 10: LOAD BEST CHECKPOINT + RESUME ---
# ==========================================
print("\n--- LOAD best_transformer_en_vi.pt & TRAIN TI·∫æP ---")

ckpt = torch.load(CKPT_PATH, map_location=device)

# ckpt ƒë√∫ng format t·ª´ notebook c≈©:
# {"epoch":..., "model_state_dict":..., "optimizer_state_dict":..., "val_loss":...}

state_dict = ckpt["model_state_dict"]

# B·ªé 2 buffer RoPE b·ªã l·ªách shape gi·ªØa checkpoint v√† model hi·ªán t·∫°i
for k in ["rope.cos", "rope.sin"]:
    if k in state_dict:
        state_dict.pop(k)

# load v·ªõi strict=False ƒë·ªÉ ch·∫•p nh·∫≠n missing ƒë√∫ng 2 key tr√™n
model.load_state_dict(state_dict, strict=False)

# resume optimizer n·∫øu anh mu·ªën train ti·∫øp ƒë√∫ng tr·∫°ng th√°i (khuy·∫øn ngh·ªã)
if "optimizer_state_dict" in ckpt:
    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
    # ƒë·∫£m b·∫£o optimizer states n·∫±m ƒë√∫ng device
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)

start_epoch = ckpt.get("epoch", 0)
best_prev_val = ckpt.get("val_loss", None)

print(f"‚úÖ Loaded checkpoint from: {CKPT_PATH}")
print(f"   start_epoch = {start_epoch}")
print(f"   prev_best_val_loss = {best_prev_val}")



--- LOAD best_transformer_en_vi.pt & TRAIN TI·∫æP ---
‚úÖ Loaded checkpoint from: /kaggle/input/modeltrained/best_transformer_en_vi (3).pt
   start_epoch = 7
   prev_best_val_loss = 3.338132079766721


In [11]:
# =========================
# --- CELL 11: EARLY STOP ---
# =========================
import math
import torch

class EarlyStopping:
    def __init__(self, patience=3, min_delta=1e-4, mode="min"):
        """
        mode="min": metric c√†ng nh·ªè c√†ng t·ªët (val_loss)
        """
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode

        self.best = None
        self.num_bad_epochs = 0

    def _is_improvement(self, current):
        if self.best is None:
            return True
        if self.mode == "min":
            return current < (self.best - self.min_delta)
        else:
            return current > (self.best + self.min_delta)

    def step(self, current):
        if self._is_improvement(current):
            self.best = current
            return False
        else:
            self.num_bad_epochs += 1
            print(f"S·ªë epoch k√©m ch·∫•t l∆∞·ª£ng {self.num_bad_epochs}" )
            return self.num_bad_epochs >= self.patience


def save_checkpoint(path, model, optimizer, epoch, val_loss):
    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "val_loss": val_loss,
    }, path)


In [15]:
# ======================================
# --- CELL 12: CONTINUE TRAINING LOOP ---
# ======================================
import time 
EPOCHS_MORE = 30                 # train ti·∫øp th√™m bao nhi√™u epoch tu·ª≥ anh
early = EarlyStopping(patience=3, min_delta=1e-4, mode="min")

# n·∫øu checkpoint c√≥ best val tr∆∞·ªõc ƒë√≥ th√¨ set l√†m m·ªëc
if best_prev_val is not None:
    early.best = best_prev_val

best_path = "best_transformer_en_vi_resume.pt"

print("\n--- B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN TI·∫æP ---")

for ep in range(EPOCHS_MORE):
    epoch = start_epoch + ep  # epoch th·ª±c t√≠nh ti·∫øp
    t0 = time.time() 
    model.train()
    train_loss = 0
    pbar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{start_epoch+EPOCHS_MORE}")

    for src, tgt in pbar:
        src, tgt = src.to(device), tgt.to(device)
        tgt_input, tgt_real = tgt[:, :-1], tgt[:, 1:]
        src_mask, tgt_mask = create_masks(src, tgt_input)

        optimizer.zero_grad()
        output = model(src, tgt_input, src_mask, tgt_mask)

        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_real.reshape(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        train_loss += loss.item()
        pbar.set_postfix(loss=f"{loss.item():.4f}")

    avg_train = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input, tgt_real = tgt[:, :-1], tgt[:, 1:]
            src_mask, tgt_mask = create_masks(src, tgt_input)
            output = model(src, tgt_input, src_mask, tgt_mask)
            val_loss += criterion(output.reshape(-1, output.shape[-1]), tgt_real.reshape(-1)).item()

    avg_val = val_loss / len(val_loader)
    epoch_time = time.time() - t0
    m, s = divmod(epoch_time, 60)
    h, m = divmod(m, 60)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f} | Time: {int(h):02d}:{int(m):02d}:{s:05.2f}")

    # save best + early stopping
    if early.best is None or avg_val < early.best - early.min_delta:
        save_checkpoint(best_path, model, optimizer, epoch+1, avg_val)
        print(f"  ‚Ü≥ Saved BEST to {best_path} (val_loss={avg_val:.4f})")

    if early.step(avg_val):
        print(f"üõë Early stopping at epoch {epoch+1}. Best val_loss = {early.best:.4f}")
        break

# (tu·ª≥ ch·ªçn) v·∫´n l∆∞u last state
torch.save(model.state_dict(), "last_transformer_en_vi_resume.pth")
print("‚úÖ Done. Saved: best_transformer_en_vi_resume.pt & last_transformer_en_vi_resume.pth")



--- B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN TI·∫æP ---


Epoch 8/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:52<00:00,  9.11it/s, loss=2.7842]


Epoch 8 | Train Loss: 3.2456 | Val Loss: 2.7712 | Time: 00:13:24.62
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.7712)


Epoch 9/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:45<00:00,  9.18it/s, loss=2.5473]


Epoch 9 | Train Loss: 2.6633 | Val Loss: 2.5809 | Time: 00:13:17.36
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.5809)


Epoch 10/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:39<00:00,  9.26it/s, loss=2.5249]


Epoch 10 | Train Loss: 2.5061 | Val Loss: 2.4935 | Time: 00:13:10.75
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.4935)


Epoch 11/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:40<00:00,  9.24it/s, loss=2.3307]


Epoch 11 | Train Loss: 2.4143 | Val Loss: 2.4400 | Time: 00:13:12.39
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.4400)


Epoch 12/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:41<00:00,  9.24it/s, loss=2.5297]


Epoch 12 | Train Loss: 2.3491 | Val Loss: 2.4013 | Time: 00:13:13.24
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.4013)


Epoch 13/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:40<00:00,  9.25it/s, loss=2.3175]


Epoch 13 | Train Loss: 2.2985 | Val Loss: 2.3724 | Time: 00:13:12.14
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.3724)


Epoch 14/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:37<00:00,  9.28it/s, loss=2.2185]


Epoch 14 | Train Loss: 2.2576 | Val Loss: 2.3510 | Time: 00:13:09.97
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.3510)


Epoch 15/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:39<00:00,  9.26it/s, loss=2.2005]


Epoch 15 | Train Loss: 2.2225 | Val Loss: 2.3366 | Time: 00:13:10.75
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.3366)


Epoch 16/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:41<00:00,  9.24it/s, loss=2.1629]


Epoch 16 | Train Loss: 2.1922 | Val Loss: 2.3200 | Time: 00:13:13.74
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.3200)


Epoch 17/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:51<00:00,  9.12it/s, loss=2.1965]


Epoch 17 | Train Loss: 2.1655 | Val Loss: 2.3063 | Time: 00:13:23.57
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.3063)


Epoch 18/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:51<00:00,  9.11it/s, loss=2.1364]


Epoch 18 | Train Loss: 2.1419 | Val Loss: 2.2936 | Time: 00:13:24.29
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2936)


Epoch 19/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:51<00:00,  9.11it/s, loss=2.1977]


Epoch 19 | Train Loss: 2.1205 | Val Loss: 2.2880 | Time: 00:13:24.53
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2880)


Epoch 20/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:49<00:00,  9.13it/s, loss=2.1918]


Epoch 20 | Train Loss: 2.1010 | Val Loss: 2.2798 | Time: 00:13:21.83
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2798)


Epoch 21/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:40<00:00,  9.25it/s, loss=2.0429]


Epoch 21 | Train Loss: 2.0832 | Val Loss: 2.2759 | Time: 00:13:11.67
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2759)


Epoch 22/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:36<00:00,  9.30it/s, loss=2.0821]


Epoch 22 | Train Loss: 2.0664 | Val Loss: 2.2651 | Time: 00:13:07.79
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2651)


Epoch 23/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:39<00:00,  9.26it/s, loss=2.0927]


Epoch 23 | Train Loss: 2.0514 | Val Loss: 2.2620 | Time: 00:13:10.70
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2620)


Epoch 24/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:38<00:00,  9.27it/s, loss=2.1392]


Epoch 24 | Train Loss: 2.0373 | Val Loss: 2.2551 | Time: 00:13:10.41
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2551)


Epoch 25/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:38<00:00,  9.27it/s, loss=2.1346]


Epoch 25 | Train Loss: 2.0238 | Val Loss: 2.2501 | Time: 00:13:10.56
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2501)


Epoch 26/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:36<00:00,  9.29it/s, loss=2.2158]


Epoch 26 | Train Loss: 2.0113 | Val Loss: 2.2468 | Time: 00:13:08.05
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2468)


Epoch 27/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:36<00:00,  9.30it/s, loss=1.9397]


Epoch 27 | Train Loss: 1.9997 | Val Loss: 2.2395 | Time: 00:13:07.92
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2395)


Epoch 28/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:37<00:00,  9.28it/s, loss=2.0281]


Epoch 28 | Train Loss: 1.9888 | Val Loss: 2.2408 | Time: 00:13:09.67
S·ªë epoch k√©m ch·∫•t l∆∞·ª£ng 1


Epoch 29/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:37<00:00,  9.28it/s, loss=1.9171]


Epoch 29 | Train Loss: 1.9783 | Val Loss: 2.2344 | Time: 00:13:09.49
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2344)


Epoch 30/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:41<00:00,  9.23it/s, loss=1.9716]


Epoch 30 | Train Loss: 1.9685 | Val Loss: 2.2322 | Time: 00:13:13.80
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2322)


Epoch 34/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:41<00:00,  9.23it/s, loss=1.9761]


Epoch 34 | Train Loss: 1.9340 | Val Loss: 2.2214 | Time: 00:13:13.32
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2214)


Epoch 35/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:41<00:00,  9.24it/s, loss=1.9008]


Epoch 35 | Train Loss: 1.9260 | Val Loss: 2.2210 | Time: 00:13:13.00
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2210)


Epoch 36/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:39<00:00,  9.25it/s, loss=1.9553]


Epoch 36 | Train Loss: 1.9186 | Val Loss: 2.2194 | Time: 00:13:11.42
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2194)


Epoch 37/37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7031/7031 [12:38<00:00,  9.26it/s, loss=1.9509]


Epoch 37 | Train Loss: 1.9117 | Val Loss: 2.2168 | Time: 00:13:10.74
  ‚Ü≥ Saved BEST to best_transformer_en_vi_resume.pt (val_loss=2.2168)
‚úÖ Done. Saved: best_transformer_en_vi_resume.pt & last_transformer_en_vi_resume.pth


In [43]:
# ==========================================
# LOAD BEST CHECKPOINT
# ==========================================
import os, torch

# CKPT_PATH = "best_transformer_en_vi_resume.pt"  (ƒë√¢y l√† tr·∫°ng th√°i khi train h·∫øt 37 epoch)
CKPT_PATH = "/kaggle/input/startstatus/best_transformer_en_vi_resume.pt" # ƒë√¢y l√† tr·∫°ng th√°i ·ªü epoch: 23 val_loss: 2.261958536589542
assert os.path.exists(CKPT_PATH), f"Kh√¥ng th·∫•y file ckpt: {CKPT_PATH}"

ckpt = torch.load(CKPT_PATH, map_location=device)

# ckpt c·ªßa anh th∆∞·ªùng l√† dict c√≥ model_state_dict...
if isinstance(ckpt, dict):
    print("‚úÖ ckpt keys:", list(ckpt.keys())[:20])
    print("epoch:", ckpt.get("epoch"), "val_loss:", ckpt.get("val_loss"))
else:
    print("‚úÖ ckpt is a raw state_dict (not a dict wrapper)")


‚úÖ ckpt keys: ['epoch', 'model_state_dict', 'optimizer_state_dict', 'val_loss']
epoch: 23 val_loss: 2.261958536589542


In [44]:
# ==========================================
# LOAD WEIGHTS INTO EXISTING MODEL
# ==========================================
# 1) N·∫øu checkpoint l√† wrapper dict
state_dict = ckpt["model_state_dict"] if isinstance(ckpt, dict) and "model_state_dict" in ckpt else ckpt

# 2) model ph·∫£i t·ªìn t·∫°i t·ª´ tr∆∞·ªõc (∆∞u ti√™n t√°i s·ª≠ d·ª•ng)
assert "model" in globals(), "Ch∆∞a th·∫•y bi·∫øn model. H√£y ch·∫°y cell t·∫°o model tr∆∞·ªõc (gi·ªëng l√∫c train)."

missing, unexpected = model.load_state_dict(state_dict, strict=False)
print("‚úÖ Loaded state_dict")
print("missing keys:", len(missing))
print("unexpected keys:", len(unexpected))

model.to(device)
model.eval()


‚úÖ Loaded state_dict
missing keys: 0
unexpected keys: 0


Transformer(
  (src_emb): Embedding(10000, 256)
  (tgt_emb): Embedding(10000, 256)
  (rope): RotaryPositionalEncoding()
  (encoders): ModuleList(
    (0-3): 4 x TransformerBlock(
      (norm1): RMSNorm((256,), eps=None, elementwise_affine=True)
      (attn): GQA(
        (q_proj): Linear(in_features=256, out_features=256, bias=True)
        (k_proj): Linear(in_features=256, out_features=128, bias=True)
        (v_proj): Linear(in_features=256, out_features=128, bias=True)
        (o_proj): Linear(in_features=256, out_features=256, bias=True)
      )
      (norm_ffn): RMSNorm((256,), eps=None, elementwise_affine=True)
      (ffn): SwiGLU(
        (w1): Linear(in_features=256, out_features=1024, bias=True)
        (w2): Linear(in_features=256, out_features=1024, bias=True)
        (w3): Linear(in_features=1024, out_features=256, bias=True)
      )
    )
  )
  (decoders): ModuleList(
    (0-3): 4 x TransformerBlock(
      (norm1): RMSNorm((256,), eps=None, elementwise_affine=True)
      (

In [20]:
# ==========================================
# LOAD TEST DATA (EN->VI)
# ==========================================
TEST_SRC_PATH = "/kaggle/input/databaitoanphu/public_test.en.txt"
TEST_TGT_PATH = "/kaggle/input/databaitoanphu/public_test.vi.txt"

assert os.path.exists(TEST_SRC_PATH), f"Kh√¥ng th·∫•y test src: {TEST_SRC_PATH}"
assert os.path.exists(TEST_TGT_PATH), f"Kh√¥ng th·∫•y test tgt: {TEST_TGT_PATH}"

# N·∫øu notebook ƒë√£ c√≥ h√†m read_parallel_data(...) th√¨ d√πng lu√¥n
if "read_parallel_data" in globals():
    test_pairs = read_parallel_data(TEST_SRC_PATH, TEST_TGT_PATH)
else:
    # fallback ƒë∆°n gi·∫£n (kh√¥ng ƒë·ª•ng v√†o logic ch√≠nh)
    test_pairs = []
    with open(TEST_SRC_PATH, "r", encoding="utf-8") as fs, open(TEST_TGT_PATH, "r", encoding="utf-8") as ft:
        for s, t in zip(fs, ft):
            s, t = s.strip(), t.strip()
            if s and t:
                test_pairs.append((s, t))

print("‚úÖ #test pairs:", len(test_pairs))
print("sample:", test_pairs[0])


‚úÖ #test pairs: 3000
sample: ('Knowledge, practices in public health service utilization among health insurance card‚Äôs holders and influencing factors in Vientiane, Lao', 'Th·ª±c tr·∫°ng ki·∫øn th·ª©c v√† th·ª±c h√†nh c·ªßa ng∆∞·ªùi c√≥ th·∫ª b·∫£o hi·ªÉm y t·∫ø trong s·ª≠ d·ª•ng d·ªãch v·ª• kh√°m ch·ªØa b·ªánh ·ªü c√°c c∆° s·ªü y t·∫ø c√¥ng v√† m·ªôt s·ªë y·∫øu t·ªë ·∫£nh h∆∞·ªüng t·∫°i t·ªânh Vi√™ng ChƒÉn, CHDCND L√†o, nƒÉm 2017')


In [45]:
# =====================================
# --- FULL: BEAM SEARCH DECODING ---
# =====================================
import torch
import torch.nn.functional as F

@torch.no_grad()
def beam_search_decode(
    model,
    src,                      # (1, src_len)
    beam_size=5,
    max_len=80,
    length_penalty=0.7
):
    """
    Beam search decode for 1 sentence.

    Assumes these globals already exist (as in your notebook):
      - device
      - PAD_ID   (src pad id)
      - START_ID (tgt start id)
      - END_ID   (tgt end id)

    Returns:
      best_tokens: List[int] (includes START_ID, may include END_ID)
    """
    model.eval()
    src = src.to(device)

    # -------------------------
    # 1) Encoder (run once)
    # -------------------------
    src_mask = (src == PAD_ID).unsqueeze(1).unsqueeze(2).float() * -1e9  # (1,1,1,src_len)

    enc = model.src_emb(src)  # (1, src_len, d)
    rope_cos, rope_sin = model.rope(enc, enc.shape[1])

    for layer in model.encoders:
        enc = layer(enc, mask=src_mask, rope_cos=rope_cos, rope_sin=rope_sin)

    # -------------------------
    # 2) Beam init
    # -------------------------
    beams = [([START_ID], 0.0)]   # list of (token_ids, log_prob_sum)
    finished = []                # finished beams that ended with END_ID

    # helper for normalized score
    def norm_score(tokens, score):
        L = max(1, len(tokens))
        return score / (L ** length_penalty)

    # -------------------------
    # 3) Decode steps
    # -------------------------
    for step in range(max_len):
        new_beams = []

        for tokens, score in beams:
            # If ended, keep it
            if tokens[-1] == END_ID:
                finished.append((tokens, score))
                new_beams.append((tokens, score))
                continue

            ys = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)  # (1, t)

            # causal mask (t, t): -inf for future positions
            tgt_mask = torch.triu(
                torch.full((ys.size(1), ys.size(1)), float("-inf"), device=device),
                diagonal=1
            )

            # ---- Decoder forward ----
            dec = model.tgt_emb(ys)  # (1, t, d)
            rope_cos_t, rope_sin_t = model.rope(dec, dec.shape[1])

            x = dec
            for layer in model.decoders:
                x = layer(
                    x,
                    enc_out=enc,
                    mask=tgt_mask,
                    cross_mask=src_mask,
                    rope_cos=rope_cos_t,
                    rope_sin=rope_sin_t
                )

            logits = model.fc_out(model.final_norm(x))[:, -1, :]  # (1, vocab)
            log_probs = F.log_softmax(logits, dim=-1)             # (1, vocab)

            topk_log_probs, topk_ids = log_probs.topk(beam_size, dim=-1)

            for k in range(beam_size):
                next_id = topk_ids[0, k].item()
                next_score = score + topk_log_probs[0, k].item()
                new_beams.append((tokens + [next_id], next_score))

        # -------------------------
        # 4) Prune (keep best beams)
        # -------------------------
        new_beams = sorted(
            new_beams,
            key=lambda x: norm_score(x[0], x[1]),
            reverse=True
        )

        beams = new_beams[:beam_size]

        # Early stop: if all current beams are finished
        if all(toks[-1] == END_ID for toks, _ in beams):
            break

    # -------------------------
    # 5) Pick best candidate
    # -------------------------
    candidates = finished if finished else beams
    best_tokens, best_score = max(candidates, key=lambda x: norm_score(x[0], x[1]))
    return best_tokens


In [25]:
# ==========================================
# QUICK SANITY CHECK: TRANSLATE FEW SAMPLES
# ==========================================
NUM_SHOW = 5

for i in range(NUM_SHOW):
    src, ref = test_pairs[i]

    # (A) N·∫øu anh ƒë√£ c√≥ h√†m translate_en2vi(text) / translate_sentence(...)
    if "translate_en2vi" in globals():
        pred = translate_en2vi(src)
    elif "translate_sentence" in globals():
        # tu·ª≥ signature c·ªßa anh: translate_sentence(model, sentence, ...)
        pred = translate_sentence(model, src)
    else:
        # (B) N·∫øu anh c√≥ beam_search_decode(model, src_ids, ...)
        assert "beam_search_decode" in globals() or "greedy_decode" in globals(), \
            "Kh√¥ng th·∫•y h√†m translate/beam/greedy c√≥ s·∫µn. Anh ch·∫°y cell ƒë·ªãnh nghƒ©a decode tr∆∞·ªõc."

        # Tokenize theo ƒë√∫ng tokenizer anh ƒë√£ d√πng
        # ƒë·ªïi t√™n tokenizer cho ƒë√∫ng notebook anh:
        src_ids = en_tokenizer.encode(src).ids
        src_ids = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)

        if "beam_search_decode" in globals():
            out_ids = beam_search_decode(model, src_ids)  # n·∫øu h√†m anh signature kh√°c th√¨ ch·ªânh 1 d√≤ng n√†y
        else:
            out_ids = greedy_decode(model, src_ids)

        # decode target
        pred = vi_tokenizer.decode(out_ids)

    print("="*80)
    print("SRC:", src)
    print("REF:", ref)
    print("PRED:", pred)


SRC: Knowledge, practices in public health service utilization among health insurance card‚Äôs holders and influencing factors in Vientiane, Lao
REF: Th·ª±c tr·∫°ng ki·∫øn th·ª©c v√† th·ª±c h√†nh c·ªßa ng∆∞·ªùi c√≥ th·∫ª b·∫£o hi·ªÉm y t·∫ø trong s·ª≠ d·ª•ng d·ªãch v·ª• kh√°m ch·ªØa b·ªánh ·ªü c√°c c∆° s·ªü y t·∫ø c√¥ng v√† m·ªôt s·ªë y·∫øu t·ªë ·∫£nh h∆∞·ªüng t·∫°i t·ªânh Vi√™ng ChƒÉn, CHDCND L√†o, nƒÉm 2017
PRED:  Ki·∫øn th·ª©c, th·ª±c h√†nh s·ª≠ d·ª•ng d·ªãch v·ª• y t·∫ø c√¥ng c·ªông c·ªßa ng∆∞·ªùi d√¢n b·∫£o hi·ªÉm y t·∫ø v√† c√°c y·∫øu t·ªë ·∫£nh h∆∞·ªüng t·∫°i Vii, L√†o Cai
SRC: Describe knowledge, practices in public health service utilization among health insurance card's holders and influencing factors in Vientiane, Lao PDR, 2017.
REF: M√¥ t·∫£ th·ª±c tr·∫°ng ki·∫øn th·ª©c, th·ª±c h√†nh c·ªßa ng∆∞·ªùi c√≥ th·∫ª b·∫£o hi·ªÉm y t·∫ø trong s·ª≠ d·ª•ng d·ªãch v·ª• kh√°m ch·ªØa b·ªánh ·ªü c√°c c∆° s·ªü y t·∫ø c√¥ng v√† m·ªôt s·ªë y·∫øu t·ªë li√™n quan t·∫°i t·ªânh Vi√™ng ChƒÉn, C·ª

In [47]:
sentence_en = "Group 1: non-obese mice."
sentence_vi = "Nh√≥m 1: Chu·ªôt kh√¥ng b√©o ph√¨."

print("SRC:", sentence_en)
print("TRG:", sentence_vi)

# 1) Encode EN -> ids tensor
src_ids = en_tokenizer.encode(sentence_en).ids
src = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)  # (1, src_len)

# 2) Beam search -> ids
out_ids = beam_search_decode(model, src, beam_size=5, max_len=80)

# 3) B·ªè START/END tr∆∞·ªõc khi decode text
if len(out_ids) > 0 and out_ids[0] == START_ID:
    out_ids = out_ids[1:]
if END_ID in out_ids:
    out_ids = out_ids[:out_ids.index(END_ID)]

pred_vi = vi_tokenizer.decode(out_ids)
print("PRED:", pred_vi)


SRC: Group 1: non-obese mice.
TRG: Nh√≥m 1: Chu·ªôt kh√¥ng b√©o ph√¨.
PRED:  Nh√≥m 1: chu·ªôt kh√¥ng thu·∫ßn ch·ªßng.


In [27]:
!pip install torchtext

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from 

In [36]:
!pip -q install sacrebleu tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
# ==========================================
# --- BLEU (SACREBLEU) + PROGRESS BAR ---
# ==========================================


import sacrebleu
from tqdm.auto import tqdm
import torch

MAX_EVAL = len(test_pairs)   # ho·∫∑c 2000 ƒë·ªÉ test nhanh

pred_texts = []
ref_texts  = []

for i in tqdm(range(MAX_EVAL), desc="Computing BLEU", unit="sent"):
    src, ref = test_pairs[i]

    # d√πng h√†m d·ªãch c√≥ s·∫µn c·ªßa anh (∆∞u ti√™n)
    if "translate_en2vi" in globals():
        pred = translate_en2vi(src)
    elif "translate_sentence" in globals():
        pred = translate_sentence(model, src)
    else:
        # fallback n·∫øu anh ch·ªâ c√≥ beam_search_decode/greedy_decode
        src_ids = en_tokenizer.encode(src).ids
        src_ids = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
        out_ids = beam_search_decode(model, src_ids) if "beam_search_decode" in globals() else greedy_decode(model, src_ids)

        # b·ªè START/END n·∫øu c·∫ßn (tu·ª≥ model anh)
        if isinstance(out_ids, torch.Tensor):
            out_ids = out_ids.squeeze(0).tolist() if out_ids.ndim > 1 else out_ids.tolist()

        if len(out_ids) > 0 and out_ids[0] == START_ID:
            out_ids = out_ids[1:]
        if END_ID in out_ids:
            out_ids = out_ids[:out_ids.index(END_ID)]

        pred = vi_tokenizer.decode(out_ids)

    pred_texts.append(pred.strip())
    ref_texts.append(ref.strip())

bleu = sacrebleu.corpus_bleu(pred_texts, [ref_texts])
print("BLEU =", bleu.score)
print("detail:", bleu)


Computing BLEU:   0%|          | 0/3000 [00:00<?, ?sent/s]

BLEU = 46.57571741460038
detail: BLEU = 46.58 75.8/58.4/46.4/37.7 (BP = 0.883 ratio = 0.890 hyp_len = 89734 ref_len = 100870)


T√≠nh Bleu khi train ·ªü epoch: 23 val_loss: 2.261958536589542 

In [48]:
# ==========================================
# --- BLEU (SACREBLEU) + PROGRESS BAR ---
# ==========================================


import sacrebleu
from tqdm.auto import tqdm
import torch

MAX_EVAL = len(test_pairs)   # ho·∫∑c 2000 ƒë·ªÉ test nhanh

pred_texts = []
ref_texts  = []

for i in tqdm(range(MAX_EVAL), desc="Computing BLEU", unit="sent"):
    src, ref = test_pairs[i]

    # d√πng h√†m d·ªãch c√≥ s·∫µn c·ªßa anh (∆∞u ti√™n)
    if "translate_en2vi" in globals():
        pred = translate_en2vi(src)
    elif "translate_sentence" in globals():
        pred = translate_sentence(model, src)
    else:
        # fallback n·∫øu anh ch·ªâ c√≥ beam_search_decode/greedy_decode
        src_ids = en_tokenizer.encode(src).ids
        src_ids = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
        out_ids = beam_search_decode(model, src_ids) if "beam_search_decode" in globals() else greedy_decode(model, src_ids)

        # b·ªè START/END n·∫øu c·∫ßn (tu·ª≥ model anh)
        if isinstance(out_ids, torch.Tensor):
            out_ids = out_ids.squeeze(0).tolist() if out_ids.ndim > 1 else out_ids.tolist()

        if len(out_ids) > 0 and out_ids[0] == START_ID:
            out_ids = out_ids[1:]
        if END_ID in out_ids:
            out_ids = out_ids[:out_ids.index(END_ID)]

        pred = vi_tokenizer.decode(out_ids)

    pred_texts.append(pred.strip())
    ref_texts.append(ref.strip())

bleu = sacrebleu.corpus_bleu(pred_texts, [ref_texts])
print("BLEU =", bleu.score)
print("detail:", bleu)


Computing BLEU:   0%|          | 0/3000 [00:00<?, ?sent/s]

BLEU = 44.982471593436834
detail: BLEU = 44.98 74.8/56.8/44.3/35.4 (BP = 0.886 ratio = 0.892 hyp_len = 89958 ref_len = 100870)
