In [3]:
!pip install evaluate datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==20

# Import library

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import tqdm
import evaluate

2025-04-21 13:25:17.448567: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745241917.652473      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745241917.711031      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load dataset

In [5]:
ds = load_dataset("thainq107/iwslt2015-en-vi")
train_data, valid_data, test_data = ds["train"], ds["validation"], ds["test"]

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [6]:
ds 

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

# Train BPE tokenizers

In [9]:
# English tokenizer
tokenizer_en = Tokenizer(BPE())
tokenizer_en.pre_tokenizer = Whitespace()
trainer_en = BpeTrainer(
    vocab_size=30000,
    special_tokens=["<unk>", "<p ad>", "<sos>", "<eos>"]
)
tokenizer_en.train_from_iterator(train_data["en"], trainer=trainer_en)

# Vietnamese tokenizer
tokenizer_vi = Tokenizer(BPE())
tokenizer_vi.pre_tokenizer = Whitespace()
trainer_vi = BpeTrainer(
    vocab_size=30000,
    special_tokens=["<unk>", "<pad>", "<sos>", "<eos>"]
)
tokenizer_vi.train_from_iterator(train_data["vi"], trainer=trainer_vi)









# Tokenize & numericalize

In [10]:
def tokenize_example(example, tokenizer_src, tokenizer_trg, sos_token, eos_token, max_length=1000):
    # Source = English, Target = Vietnamese
    src_ids = tokenizer_src.encode(example["en"]).ids[:max_length]
    trg_ids = tokenizer_trg.encode(example["vi"]).ids[:max_length]
    # Add <sos> and <eos>
    src = [tokenizer_src.token_to_id(sos_token)] + src_ids + [tokenizer_src.token_to_id(eos_token)]
    trg = [tokenizer_trg.token_to_id(sos_token)] + trg_ids + [tokenizer_trg.token_to_id(eos_token)]
    return {"en_ids": src, "vi_ids": trg}

fn_kwargs = {
    "tokenizer_src": tokenizer_en,
    "tokenizer_trg": tokenizer_vi,
    "sos_token": "<sos>",
    "eos_token": "<eos>",
    "max_length": 30
}

# Note: we no longer remove_columns=["en","vi"]
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data  = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

# Convert lists to torch.Tensor

In [11]:
def to_tensor(example):
    return {
        "en_ids": torch.tensor(example["en_ids"], dtype=torch.long),
        "vi_ids": torch.tensor(example["vi_ids"], dtype=torch.long)
    }

# after tokenization (lists of ints in en_ids/vi_ids):
train_data = train_data.with_format(
    type="torch",
    columns=["en_ids","vi_ids"],
    output_all_columns=True
)
valid_data = valid_data.with_format(type="torch", columns=["en_ids","vi_ids"], output_all_columns=True)
test_data  = test_data.with_format(type="torch", columns=["en_ids","vi_ids"], output_all_columns=True)


# DataLoader setup

In [12]:
def get_collate_fn(pad_id):
    def collate_fn(batch):
        src = [ex["en_ids"] for ex in batch]
        trg = [ex["vi_ids"] for ex in batch]
        src = nn.utils.rnn.pad_sequence(src, padding_value=pad_id)
        trg = nn.utils.rnn.pad_sequence(trg, padding_value=pad_id)
        return {"en_ids": src, "vi_ids": trg}
    return collate_fn

pad_id = tokenizer_en.token_to_id("<pad>")
batch_size = 32

train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size,
    collate_fn=get_collate_fn(pad_id), shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    valid_data, batch_size=batch_size,
    collate_fn=get_collate_fn(pad_id)
)
test_loader  = torch.utils.data.DataLoader(
    test_data,  batch_size=batch_size,
    collate_fn=get_collate_fn(pad_id)
)

# Model Seq2Seq

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src_len, batch_size]
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch_size]
        input = input.unsqueeze(0)
        # input = [1, batch_size]
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [1, batch_size, hid_dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch_size, output_dim]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        assert encoder.rnn.hidden_size == decoder.rnn.hidden_size
        assert encoder.rnn.num_layers == decoder.rnn.num_layers
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [src_len, batch_size], trg = [trg_len, batch_size]
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input = trg[0, :]  # <sos>

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs


# Initialize Model + Training Setup

In [26]:
import os
# right:
INPUT_DIM  = tokenizer_en.get_vocab_size()
OUTPUT_DIM = tokenizer_vi.get_vocab_size()
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM     = 512
N_LAYERS    = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

def init_weights(m):
    for p in m.parameters():
        nn.init.uniform_(p.data, -0.08, 0.08)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

CHECKPOINT_PATH = '/kaggle/input/bpe_seq2seq/pytorch/default/1/best-model.pt'  

if os.path.isfile(CHECKPOINT_PATH):
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
    print(f"✔ Loaded checkpoint from {CHECKPOINT_PATH}, resuming training.")
else:
    # only initialize weights if there's no checkpoint
    def init_weights(m):
        for p in m.parameters():
            nn.init.uniform_(p.data, -0.08, 0.08)
    model.apply(init_weights)
    print("✗ No checkpoint found — training from scratch.")

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

✔ Loaded checkpoint from /kaggle/input/bpe_seq2seq/pytorch/default/1/best-model.pt, resuming training.
Trainable parameters: 38,106,416


  model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))


# Training and Evaluation Functions

In [27]:
def train_fn(model, loader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0
    for batch in loader:
        src = batch["en_ids"].to(device)
        trg = batch["vi_ids"].to(device)
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio=0.5)
        # output = [trg_len, batch_size, vocab_size]
        output_dim = output.shape[-1]
        out = output[1:].view(-1, output_dim)
        tgt = trg[1:].view(-1)
        loss = criterion(out, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def eval_fn(model, loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in loader:
            src = batch["en_ids"].to(device)
            trg = batch["vi_ids"].to(device)
            output = model(src, trg, teacher_forcing_ratio=0.5)  # no teacher forcing
            output_dim = output.shape[-1]
            out = output[1:].view(-1, output_dim)
            tgt = trg[1:].view(-1)
            loss = criterion(out, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

# Training loops

In [28]:
N_EPOCHS = 2
CLIP     = 1.0

best_valid = float('inf')
for epoch in range(N_EPOCHS):
    train_loss = train_fn(model, train_loader, optimizer, criterion, CLIP, DEVICE)
    valid_loss = eval_fn(model, valid_loader, criterion, DEVICE)
    if valid_loss < best_valid:
        best_valid = valid_loss
        torch.save(model.state_dict(), 'best-model.pt')
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):.3f}")
    print(f"          | Val   Loss: {valid_loss:.3f} | Val   PPL: {np.exp(valid_loss):.3f}")


Epoch 1 | Train Loss: 3.273 | Train PPL: 26.382
          | Val   Loss: 4.087 | Val   PPL: 59.551
Epoch 2 | Train Loss: 3.245 | Train PPL: 25.669
          | Val   Loss: 4.108 | Val   PPL: 60.810


# Testing + BLEU

In [29]:
model.load_state_dict(torch.load('best-model.pt'))
test_loss = eval_fn(model, test_loader, criterion, DEVICE)
print(f"Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):.3f}")

# Translation helper
def translate_sentence(
    sentence, model, tokenizer_src, tokenizer_trg,
    lower=True, sos_token="<sos>", eos_token="<eos>",
    device=DEVICE, max_len=30
):
    model.eval()
    tokens = sentence.split()  # already whitespace-tokenized
    tokens = [sos_token] + tokens + [eos_token]
    src_ids = tokenizer_src.encode(" ".join(tokens)).ids
    src_tensor = torch.LongTensor(src_ids).unsqueeze(1).to(device)
    hidden, cell = model.encoder(src_tensor)

    outputs = [tokenizer_trg.token_to_id(sos_token)]
    for _ in range(max_len):
        prev = torch.LongTensor([outputs[-1]]).to(device)
        pred, hidden, cell = model.decoder(prev, hidden, cell)
        top1 = pred.argmax(1).item()
        outputs.append(top1)
        if top1 == tokenizer_trg.token_to_id(eos_token):
            break

    return tokenizer_trg.decode(outputs)

  model.load_state_dict(torch.load('best-model.pt'))


Test Loss: 4.036 | Test PPL: 56.574


In [30]:
# Compute BLEU on test set
bleu = evaluate.load("bleu")
predictions = []
references  = []

for ex in tqdm.tqdm(test_data):
    # ex["en"] & ex["vi"] are still present because we didn't remove them
    pred = translate_sentence(
        ex["en"], model,
        tokenizer_en, tokenizer_vi,
        device=DEVICE
    )
    predictions.append(pred)
    references.append([ex["vi"]])

results = bleu.compute(predictions=predictions, references=references)
print(f"BLEU score = {results['bleu']:.4f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

100%|██████████| 1268/1268 [00:20<00:00, 61.36it/s]


BLEU score = 0.0685


# test model

In [31]:
sentence = test_data[0]["en"]
expected_translation = test_data[0]["vi"]
print("Source (English):", sentence)
print("Expected Translation (Vietnamese):", expected_translation)
translation = translate_sentence(sentence, model, tokenizer_en, tokenizer_vi,device=DEVICE)
print("Model Translation:", translation)

Source (English): When I was little , I thought my country was the best on the planet , and I grew up singing a song called &quot; Nothing To Envy . &quot;
Expected Translation (Vietnamese): Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài &quot; Chúng ta chẳng có gì phải ghen tị . &quot;
Model Translation: Khi tôi nhỏ , tôi nghĩ rằng tôi là đất nước là một thế giới tốt đẹp nhất và tôi đã hát rằng tôi gọi là & quot ; Ain


In [None]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Print total GPU memory
    total_memory = torch.cuda.get_device_properties(0).total_memory
    # Print reserved memory
    reserved_memory = torch.cuda.memory_reserved(0)
    # Print allocated memory
    allocated_memory = torch.cuda.memory_allocated(0)
    # Calculate and print free memory
    free_memory = total_memory - reserved_memory
    
    print(f"Total GPU memory: {total_memory / 1e9:.2f} GB")
    print(f"Reserved memory: {reserved_memory / 1e9:.2f} GB")
    print(f"Allocated memory: {allocated_memory / 1e9:.2f} GB")
    print(f"Free memory: {free_memory / 1e9:.2f} GB")
else:
    print("CUDA is not available")

In [None]:
# Free cached memory
torch.cuda.empty_cache()

# Delete unused variables
import gc
gc.collect()
torch.cuda.empty_cache()