## Import libary và load datasets

In [1]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models
from datasets import load_dataset
import torch
import torch.nn as nn

In [2]:
ds = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [4]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Encoding

### Defined toknize

In [5]:
# Word-based Tokenizer
tokenizer_en = Tokenizer(models.WordLevel(unk_token="<unk>")) # Tạo 2 tokenizer
tokenizer_vi = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace() # Tách dựa trên khoảng trắng
tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(
    vocab_size=15000, 
    min_frequency=2,  # xác định tần suất xuất hiện tối thiểu để được đưa vào vocab
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

# Train tokenizer
tokenizer_en.train_from_iterator(ds["train"]["en"], trainer) # Vừa xây dựng vocab vừa tokenize
tokenizer_vi.train_from_iterator(ds["train"]["vi"], trainer)

# Save tokenizer
save_dir = "/kaggle/working/transformer"
os.makedirs(save_dir, exist_ok=True)

tokenizer_en.save(os.path.join(save_dir, "tokenizer_en.json"))
tokenizer_vi.save(os.path.join(save_dir, "tokenizer_vi.json"))

### Encoding

In [6]:
MAX_LENGTH = 75
from transformers import  PreTrainedTokenizerFast
tokenizer_en = PreTrainedTokenizerFast(tokenizer_file="/kaggle/working/transformer/tokenizer_en.json", unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")
tokenizer_vi = PreTrainedTokenizerFast(tokenizer_file="/kaggle/working/transformer/tokenizer_vi.json",
                                       unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [7]:
added_tokens_encoder = tokenizer_en.added_tokens_encoder
added_tokens_encoder['<pad>']

0

In [8]:
def preprocess_function(examples):
    src_text = examples["en"]
    tgt_text = ["<bos> " + text + " <eos>" for text in examples["vi"]]

    src_encodings = tokenizer_en(
        src_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tgt_encodings = tokenizer_vi(
        tgt_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    return {
        # Trả về list, không phải tensor
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],  # Trả về list, không phải tensor
    }


# Áp dụng map() và set_format("torch") để tự động chuyển thành tensor
preprocessed_train = ds['train'].map(preprocess_function, batched=True)
preprocessed_val = ds['validation'].map(preprocess_function, batched=True)
preprocessed_test = ds['test'].map(preprocess_function, batched=True)

# Định dạng dữ liệu dưới dạng torch.Tensor
preprocessed_train.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_val.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_test.set_format(type="torch", columns=["input_ids", "labels"])

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [9]:
# Kiểm tra đầu ra
print(preprocessed_train[0])
print(type(preprocessed_train[0]["input_ids"]))  # Phải là torch.Tensor

{'input_ids': tensor([6675,    1,   57,   60,  339,  604,   13,  744, 5643,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]), 'labels': tensor([   2, 1960,   66, 1157,  131,    8,  376,  113,   38,  417,  735,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

## Modeling

#### Example padding mask

In [10]:
import torch
import torch.nn.functional as F

# Từ điển ánh xạ từ thành ID
vocab = {"Hello": 1, "world": 2, "!": 3, "<PAD>": 0}

# Câu đã tokenized
sentence = ["Hello", "world", "!", "<PAD>", "<PAD>"]
sentence_ids = torch.tensor([[vocab[word] for word in sentence]])

# Tạo Padding Mask (True nếu là <PAD>)
padding_mask = (sentence_ids == vocab["<PAD>"])
print("Padding Mask:\n", padding_mask)

# Tạo ma trận Attention Scores ngẫu nhiên (giả lập)
seq_len = sentence_ids.shape[1]
attention_scores = torch.rand(seq_len, seq_len)  # Ma trận Attention (5x5)

print("\nAttention Scores trước khi áp dụng mask:\n", attention_scores)

# Áp dụng padding mask: Gán -inf cho vị trí <PAD> để loại bỏ khi tính softmax
attention_scores = attention_scores.masked_fill(padding_mask, float('-inf'))

print("\nAttention Scores sau khi áp dụng mask:\n", attention_scores)

# Tính Softmax để thấy sự khác biệt
attention_probs = F.softmax(attention_scores, dim=-1)
print("\nAttention Weights sau Softmax:\n", attention_probs)

Padding Mask:
 tensor([[False, False, False,  True,  True]])

Attention Scores trước khi áp dụng mask:
 tensor([[0.6813, 0.7242, 0.1787, 0.7231, 0.4579],
        [0.3532, 0.7621, 0.3309, 0.6635, 0.0976],
        [0.2634, 0.5605, 0.6592, 0.0744, 0.3951],
        [0.3802, 0.1098, 0.9905, 0.3906, 0.0272],
        [0.8086, 0.1569, 0.3764, 0.0476, 0.9944]])

Attention Scores sau khi áp dụng mask:
 tensor([[0.6813, 0.7242, 0.1787,   -inf,   -inf],
        [0.3532, 0.7621, 0.3309,   -inf,   -inf],
        [0.2634, 0.5605, 0.6592,   -inf,   -inf],
        [0.3802, 0.1098, 0.9905,   -inf,   -inf],
        [0.8086, 0.1569, 0.3764,   -inf,   -inf]])

Attention Weights sau Softmax:
 tensor([[0.3775, 0.3941, 0.2284, 0.0000, 0.0000],
        [0.2871, 0.4321, 0.2808, 0.0000, 0.0000],
        [0.2610, 0.3513, 0.3877, 0.0000, 0.0000],
        [0.2775, 0.2117, 0.5108, 0.0000, 0.0000],
        [0.4608, 0.2401, 0.2991, 0.0000, 0.0000]])


### Transformer

In [11]:
def generate_square_subsequent_mask(sz, device):
    mask = torch.triu(torch.ones(sz, sz, device=device)) == 1  
    mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt, tokenizer_en, tokenizer_vi, device):
    # src = tgt = [batch_size, seq_len]
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    mask_decoder = generate_square_subsequent_mask(tgt_seq_len, device)
    mask_encoder = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    
    src_padding_mask = (src == tokenizer_en.added_tokens_encoder['<pad>'])
    tgt_padding_mask = (tgt == tokenizer_vi.added_tokens_encoder['<pad>'])
    return mask_encoder, mask_decoder,src_padding_mask, tgt_padding_mask

In [12]:
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqTransformerConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size_src=13685,
        vocab_size_tgt = 13685,
        max_seq_length=50,
        d_model=256,
        num_heads=8,
        num_layers=2,
        dropout=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout


class Seq2SeqTransformer(PreTrainedModel):
    config_class = Seq2SeqTransformerConfig
    def __init__(self, config: Seq2SeqTransformerConfig, tokenizer_vi: PreTrainedTokenizerFast):
        super().__init__(config)
        self.embedding_src = nn.Embedding(config.vocab_size_src, config.d_model)
        self.embedding_tgt = nn.Embedding(config.vocab_size_tgt, config.d_model)

        self.position_embedding_src = nn.Embedding(
            config.max_seq_length, config.d_model
        )
        self.position_embedding_tgt = nn.Embedding(
            config.max_seq_length, config.d_model
        )
        
        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_decoder_layers=config.num_layers,
            num_encoder_layers=config.num_layers,
            dropout=config.dropout,
            batch_first=True
        ) # --> [B, Seq_length, E]

        self.classifier = nn.Linear(
            config.d_model, config.vocab_size_tgt
        )

        self.loss_fn = nn.CrossEntropyLoss(
            ignore_index=tokenizer_vi.added_tokens_encoder['<pad>'])

    def forward(self, input_ids, labels):
        # teacher forcing
        tgt_input = labels[:, :-1]
        tgt_output = labels[:, 1:]

        batch_size , seq_len_src = input_ids.shape
        _, seq_len_tgt = tgt_input.shape

        src_positions = torch.arange(seq_len_src, device=input_ids.device).unsqueeze(0)
        tgt_positions = torch.arange(
            seq_len_tgt, device=tgt_input.device).unsqueeze(0)

        src_embedded = self.embedding_src(input_ids) + self.position_embedding_src(src_positions) # broad casting để cộng lại
        tgt_embedded = self.embedding_tgt(tgt_input) + self.position_embedding_tgt(tgt_positions)
        src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = create_mask(
            input_ids, tgt_input, tokenizer_en, tokenizer_vi, device=input_ids.device)

        outs = self.transformer(
            src_embedded, tgt_embedded, src_mask, tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = self.classifier(outs)
        loss = self.loss_fn(logits.permute(0, 2, 1), tgt_output)

        return {"loss": loss, "logits": logits}
    
    def encoder(self, src, src_mask):
        _, seq_len_src = src.shape
        src_positions = torch.arange(seq_len_src, device=src.device).unsqueeze(0)
        src_embedded = self.embedding_src(src) + self.position_embedding_src(src_positions)
        return self.transformer.encoder(src_embedded, src_mask)

    def decoder(self, tgt, encoder_output, tgt_mask):
        _, seq_len_tgt = tgt.shape
        tgt_positions = torch.arange(seq_len_tgt, device=tgt.device).unsqueeze(0)
        tgt_embedded = self.embedding_tgt(tgt) + self.position_embedding_tgt(tgt_positions)
        return self.transformer.decoder(tgt_embedded, encoder_output, tgt_mask)
    

In [13]:
# config = Seq2SeqTransformerConfig(vocab_size_src, vocab_size_tgt, MAX_LENGTH)
# model = Seq2SeqTransformer(config, tokenizer_vi)
# pred = model.forward(preprocessed_train[0: 1]['input_ids'],
#               preprocessed_train[0: 1]['labels'])
# pred["logits"].shape

In [14]:
vocab_size_src = len(tokenizer_en.get_vocab())
vocab_size_tgt = len(tokenizer_vi.get_vocab())
vocab_size_tgt

13685

In [15]:
config = Seq2SeqTransformerConfig(
    vocab_size_src=vocab_size_src,
    vocab_size_tgt=vocab_size_tgt,
    max_seq_length=MAX_LENGTH
)
model = Seq2SeqTransformer(config, tokenizer_vi)

In [16]:
# Disable wandb
from transformers import Trainer, TrainingArguments
import os
os.environ['WANDB_DISABLED'] = 'true'

# Training
training_args = TrainingArguments(
    output_dir="/kaggle/working/en-vi-machine-translation",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=100,
    learning_rate=2e-5,
    save_total_limit=1,
    report_to="none",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_train,
    eval_dataset=preprocessed_val
)
trainer.train()



Epoch,Training Loss,Validation Loss
1,7.9784,6.862296
2,6.2827,5.557428
3,5.0079,4.191047
4,3.8292,3.112408
5,2.9232,2.329579
6,2.2531,1.77582
7,1.7674,1.385724
8,1.4169,1.109539
9,1.1621,0.911126
10,0.9729,0.763206




TrainOutput(global_step=26100, training_loss=0.46446370037122703, metrics={'train_runtime': 18047.2038, 'train_samples_per_second': 738.713, 'train_steps_per_second': 1.446, 'total_flos': 5.582746229800486e+16, 'train_loss': 0.46446370037122703, 'epoch': 100.0})

In [17]:
! pip install sacrebleu==2.5.1

Collecting sacrebleu==2.5.1
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu==2.5.1)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


## Inferences

In [18]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, device="cpu"):
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Encoder tạo memory
    memory = model.encoder(src, src_mask)
    # Bắt đầu câu đầu ra với token start_symbol
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len - 1):
        memory = memory.to(device)
        tgt_mask = generate_square_subsequent_mask(ys.size(1), device).type(torch.bool).to(device)
        # Decoder dự đoán token tiếp theo
        out = model.decoder(ys, memory, tgt_mask)
        prob = model.classifier(out[:, -1, :])  # LM Head
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()  # Lấy index của token dự đoán
        # Nối token mới vào chuỗi đầu ra
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        # Dừng nếu gặp token <eos>
        if next_word == 3:  # EOS token ID là 3
            break
    return ys


def translate(model, src_sentence, device):
    model.eval()
    # Tokenize input sentence
    input_ids = tokenizer_en([src_sentence], return_tensors="pt")["input_ids"].to(device)
    num_tokens = input_ids.shape[1]
    # Tạo mask cho encoder
    src_mask = torch.zeros((num_tokens, num_tokens)).type(torch.bool).to(device)
    # Thực hiện greedy decoding
    tgt_tokens = greedy_decode(model, input_ids, src_mask, max_len=num_tokens + 5, start_symbol=2, device=device)

    # Convert token ID thành chuỗi
    return tokenizer_vi.decode(tgt_tokens.detach().cpu()[0])

# Test example
out = translate(model, "i go to school", model.device)  # Expected output: "tôi đến trường"
print(out)

# Evaluate on test set
from tqdm import tqdm
import sacrebleu

pred_sentences, tgt_sentences = [], []

for sample in tqdm(ds["test"].select(range(100))):
    src_sentence = sample["en"]
    tgt_sentence = sample["vi"]

    pred_sentence = translate(model, src_sentence, device=model.device)
    pred_sentences.append(pred_sentence)
    tgt_sentences.append(tgt_sentence)

# Compute BLEU score
bleu_score = sacrebleu.corpus_bleu(pred_sentences, [tgt_sentences], force=True)
print("BLEU Score:", bleu_score.score)

<bos> cậu cậu cậu cậu cậu cậu cậu cậu


100%|██████████| 100/100 [00:08<00:00, 12.09it/s]

BLEU Score: 0.02541372002153892



