## Import libary và load datasets

In [1]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models
from datasets import load_dataset
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [2]:
ds = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [4]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Encoding

### Defined toknize

In [5]:
# Word-based Tokenizer
tokenizer_en = Tokenizer(models.WordLevel(unk_token="<unk>")) # Tạo 2 tokenizer
tokenizer_vi = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace() # Tách dựa trên khoảng trắng
tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(
    vocab_size=15000, 
    min_frequency=2,  # xác định tần suất xuất hiện tối thiểu để được đưa vào vocab
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

# Train tokenizer
tokenizer_en.train_from_iterator(ds["train"]["en"], trainer) # Vừa xây dựng vocab vừa tokenize
tokenizer_vi.train_from_iterator(ds["train"]["vi"], trainer)

# Save tokenizer
tokenizer_en.save("tokenizer_en.json")
tokenizer_vi.save("tokenizer_vi.json")

In [6]:
vocab_size_src = len(tokenizer_en.get_vocab())
vocab_size_tgt = len(tokenizer_vi.get_vocab())

### Encoding

In [7]:
MAX_LENGTH = 75
from transformers import  PreTrainedTokenizerFast
tokenizer_en = PreTrainedTokenizerFast(tokenizer_file="tokenizer_en.json", unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")
tokenizer_vi = PreTrainedTokenizerFast(tokenizer_file="tokenizer_vi.json",
                                       unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [8]:
added_tokens_encoder = tokenizer_en.added_tokens_encoder
added_tokens_encoder['<pad>']

0

In [9]:
def preprocess_function(examples):
    src_text = examples["en"]
    tgt_text = ["<bos> " + text + " <eos>" for text in examples["vi"]]

    src_encodings = tokenizer_en(
        src_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tgt_encodings = tokenizer_vi(
        tgt_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    return {
        # Trả về list, không phải tensor
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],  # Trả về list, không phải tensor
    }


# Áp dụng map() và set_format("torch") để tự động chuyển thành tensor
preprocessed_train = ds['train'].map(preprocess_function, batched=True)
preprocessed_val = ds['validation'].map(preprocess_function, batched=True)
preprocessed_test = ds['test'].map(preprocess_function, batched=True)

preprocessed_train = preprocessed_train.remove_columns(["en", "vi"])
preprocessed_val = preprocessed_val.remove_columns(["en", "vi"])
preprocessed_test = preprocessed_test.remove_columns(["en", "vi"])

# Định dạng dữ liệu dưới dạng torch.Tensor
preprocessed_train.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_val.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_test.set_format(type="torch", columns=["input_ids", "labels"])

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [10]:
# Kiểm tra đầu ra
print(preprocessed_train[0])
print(type(preprocessed_train[0]["input_ids"]))  # Phải là torch.Tensor

{'input_ids': tensor([6675,    1,   57,   60,  339,  604,   13,  744, 5643,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]), 'labels': tensor([   2, 1960,   66, 1157,  131,    8,  376,  113,   38,  417,  735,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

## Modeling

### RNNs

In [11]:
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqRNNConfig(PretrainedConfig):
    def __init__(self, vocab_size_src = 10000, vocab_size_tgt = 10000, embedding_dim = 128, hidden_size = 128, dropout = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.dropout = dropout


class EncoderRNN(nn.Module):
    def __init__(self, input_size=10000, embedding_dim=128, hidden_size=128, dropout=0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, dropout=0.0)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded)
        return output, hidden
    
class DecoderRNN(nn.Module):
    def __init__(self, output_size=10000, embedding_dim=128, hidden_size=128, dropout=0.1):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, dropout=0.0)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden_decoder):
        embedded = self.embedding(x)
        output, last_hidden = self.gru(embedded, hidden_decoder)
        output = self.out(output)
        return output, last_hidden
    
class Seq2SeqRNNmodel(PreTrainedModel):
    def __init__(self, config: Seq2SeqRNNConfig, tokenizer_en: PreTrainedTokenizerFast):
        super().__init__(config)
        self.encoder = EncoderRNN(config.vocab_size_src, config.embedding_dim, config.hidden_size, config.dropout)
        self.decoder = DecoderRNN(config.vocab_size_tgt, config.embedding_dim, config.hidden_size, config.dropout)
        self.bos_idx = tokenizer_en.added_tokens_encoder['<bos>']
        self.loss_fn = nn.CrossEntropyLoss(
            ignore_index=tokenizer_en.added_tokens_encoder['<pad>'])
        
    def forward(self, input_ids, labels):
        labels = labels.to(torch.long)
        
        batch_size, seq_len  = labels.size()
        encoder_output, encoder_hidden = self.encoder(input_ids)
        # print(encoder_output.shape, encoder_hidden.shape)

        inputs_decoder = torch.full((batch_size, 1), self.bos_idx, dtype=torch.long).to(input_ids.device) # BOS token
        hidden_decoder = encoder_hidden
        output_decoder_lst = []

        # teacher forcing
        for i in range(seq_len):
            output_decoder, hidden_decoder = self.decoder(inputs_decoder, hidden_decoder)
            # print(output_decoder.shape, hidden_decoder.shape)
            inputs_decoder = labels[:, i].unsqueeze(1)
            output_decoder_lst.append(output_decoder)

        # print(len(output_decoder_lst), output_decoder_lst[0].shape)
        logits = torch.cat(output_decoder_lst, dim=1) # (batch_size, seq_len, vocab_size_tgt)
        loss = self.loss_fn(logits.permute(0, 2, 1), labels)
        
        return {"loss": loss, "logits": logits}

In [12]:
# config = Seq2SeqRNNConfig()
# model = Seq2SeqRNNmodel(config, tokenizer_en)
# model.forward(preprocessed_train[0: 3]['input_ids'],
#               preprocessed_train[0: 3]['labels'])
#{'loss': tensor(9.2388, device='cuda:0', grad_fn=<NllLoss2DBackward0>)}

In [13]:
config = Seq2SeqRNNConfig(vocab_size_src, vocab_size_tgt)
model = Seq2SeqRNNmodel(config, tokenizer_en)

In [14]:
# Disable wandb
from transformers import Trainer, TrainingArguments
import os
os.environ['WANDB_DISABLED'] = 'true'

# Training
training_args = TrainingArguments(
    output_dir="/kaggle/working/en-vi-machine-translation",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=50,
    learning_rate=2e-5,
    save_total_limit=1,
    report_to="none",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_train,
    eval_dataset=preprocessed_val
)
trainer.train()



Epoch,Training Loss,Validation Loss
1,9.3848,9.207369
2,8.5569,7.537413
3,6.9975,6.658733
4,6.4701,6.342468
5,6.2544,6.197031
6,6.148,6.119614
7,6.0874,6.072582
8,6.0476,6.038831
9,6.0163,6.010701
10,5.9879,5.984472




TrainOutput(global_step=13050, training_loss=5.794143898916428, metrics={'train_runtime': 9330.3581, 'train_samples_per_second': 714.426, 'train_steps_per_second': 1.399, 'total_flos': 5889418457850000.0, 'train_loss': 5.794143898916428, 'epoch': 50.0})

In [15]:
! pip install sacrebleu==2.5.1

Collecting sacrebleu==2.5.1
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu==2.5.1)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [16]:
import sacrebleu

def greedy_decode(model, src, max_len, tokenizer, device="cpu"):
    src = torch.tensor(src).unsqueeze(0).to(device)
    memory, hidden = model.encoder(src)
    y_start = torch.full((1, 1), tokenizer.added_tokens_encoder["<bos>"], dtype=torch.long).to(device)

    for i in range(max_len):
        output_decoder, hidden = model.decoder(y_start, hidden)
        _, next_word = output_decoder.max(dim=-1)
        y_start = torch.cat([y_start, next_word[:, -1].unsqueeze(1)], dim=1)  # Fix lỗi lặp từ

        if next_word[:, -1].squeeze().item() == tokenizer.added_tokens_encoder["<eos>"]:
            break
    return y_start.tolist()





# Test
def translate():
    model.eval()
    src = preprocessed_test[0]["input_ids"]
    tgt = preprocessed_test[0]["labels"]
    output = greedy_decode(model, src, len(src) + 1,
                           tokenizer_vi, device="cuda")
    decode_out = tokenizer_vi.decode(output[0])
    print("Input:", tokenizer_en.decode(src))
    print("Target:", tokenizer_vi.decode(tgt))
    print("Predict:", decode_out)

    bleu_score = sacrebleu.corpus_bleu(
        [decode_out], [[tokenizer_vi.decode(tgt)]], force=True).score
    print("BLEU Score:", bleu_score)

translate()

Input: When I was little , I thought my country was the best on the planet , and I grew up singing a song called & quot ; Nothing To <unk> . & quot ; <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Target: <bos> Khi tôi còn nhỏ , Tôi nghĩ rằng <unk> Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài & quot ; Chúng ta chẳng có gì phải ghen tị . & quot ; <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predict: <bos> <bos> Và tôi đã , và tôi đã . <eos>
BLEU Score: 0.007253777235634502


  src = torch.tensor(src).unsqueeze(0).to(device)
