In [1]:
# Install the required libraries
!pip install -q transformers datasets evaluate sacrebleu rouge-score accelerate

# Mount Drive
from google.colab import drive
import os

# Mount Google Drive only when the mount point is not already present, so
# re-running this cell does not trigger a second mount request.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Root folder on Drive under which the tokenized datasets, training
# checkpoints and the final model are written by later cells.
PROJECT_DIR = "/content/drive/MyDrive/NLP_New"
print(f" Working at: {PROJECT_DIR}")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Mounted at /content/drive
 Working at: /content/drive/MyDrive/NLP_New


In [None]:
#@title Load data gốc
import pandas as pd

# Drive folder holding the raw parallel corpus; the split sub-folders
# (train-en-vi, dev-2012-en-vi, test-2013-en-vi) are resolved relative to it.
DATA_PATH = "/content/drive/MyDrive/IWSLT2015_dataset"

def load_parallel(en_path, vi_path):
    """Read a line-aligned English/Vietnamese file pair into a DataFrame.

    Args:
        en_path: Path to the UTF-8 English file, one sentence per line.
        vi_path: Path to the UTF-8 Vietnamese file, one sentence per line.

    Returns:
        A pandas DataFrame with columns ``en`` and ``vi``; row *i* holds
        line *i* of each file with surrounding whitespace stripped.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    with open(en_path, "r", encoding="utf-8") as src_en, \
         open(vi_path, "r", encoding="utf-8") as src_vi:
        columns = {
            "en": list(map(str.strip, src_en)),
            "vi": list(map(str.strip, src_vi)),
        }

    # The corpus is only usable if both sides are aligned line by line.
    assert len(columns["en"]) == len(columns["vi"]), "Số dòng en/vi không khớp!"
    return pd.DataFrame(columns)

# Load the three splits of the parallel corpus from DATA_PATH. Each call
# returns a DataFrame with an `en` and a `vi` column (see load_parallel).
# NOTE: later cells overwrite these columns in place during preprocessing,
# so re-run this cell to get the raw text back.
train_df = load_parallel(
    f"{DATA_PATH}/train-en-vi/train.en",
    f"{DATA_PATH}/train-en-vi/train.vi",
)

# Development split (tst2012 files), used as the evaluation set during training.
dev_df = load_parallel(
    f"{DATA_PATH}/dev-2012-en-vi/tst2012.en",
    f"{DATA_PATH}/dev-2012-en-vi/tst2012.vi",
)

# Held-out test split (tst2013 files).
test_df = load_parallel(
    f"{DATA_PATH}/test-2013-en-vi/tst2013.en",
    f"{DATA_PATH}/test-2013-en-vi/tst2013.vi",
)

# Report the number of sentence pairs per split and preview the training data.
print(f" Số lượng mẫu:")
print(f"  Train: {len(train_df):,} | Dev: {len(dev_df)} | Test: {len(test_df)}")
display(train_df.head())

 Số lượng mẫu:
  Train: 133,317 | Dev: 1553 | Test: 1268


Unnamed: 0,en,vi
0,Rachel Pike : The science behind a climate hea...,Khoa học đằng sau một tiêu đề về khí hậu
1,"In 4 minutes , atmospheric chemist Rachel Pike...","Trong 4 phút , chuyên gia hoá học khí quyển Ra..."
2,I &apos;d like to talk to you today about the ...,Tôi muốn cho các bạn biết về sự to lớn của nhữ...
3,Headlines that look like this when they have t...,Có những dòng trông như thế này khi bàn về biế...
4,They are both two branches of the same field o...,Cả hai đều là một nhánh của cùng một lĩnh vực ...


In [None]:
#@title Làm sạch dữ liệu (Preprocessing)
import html
import re
import pandas as pd

# Standard text-cleaning function
def preprocess_text(text):
    """Normalise one sentence of the corpus.

    HTML entities (``&quot;``, ``&apos;``, ...) are decoded, newlines are
    turned into spaces, every run of whitespace is collapsed to a single
    space and the result is trimmed.

    Args:
        text: The raw sentence. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        decoded = html.unescape(text)
        single_line = decoded.replace("\n", " ")
        return re.sub(r'\s+', ' ', single_line).strip()
    return ""

print(" Đang làm sạch dữ liệu...")
# Clean both language columns of all three splits, in place.
# (Assumes train_df, dev_df and test_df were loaded by the earlier cell.)
for split_df in (train_df, dev_df, test_df):
    for lang_col in ('en', 'vi'):
        split_df[lang_col] = split_df[lang_col].apply(preprocess_text)

print(" Đã làm sạch xong! Mẫu dữ liệu sau khi clean:")
display(train_df.head(3))

 Đang làm sạch dữ liệu...
 Đã làm sạch xong! Mẫu dữ liệu sau khi clean:


Unnamed: 0,en,vi
0,Rachel Pike : The science behind a climate hea...,Khoa học đằng sau một tiêu đề về khí hậu
1,"In 4 minutes , atmospheric chemist Rachel Pike...","Trong 4 phút , chuyên gia hoá học khí quyển Ra..."
2,I 'd like to talk to you today about the scale...,Tôi muốn cho các bạn biết về sự to lớn của nhữ...


In [4]:
#@title  Tokenize
from transformers import T5Tokenizer
from datasets import Dataset

# Load Tokenizer
# MODEL_NAME is the Hugging Face Hub id of the checkpoint; the same id is
# used again later in the notebook when the model weights are loaded.
MODEL_NAME = "VietAI/vit5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# ViT5 format: "en: <text>" -> "vi: <text>"
def format_example(en_text, vi_text):
    """Build the prefixed (source, target) strings for one sentence pair.

    Newlines inside either sentence are replaced by spaces and the sentence
    is trimmed before the language prefix is prepended.

    Args:
        en_text: English source sentence.
        vi_text: Vietnamese target sentence.

    Returns:
        A ``(input_text, target_text)`` tuple of the form
        ``("en: <english>", "vi: <vietnamese>")``.
    """
    source, target = (
        sentence.replace('\n', ' ').strip() for sentence in (en_text, vi_text)
    )
    return f"en: {source}", f"vi: {target}"

def create_hf_dataset(df):
    """Convert a sentence-pair DataFrame into a Hugging Face ``Dataset``.

    Args:
        df: DataFrame with string columns ``en`` and ``vi``.

    Returns:
        A ``Dataset`` with two columns, ``input_text`` and ``target_text``,
        holding the prefixed strings produced by ``format_example`` in the
        same row order as ``df``.
    """
    # Zip the two columns directly instead of using df.iterrows(): iterrows
    # materialises a Series for every row, which is needlessly slow on the
    # ~133k-row training split and yields exactly the same values here.
    pairs = [format_example(en, vi) for en, vi in zip(df['en'], df['vi'])]

    return Dataset.from_dict({
        'input_text': [source for source, _ in pairs],
        'target_text': [target for _, target in pairs],
    })

# Build one text-pair Dataset per split. The DataFrames are read as they are
# at this point, i.e. cleaned only if the preprocessing cell has been run.
train_ds = create_hf_dataset(train_df)
dev_ds = create_hf_dataset(dev_df)
test_ds = create_hf_dataset(test_df)

# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of prefixed source/target strings for seq2seq training.

    Both sides are truncated and padded to exactly 128 tokens.

    Args:
        examples: Batch mapping with ``input_text`` and ``target_text``
            entries, each a list of strings.

    Returns:
        The tokenizer output for the inputs with an added ``labels`` entry:
        the target token ids, where every padding id has been replaced by
        -100 (the label value excluded from the loss).
    """
    encode_kwargs = dict(max_length=128, truncation=True, padding='max_length')

    model_inputs = tokenizer(examples['input_text'], **encode_kwargs)
    target_ids = tokenizer(examples['target_text'], **encode_kwargs)['input_ids']

    # T5: padding positions in the labels must be set to -100.
    model_inputs['labels'] = [
        [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# Tokenize
# batched=True makes `map` hand tokenize_function a batch of examples
# (lists of strings) per call rather than one example at a time.
tokenized_train_vit5 = train_ds.map(tokenize_function, batched=True)
tokenized_dev_vit5   = dev_ds.map(tokenize_function, batched=True)
tokenized_test_vit5  = test_ds.map(tokenize_function, batched=True)
print(" Tokenization hoàn tất!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1553 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

 Tokenization hoàn tất!


In [5]:
# Persist the tokenized splits to Drive so that a later session can reload
# them without repeating the tokenization step.
for tokenized_split, target_path in (
    (tokenized_train_vit5, f"{PROJECT_DIR}/tokenized_train_vit5"),
    (tokenized_dev_vit5, f"{PROJECT_DIR}/tokenized_dev_vit5"),
    (tokenized_test_vit5, f"{PROJECT_DIR}/tokenized_test_vit5"),
):
    tokenized_split.save_to_disk(target_path)

Saving the dataset (0/1 shards):   0%|          | 0/133317 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1553 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1268 [00:00<?, ? examples/s]

In [2]:
from datasets import load_from_disk

# Reload the tokenized splits written by the save_to_disk cell. Requires
# PROJECT_DIR from the setup cell to be defined in the current kernel.
tokenized_train_vit5 = load_from_disk(f"{PROJECT_DIR}/tokenized_train_vit5")
tokenized_dev_vit5   = load_from_disk(f"{PROJECT_DIR}/tokenized_dev_vit5")
tokenized_test_vit5  = load_from_disk(f"{PROJECT_DIR}/tokenized_test_vit5")

print(" Đã load dataset đã mã hóa token")

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# MODEL_NAME and tokenizer are defined here again (same values as in the
# tokenization cell) — presumably so this cell works in a fresh session that
# only reloads the saved datasets; confirm before removing the duplication.
MODEL_NAME = "VietAI/vit5-base"
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model: {MODEL_NAME}...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(device)

# Summary: target device, parameter count in millions, tokenizer size.
print(f"\n Model loaded:")
print(f"   Device: {device}")
print(f"   Params: {sum(p.numel() for p in model.parameters())/1e6:.0f}M")
print(f"   Tokenizer vocab: {len(tokenizer):,}")

 Đã load dataset đã mã hóa token
Loading model: VietAI/vit5-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/904M [00:00<?, ?B/s]


 Model loaded:
   Device: cuda
   Params: 226M
   Tokenizer vocab: 36,096


In [5]:
#@title Fine-tune Configuration
import torch
from transformers import Seq2SeqTrainingArguments as TrainingArguments


# Fine-tuning configuration. `TrainingArguments` is imported in this cell as
# an alias of `Seq2SeqTrainingArguments`, which is what provides the
# generation-related options used below.
training_args = TrainingArguments(
    # Checkpoints are written here; the training cell scans the same folder
    # to resume from the latest checkpoint.
    output_dir=f"{PROJECT_DIR}/vit5-orginal-checkpoint",
    overwrite_output_dir=True,

    # Hyperparameters (chosen to keep training time down)
    num_train_epochs=3,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # 2 x 32 = 64 sequences per optimizer step per device
    learning_rate=5e-5,
    weight_decay=0.01,

    # Optimization
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    label_smoothing_factor=0.1,

    # Logging & saving. Saving and evaluating on the same step interval is
    # required for load_best_model_at_end to find a checkpoint for each eval.
    logging_steps=100,
    save_steps=1000,
    eval_strategy="steps",
    eval_steps=1000,
    save_total_limit=3,

    # Generate + metric
    predict_with_generate=True,
    # Match the 128-token limit used when tokenizing the targets. Without an
    # explicit value, generation during evaluation falls back to the model's
    # generation config (library default: 20 tokens — TODO confirm for this
    # checkpoint), which would truncate predictions and understate BLEU/ROUGE.
    generation_max_length=128,

    # Colab / logging
    report_to="none",
    ddp_find_unused_parameters=False,

    # Mixed precision (only when a GPU is available)
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    dataloader_pin_memory=True,

    # Reload the checkpoint with the best eval_loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

print(f" Model size: {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")


 Model size: 225.95M parameters


In [6]:
#@title Metrics
from evaluate import load
import numpy as np

# Metric objects from the `evaluate` library, used by compute_metrics.
bleu = load("bleu")
rouge = load("rouge")

# Cached tokenizer properties used by safe_decode / compute_metrics:
# pad_id replaces -100 and out-of-range ids before decoding, and
# vocab_size is the exclusive upper bound for a valid token id.
pad_id = tokenizer.pad_token_id
vocab_size = tokenizer.vocab_size

def safe_decode(batch_ids):
    """Decode a batch of token-id sequences, tolerating invalid ids.

    Every id is cast to ``int``. Ids outside ``[0, vocab_size - 1]`` —
    including the -100 label mask — are replaced by ``pad_id`` so that the
    tokenizer never receives an id it cannot map.

    Args:
        batch_ids: Iterable of sequences of token ids (ints or values
            convertible with ``int``).

    Returns:
        A list with one decoded string per sequence, special tokens removed.
    """
    def _sanitize(raw_id):
        token_id = int(raw_id)
        return token_id if 0 <= token_id < vocab_size else pad_id

    cleaned_batch = [[_sanitize(raw_id) for raw_id in sequence] for sequence in batch_ids]
    return tokenizer.batch_decode(cleaned_batch, skip_special_tokens=True)

def _strip_target_prefix(text, prefix="vi:"):
    """Remove a single leading ``prefix`` (and surrounding spaces) from ``text``."""
    text = text.strip()
    if text.startswith(prefix):
        return text[len(prefix):].strip()
    return text


def compute_metrics(eval_pred):
    """Compute BLEU and ROUGE for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element is the
            token-id array; ``labels`` may contain -100 at masked positions.

    Returns:
        A dict with the keys ``bleu``, ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Labels: -100 (masked positions) -> pad_id so they can be decoded.
    labels = np.where(labels != -100, labels, pad_id)

    pred_str = safe_decode(predictions)
    label_str = safe_decode(labels)

    # Drop only the LEADING "vi:" task prefix. A plain str.replace would also
    # delete the same characters inside a sentence (e.g. "hành vi: ...") and
    # corrupt both predictions and references.
    pred_str = [_strip_target_prefix(s) for s in pred_str]
    label_str = [_strip_target_prefix(s) for s in label_str]

    # BLEU: references = list[list[str]]
    bleu_score = bleu.compute(
        predictions=pred_str,
        references=[[ref] for ref in label_str],
        max_order=4,
    )

    # ROUGE: references = list[str]
    rouge_score = rouge.compute(
        predictions=pred_str,
        references=label_str,
        use_stemmer=False,
    )

    return {
        "bleu": bleu_score["bleu"],
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
    }


In [None]:
#@title Start Training (Auto Resume)
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
import os

# Folder scanned for checkpoints to resume from; it is the same path that
# is passed as `output_dir` in the training arguments.
checkpoint_dir = f"{PROJECT_DIR}/vit5-orginal-checkpoint"
checkpoint = None


def _checkpoint_step(entry_name):
    """Return the step number of a ``checkpoint-<step>`` entry, else None."""
    if not entry_name.startswith("checkpoint-"):
        return None
    try:
        return int(entry_name.split("-")[1])
    except ValueError:
        # e.g. "checkpoint-best" or "checkpoint-tmp": not a numbered checkpoint.
        return None


# Keep only real checkpoint directories with a numeric step, so a stray file
# or an oddly named folder cannot crash the step parsing below.
checkpoints = []
if os.path.isdir(checkpoint_dir):
    checkpoints = [
        d for d in os.listdir(checkpoint_dir)
        if _checkpoint_step(d) is not None
        and os.path.isdir(os.path.join(checkpoint_dir, d))
    ]

if checkpoints:
    # Highest step number = most recent checkpoint.
    latest = sorted(checkpoints, key=_checkpoint_step)[-1]
    checkpoint = os.path.join(checkpoint_dir, latest)
    print(f" Resume: {latest}")
else:
    print(" New training")
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Trainer wiring: train on the tokenized training split, evaluate on the dev
# split, and score generated text with compute_metrics (BLEU / ROUGE).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_vit5,
    eval_dataset=tokenized_dev_vit5,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print(" Training...")
# `checkpoint` is None for a fresh run, or the path of the latest
# checkpoint folder found above when resuming.
trainer.train(resume_from_checkpoint=checkpoint)

# Save the final model and its tokenizer side by side so the folder can be
# loaded with from_pretrained on its own.
save_dir = f"{PROJECT_DIR}/vit5-original-final"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print(f" Saved: {save_dir}")

 New training
 Training...


Step,Training Loss,Validation Loss
