In [1]:
!pip install rouge-score
!pip install underthesea
!pip install transformers[torch] datasets evaluate sentencepiece accelerate nltk

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4b1743e0cc44284f5128164ff35d68628064592016f4031346149b49aa12af95
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_6

In [3]:
from datasets import load_dataset, DatasetDict
from underthesea import text_normalize
from rouge_score import rouge_scorer
import re
import torch
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq,Seq2SeqTrainingArguments,Seq2SeqTrainer

import numpy as np

# Load Dataset

In [4]:
ds = load_dataset("vietgpt/news_summarization_vi")
ds

README.md: 0.00B [00:00, ?B/s]

(…)-00000-of-00001-7f6f00607f418ae2.parquet:   0%|          | 0.00/115M [00:00<?, ?B/s]

(…)-00000-of-00001-5f6b579a81bd695a.parquet:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/65361 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'summary'],
        num_rows: 65361
    })
    test: Dataset({
        features: ['content', 'summary'],
        num_rows: 1000
    })
})

In [6]:
train_val_split = ds['train'].train_test_split(
    test_size=0.1, 
    seed=42        
)

final_ds = DatasetDict({
    'train': train_val_split['train'],      
    'validation': train_val_split['test'],  
    'test': ds['test']                      
})
print(final_ds)


DatasetDict({
    train: Dataset({
        features: ['content', 'summary'],
        num_rows: 58824
    })
    validation: Dataset({
        features: ['content', 'summary'],
        num_rows: 6537
    })
    test: Dataset({
        features: ['content', 'summary'],
        num_rows: 1000
    })
})


# Preprocessing

In [5]:
final_ds = ds

In [6]:
def is_not_empty(example):
    """Kiểm tra xem các cột cần thiết có dữ liệu hợp lệ hay không."""
    content_ok = example['content'] and example['content'].strip()
    summary_ok = example['summary'] and example['summary'].strip()
    return content_ok and summary_ok

cleaned_ds = final_ds.filter(
    is_not_empty,
    num_proc=2
)

for split in final_ds.keys():
    original_rows = len(final_ds[split])
    cleaned_rows = len(cleaned_ds[split])
    removed_rows = original_rows - cleaned_rows
    print(f"\nTrong tập '{split}':")
    print(f"  - Số hàng ban đầu: {original_rows}")
    print(f"  - Số hàng sau khi lọc: {cleaned_rows}")
    print(f"  - Số hàng đã loại bỏ: {removed_rows}")

Filter (num_proc=2):   0%|          | 0/65361 [00:00<?, ? examples/s]

Filter (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]


Trong tập 'train':
  - Số hàng ban đầu: 65361
  - Số hàng sau khi lọc: 64342
  - Số hàng đã loại bỏ: 1019

Trong tập 'test':
  - Số hàng ban đầu: 1000
  - Số hàng sau khi lọc: 981
  - Số hàng đã loại bỏ: 19


In [7]:
def preprocess_function(batch):
    """Áp dụng clean_text cho cả cột content và summary trong một batch."""
    
    # batch['content'] là một list các chuỗi content
    # batch['summary'] là một list các chuỗi summary
    
    cleaned_contents = [clean_text(text) for text in batch['content']]
    cleaned_summaries = [clean_text(text) for text in batch['summary']]
    
    # Trả về một dictionary với các cột đã được cập nhật
    batch['content'] = cleaned_contents
    batch['summary'] = cleaned_summaries
    
    return batch

In [8]:
def clean_text(text):
    text = text_normalize(text)
    text = re.sub(r"[^\w\s.,:;!?()/()-]", "", text)
    text = re.sub(r"([.,:;!?)])(\S)", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text).strip()
    
    def capitalize_after_punctuation(text):
        sentences = re.split(r"([.!?])", text)
        result = ""
        for i in range(len(sentences)):
            if sentences[i] in [".", "!", "?"] and i + 1 < len(sentences):
                sentences[i + 1] = sentences[i + 1].strip().capitalize()
            result += sentences[i]
        return result.strip()
    
    return capitalize_after_punctuation(text)

In [9]:
cleaned_ds = cleaned_ds.map(
    preprocess_function, 
    batched=True,
    num_proc=2
)

Map (num_proc=2):   0%|          | 0/64342 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/981 [00:00<?, ? examples/s]

# Fine-tuning

In [12]:
def compute_metrics(eval_pred, tokenizer):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1) if predictions.ndim == 3 else predictions

    # Kiểm tra nếu có giá trị âm trong predictions
    for batch in predictions:
        if any(token < 0 for token in batch.flatten()):
            print("Có giá trị âm trong predictions:", batch)
            break

    # Loại bỏ giá trị âm trước khi decode
    predictions = [[token for token in pred if token >= 0] for pred in predictions]
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Sử dụng rouge_scorer để tính ROUGE
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = [scorer.score(pred, label) for pred, label in zip(decoded_preds, decoded_labels)]

    # Trung bình các giá trị ROUGE
    rouge1 = sum(score["rouge1"].fmeasure for score in scores) / len(scores)
    rouge2 = sum(score["rouge2"].fmeasure for score in scores) / len(scores)
    rougeL = sum(score["rougeL"].fmeasure for score in scores) / len(scores)

    return {
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
    }

In [10]:
# Load tokenizer và model
model_name = "VietAI/vit5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/904M [00:00<?, ?B/s]

In [11]:
# Tokenize dataset
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["content"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
tokenized_dataset = cleaned_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/64342 [00:00<?, ? examples/s]

Map:   0%|          | 0/981 [00:00<?, ? examples/s]

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id, return_tensors="pt")

In [14]:
print(torch.cuda.device_count()) 

2


In [15]:
save_path = "/kaggle/working/"

In [42]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=save_path,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    eval_strategy='epoch',
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,
    logging_first_step=True
)

In [24]:
training_args = Seq2SeqTrainingArguments(output_dir = save_path,
                                      do_train=True,
                                      do_eval=False,
                                      num_train_epochs=3,
                                      learning_rate=5e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      group_by_length=True,
                                      save_strategy = "no",
                                      # gradient_accumulation_steps=4,
                                      fp16=True,
                                      # gradient_checkpointing=True,
                                      )

In [27]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
)

In [28]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,2.198
1000,0.9167
1500,0.8596
2000,0.8347
2500,0.8094
3000,0.7996
3500,0.7809
4000,0.7672
4500,0.7374
5000,0.763


TrainOutput(global_step=24129, training_loss=0.6718259308102899, metrics={'train_runtime': 24336.3425, 'train_samples_per_second': 7.932, 'train_steps_per_second': 0.991, 'total_flos': 1.1754470578323456e+17, 'train_loss': 0.6718259308102899, 'epoch': 3.0})

In [37]:
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Đã lưu xong ở đây này > {save_path}")

Đã lưu xong ở đây này > /kaggle/working/


# Evaluation

In [38]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/ViT5-finetune")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [57]:
test_tokenized_datasets= tokenized_dataset['test']
test_tokenized_datasets

Dataset({
    features: ['content', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 981
})

In [58]:
columns_to_remove = ['content', 'summary']
test_tokenized_datasets = test_tokenized_datasets.remove_columns(columns_to_remove)
print(test_tokenized_datasets)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 981
})


In [59]:
import torch 
import numpy as np
from evaluate import load
import tqdm
from tqdm import tqdm
metrics = load("rouge")

max_target_length = 128
dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)


for i, batch in enumerate(tqdm(dataloader)):
  outputs = model.generate(
      input_ids=batch['input_ids'].to('cuda'),
      max_length=max_target_length,
      attention_mask=batch['attention_mask'].to('cuda'),
  )
  with tokenizer.as_target_tokenizer():
    outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

    labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
    actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
  predictions.extend(outputs)
  references.extend(actuals)
  metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()


100%|██████████| 31/31 [02:40<00:00,  5.18s/it]


{'rouge1': 0.7619180229245577,
 'rouge2': 0.5433097205308821,
 'rougeL': 0.5618782374224933,
 'rougeLsum': 0.5615724955255126}