In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, BitsAndBytesConfig, DataCollatorForSeq2Seq
from datasets import Dataset, concatenate_datasets, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
from peft import prepare_model_for_kbit_training
import torch

import numpy as np
import pandas as pd
import os
import random
BASEPATH = os.path.dirname(os.getcwd())
DATASETPATH = os.path.join(BASEPATH,"datasets")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datasetTrain = pd.read_csv(f"{DATASETPATH}/train.csv")
datasetTest = pd.read_csv(f"{DATASETPATH}/test.csv")
datasetValidation = pd.read_csv(f"{DATASETPATH}/validation.csv")

datasetTrain = Dataset.from_pandas(datasetTrain)
datasetTest = Dataset.from_pandas(datasetTest)
datasetValidation = Dataset.from_pandas(datasetValidation)

datasetTrain = concatenate_datasets([datasetTrain, datasetTest])

In [3]:
datasetFull = DatasetDict(
    {
        "train" : datasetTrain,
        "test" : datasetTest
    }
)
datasetFull

DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 298603
    })
    test: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 11490
    })
})

In [4]:
shuffled_train = list(range(0,298603))
shuffled_test = list(range(0,11490))

random.seed(42)
random.shuffle(shuffled_train)
random.shuffle(shuffled_train)
random.shuffle(shuffled_test)
random.shuffle(shuffled_test)

In [5]:
datasetFull["train"] = datasetFull["train"].select(shuffled_train[:5000])
datasetFull["test"] = datasetFull["test"].select(shuffled_test[:1100])

## 1. T5 Model

In [6]:
model_name = "google/flan-t5-large"

In [7]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

In [8]:
model_tokenizer = T5Tokenizer.from_pretrained(model_name)
model_base = T5ForConditionalGeneration.from_pretrained(model_name, quantization_config = quantization_config, device_map = "auto", torch_dtype = torch.bfloat16)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
`torch_dtype` is deprecated! Use `dtype` instead!
Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "d:\Conda\envs\rag_learning\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "d:\Conda\envs\rag_learning\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "d:\Conda\envs\rag_learning\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "d:\Conda\envs\rag_learning\lib\codecs.py", line 322, in decode
    (re

In [9]:
model_base.gradient_checkpointing_enable()
model_base = prepare_model_for_kbit_training(model_base)

In [10]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"], 
    task_type=TaskType.SEQ_2_SEQ_LM, 
    bias="none"
)
model_base = get_peft_model(model_base, lora_config)

In [11]:
model_base.enable_input_require_grads()

In [12]:
def tokenization(batch) : 
    question = [f"summary : {q}" for q in batch["article"]]
    hasil_tokenized = model_tokenizer(question, padding = "max_length", truncation = True, max_length = 512)
    labels = model_tokenizer(batch["highlights"], padding = "max_length", truncation = True, max_length = 256)
    ignore_pad = []
    for lab in labels["input_ids"] :
        ignore_pad.append([]) 
        for num in lab : 
            if num != model_tokenizer.pad_token_id : 
                ignore_pad[-1].append(num)
            else : 
                ignore_pad[-1].append(-100)
    hasil_tokenized["labels"] = ignore_pad
    return hasil_tokenized

datasetFull = datasetFull.map(tokenization, batched = True, batch_size = 16)

Map: 100%|██████████| 5000/5000 [00:22<00:00, 226.31 examples/s]
Map: 100%|██████████| 1100/1100 [00:05<00:00, 200.54 examples/s]


In [13]:
trainingArgs = Seq2SeqTrainingArguments(
    output_dir="./hasil-training",
    per_device_train_batch_size=16,       # Jauh lebih aman untuk 8GB
    per_device_eval_batch_size=8,
    learning_rate=2e-4,                
    save_strategy="steps",
    eval_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_rougeL", 
    greater_is_better=True,             
    optim="paged_adamw_8bit",          
    gradient_checkpointing=True,     
    predict_with_generate=True,         
    report_to="none"
)

In [14]:
import evaluate
import numpy as np

# Load metrik ROUGE
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # Jika preds adalah tuple, ambil elemen pertama
    if isinstance(preds, tuple):
        preds = preds[0]
        
    # Decode prediksi menjadi teks
    decoded_preds = model_tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Ganti -100 pada label agar bisa di-decode (kembalikan ke pad_token_id)
    labels = np.where(labels != -100, labels, model_tokenizer.pad_token_id)
    decoded_labels = model_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE butuh newline setelah setiap kalimat untuk skor yang lebih akurat
    decoded_preds = ["\n".join(p.strip().split()) for p in decoded_preds]
    decoded_labels = ["\n".join(l.strip().split()) for l in decoded_labels]

    # Hitung skor ROUGE
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # Ambil skor dalam persentase (0-100)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    
    return result

In [15]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=model_tokenizer,
    model=model_base,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 # Optimasi untuk GPU Tensor Cores
)

In [16]:
trainer = Seq2SeqTrainer(
    model=model_base,
    args=trainingArgs, # Menggunakan TrainingArguments yang sudah kita bahas sebelumnya
    train_dataset=datasetFull["train"],
    eval_dataset=datasetFull["test"],
    tokenizer=model_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [17]:
trainer.train()
trainer.evaluate()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,1.8498,1.705168,25.412,11.952,20.7921,25.4127


{'eval_loss': 1.7051680088043213,
 'eval_rouge1': 25.412,
 'eval_rouge2': 11.952,
 'eval_rougeL': 20.7921,
 'eval_rougeLsum': 25.4127,
 'eval_runtime': 415.8947,
 'eval_samples_per_second': 2.645,
 'eval_steps_per_second': 0.332,
 'epoch': 3.0}

In [None]:
trainer.save_model(f"{BASEPATH}/modelAbstractive")

In [7]:
from transformers import pipeline

pipe_real = pipeline(task = "summarization", model = model_name)
# pipe_trained = pipeline(task = "summarization", model = f"{BASEPATH}/model")

Device set to use cuda:0


In [8]:
# =====================================================================================
# BEFORE DOING FINE TUNING : 
# ===================================================================================== 

jawaban_real = []
jawaban_predict = []

for batch in range(0,1016,16) : 
    q = datasetFull["test"]["article"][batch:batch+16]
    hasil = pipe_real(q, truncation = True)
    for i in hasil : 
        jawaban_predict.append(i["summary_text"])
    jawaban_real.extend(datasetFull["test"]["highlights"][batch:batch+16])
    break

import evaluate 

rouge_score = evaluate.load("rouge")
score_for_rouge = rouge_score.compute(predictions = jawaban_predict, references = jawaban_real)

bertscore = evaluate.load("bertscore")
score_for_bert = bertscore.compute(predictions = jawaban_predict, references=jawaban_real, lang = "en")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
score_for_rouge

{'rouge1': np.float64(0.26819049749228985),
 'rouge2': np.float64(0.12966378944930246),
 'rougeL': np.float64(0.19755372911598235),
 'rougeLsum': np.float64(0.24546768159373245)}

In [12]:
print(f"Bert Score : {round(float(np.mean(score_for_bert['f1'])),3) * 100}%")

Bert Score : 85.6%


In [None]:
# =====================================================================================
# AFTER DOING FINE TUNING : 
# ===================================================================================== 

jawaban_real = []
jawaban_predict = []

for batch in range(0,1016,16) : 
    q = datasetFull["test"]["article"][batch:batch+16]
    hasil = pipe_trained(q, truncation = True)
    for i in hasil : 
        jawaban_predict.append(i["summary_text"])
    jawaban_real.extend(datasetFull["test"]["highlights"][batch:batch+16])
    break

import evaluate 

rouge_score = evaluate.load("rouge")
score_for_rouge = rouge_score.compute(predictions = jawaban_predict, references = jawaban_real)

bertscore = evaluate.load("bertscore")
score_for_bert = bertscore.compute(predictions = jawaban_predict, references=jawaban_real, lang = "en")

In [None]:
score_for_rouge

{'rouge1': np.float64(0.3858755621784705),
 'rouge2': np.float64(0.17576210771150474),
 'rougeL': np.float64(0.2826192216294863),
 'rougeLsum': np.float64(0.32857307755883425)}

In [None]:
print(f"Bert Score : {round(float(np.mean(score_for_bert['f1'])),3) * 100}%")

Bert Score : 88.0%
