In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
import nltk


  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:

nltk.download('punkt')

# Load dataset
dset = load_dataset("SEACrowd/liputan6", trust_remote_code=True)

# Use subsets of the dataset
train_data = dset["train"].select(range(100))
val_data = dset["validation"].select(range(20))
test_data = dset["test"].select(range(20))

# Load tokenizer and model
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:

# Preprocessing function
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset
tokenized_train = train_data.map(preprocess_function, batched=True, remove_columns=train_data.column_names)
tokenized_val = val_data.map(preprocess_function, batched=True, remove_columns=val_data.column_names)
tokenized_test = test_data.map(preprocess_function, batched=True, remove_columns=test_data.column_names)


Map: 100%|██████████| 100/100 [00:00<00:00, 585.15 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 476.05 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 453.01 examples/s]


In [4]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [5]:

# Fine-tune the model
trainer.train()


  0%|          | 0/75 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
 13%|█▎        | 10/75 [01:25<09:00,  8.31s/it]

{'loss': 41.7127, 'grad_norm': 1503.1400146484375, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}


 27%|██▋       | 20/75 [02:48<07:31,  8.21s/it]

{'loss': 40.829, 'grad_norm': 3812.9248046875, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}


                                               
 33%|███▎      | 25/75 [03:36<06:32,  7.85s/it]

{'eval_loss': 22.271984100341797, 'eval_runtime': 8.7052, 'eval_samples_per_second': 2.297, 'eval_steps_per_second': 0.574, 'epoch': 1.0}


 40%|████      | 30/75 [04:41<08:23, 11.18s/it]

{'loss': 40.7342, 'grad_norm': 2222.057373046875, 'learning_rate': 1.2e-05, 'epoch': 1.2}


 53%|█████▎    | 40/75 [05:59<04:41,  8.04s/it]

{'loss': 38.6041, 'grad_norm': 1497.711669921875, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}


 67%|██████▋   | 50/75 [07:22<03:35,  8.62s/it]

{'loss': 37.5655, 'grad_norm': 2438.293701171875, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


                                               
 67%|██████▋   | 50/75 [07:31<03:35,  8.62s/it]

{'eval_loss': 20.08449935913086, 'eval_runtime': 9.4463, 'eval_samples_per_second': 2.117, 'eval_steps_per_second': 0.529, 'epoch': 2.0}


 80%|████████  | 60/75 [09:20<02:21,  9.43s/it]

{'loss': 37.8014, 'grad_norm': 3073.8251953125, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}


 93%|█████████▎| 70/75 [10:52<00:45,  9.07s/it]

{'loss': 37.9038, 'grad_norm': 1303.491943359375, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.8}


                                               
100%|██████████| 75/75 [12:00<00:00,  9.00s/it]

{'eval_loss': 19.302310943603516, 'eval_runtime': 14.6585, 'eval_samples_per_second': 1.364, 'eval_steps_per_second': 0.341, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
100%|██████████| 75/75 [12:13<00:00,  9.79s/it]

{'train_runtime': 733.8765, 'train_samples_per_second': 0.409, 'train_steps_per_second': 0.102, 'train_loss': 39.19485677083333, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=39.19485677083333, metrics={'train_runtime': 733.8765, 'train_samples_per_second': 0.409, 'train_steps_per_second': 0.102, 'total_flos': 158624907264000.0, 'train_loss': 39.19485677083333, 'epoch': 3.0})

In [15]:
from nltk.translate.bleu_score import sentence_bleu

# Evaluate BLEU score
def compute_bleu(data):
    references = [[word_tokenize(summary)] for summary in data["summary"]]
    predictions = []
    for article in data["document"]:
        inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
        pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(word_tokenize(pred_summary))
    return corpus_bleu(references, predictions)

average_bleu = compute_bleu(test_data)
print(f"Average BLEU Score: {average_bleu}")

# Predict on test data and print BLEU score for each sample
for i in range(10):  # Predict and display summaries for first 10 examples
    article = test_data[i]["document"]
    reference_summary = test_data[i]["summary"]

    # Generate prediction
    inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Tokenize references and prediction
    reference_tokens = word_tokenize(reference_summary)
    prediction_tokens = word_tokenize(pred_summary)

    # Compute BLEU score for the sample
    sample_bleu = sentence_bleu([reference_tokens], prediction_tokens)

    print(f"Article: {article}...")
    print(f"Reference Summary: {reference_summary}")
    print(f"Predicted Summary: {pred_summary}")
    print(f"BLEU Score: {sample_bleu}\n")


Average BLEU Score: 9.264942934643022e-158
Article: Liputan6 . com , Bangka : Kapal patroli Angkatan Laut Republik Indonesia , Belinyu , baru-baru ini , menangkap tiga kapal nelayan berbendera Thailand , yakni KM Binatama , KM Sumber Jaya II , dan KM Mataram di Perairan Belitung Utara . Ketiga kapal itu ditangkap karena melanggar zona ekonomi ekslusif Indonesia . Saat ini , kapal-kapal itu diamankan di Pos Lanal Pelabuhan Pangkalan Balam , Bangka-Belitung . Menurut Komandan Pangkalan TNI AL Bangka Letnan Kolonel Laut Fredy Egam , selain menangkap tiga kapal , ALRI juga memeriksa 43 anak buah kapal . Mereka disergap saat sedang mengangkat jaring pukat harimau di Perairan Belitung Utara . Dari jumlah itu , hanya enam orang yang dijadikan tersangka , yakni tiga nahkoda dan tiga kepala kamar mesin kapal . Sedangkan ABK yang lain akan dideportasi ke negara asalnya . Meski berhasil menahan enam tersangka , TNI AL gagal mengamankan ikan tangkapan nelayan Thailand tersebut . Sebab , sebelum pa