## Abstractive Summarization : T5

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer,Seq2SeqTrainingArguments, BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, pipeline
from datasets import Dataset, DatasetDict
import os
import pandas as pd
import json
import re
from rouge_score import rouge_scorer
from statistics import mean
import numpy as np
import torch
import random
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder of the Liputan6 "canonical" splits, resolved against the working directory.
data_root = os.path.join(os.getcwd(), "liputan6_data", "canonical")

# os.listdir returns entries in arbitrary order; sorting keeps the file order (and
# everything derived from it, e.g. index-based sub-sampling) stable between runs.
canonical_dev = sorted(os.listdir(os.path.join(data_root, "dev")))
canonical_test = sorted(os.listdir(os.path.join(data_root, "test")))
canonical_train = sorted(os.listdir(os.path.join(data_root, "train")))

In [3]:
def preprocessingText(text) : 
    """Clean up a space-tokenised string.

    The substitutions are applied one after another, in this order:
    remove "(" and ")", glue ".", "?" and "!" to the preceding word by
    dropping the single whitespace character in front of them, turn " , "
    into ", ", and finally collapse every whitespace run into one space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    # (pattern, replacement) pairs; order matters because later rules see the
    # output of earlier ones (e.g. whitespace is only squeezed at the very end).
    substitutions = (
        (r"\(", r""),
        (r"\)", r""),
        (r"\s\.", "."),
        (r" , ", ", "),
        (r"\s\?", "?"),
        (r"\s!", "!"),
        (r"\s+", " "),
    )
    cleaned = text
    for regex, replacement in substitutions : 
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned

In [4]:
# Parallel per-article lists for the training split (index i is the same article in each).
clean_article_list = []  # cleaned, lower-cased sentences of each article (list of lists)
clean_summary = []       # cleaned, lower-cased reference summary of each article
kunci_jawaban = []       # "extractive_summary" field of each file, stored unchanged ("kunci jawaban" = answer key)

train_dir = os.path.join(os.getcwd(), "liputan6_data", "canonical", "train")

for fileJSON in canonical_train : 
    # Read explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale's codec, which on Windows is not UTF-8 and can fail or garble text.
    with open(os.path.join(train_dir, fileJSON), "r", encoding = "utf-8") as f : 
        file = json.load(f)

    # Each sentence is joined with spaces (so it is an iterable of string tokens),
    # cleaned, and lower-cased.
    clean_article_list.append([
        preprocessingText(" ".join(sentence)).lower()
        for sentence in file["clean_article"]
    ])

    # Summary sentences are concatenated first (each followed by one space) and
    # cleaned as a single string.
    summary_text = "".join(" ".join(sentence) + " " for sentence in file["clean_summary"])
    clean_summary.append(preprocessingText(summary_text).lower())
    kunci_jawaban.append(file["extractive_summary"])

# One flat string per article: its cleaned sentences joined by a single space.
clean_article = [" ".join(article) for article in clean_article_list]


In [5]:
# Parallel per-article lists for the test split (index i is the same article in each).
clean_article_list_test = []  # cleaned, lower-cased sentences of each article (list of lists)
clean_summary_test = []       # cleaned, lower-cased reference summary of each article
kunci_jawaban_test = []       # "extractive_summary" field of each file, stored unchanged ("kunci jawaban" = answer key)

test_dir = os.path.join(os.getcwd(), "liputan6_data", "canonical", "test")

for fileJSON in canonical_test : 
    # Read explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale's codec, which on Windows is not UTF-8 and can fail or garble text.
    with open(os.path.join(test_dir, fileJSON), "r", encoding = "utf-8") as f : 
        file = json.load(f)

    # Each sentence is joined with spaces (so it is an iterable of string tokens),
    # cleaned, and lower-cased.
    clean_article_list_test.append([
        preprocessingText(" ".join(sentence)).lower()
        for sentence in file["clean_article"]
    ])

    # Summary sentences are concatenated first (each followed by one space) and
    # cleaned as a single string.
    summary_text = "".join(" ".join(sentence) + " " for sentence in file["clean_summary"])
    clean_summary_test.append(preprocessingText(summary_text).lower())
    kunci_jawaban_test.append(file["extractive_summary"])

# One flat string per article: its cleaned sentences joined by a single space.
clean_article_test = [" ".join(article) for article in clean_article_list_test]

In [6]:
# Hugging Face Hub identifier of the pretrained checkpoint that is loaded with
# from_pretrained for both the tokenizer and the model.
model_name = "panggi/t5-small-indonesian-summarization-cased"

In [7]:
# Load the tokenizer and the T5 model from the same checkpoint so their vocabularies match.
model_tokenizer = T5Tokenizer.from_pretrained(model_name)
model_base = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Wrap the parallel article/summary lists as datasets with two columns:
# "text" (full article string) and "summary" (reference summary string).
dataset_train = Dataset.from_dict({"text":clean_article, "summary":clean_summary})
dataset_test = Dataset.from_dict({"text":clean_article_test, "summary":clean_summary_test})

In [7]:
# Sub-sample each split to a fixed number of rows.
# A dedicated, seeded generator makes the chosen rows reproducible across runs, and
# the requested size is clamped to the split size so random.sample cannot raise
# ValueError when a split holds fewer rows than requested.
# NOTE(review): the recorded tokenization output reports 8000 training examples while
# this cell requests 1000 — confirm which size is intended.
sampler = random.Random(42)

range_train = list(range(0,dataset_train.shape[0]))
dataset_train = dataset_train.select(sampler.sample(range_train, k = min(1000, len(range_train))))

range_test = list(range(0,dataset_test.shape[0]))
dataset_test = dataset_test.select(sampler.sample(range_test, k = min(2000, len(range_test))))

In [8]:
# Bundle both splits so they can be tokenized together and addressed by name.
dataset_full = DatasetDict({"train":dataset_train,"test":dataset_test})

In [11]:
def tokenizerFn(batch) : 
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        batch: mapping with a "text" and a "summary" entry, each a list of strings
            (this is what a batched `Dataset.map` call passes in).

    Returns:
        The tokenizer output for the articles (padded/truncated to 384 tokens)
        with an extra "labels" entry holding the summary token ids
        (padded/truncated to 160 tokens), where every padding position is
        replaced by -100.
    """
    inputs = model_tokenizer(batch["text"],padding = "max_length", truncation = True, max_length = 384)
    summary = model_tokenizer(batch["summary"], padding = "max_length", truncation = True, max_length = 160)
    # The summaries are padded to a fixed length here, so the label rows contain
    # pad ids. -100 is the index the loss ignores; without this replacement the
    # model is also trained to predict the padding.
    pad_id = model_tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in summary["input_ids"]
    ]
    return inputs

In [12]:
# Apply tokenizerFn to every split, 128 rows per call.
dataset_tokenized = dataset_full.map(tokenizerFn, batched = True, batch_size = 128)

Map: 100%|██████████| 8000/8000 [00:14<00:00, 565.42 examples/s]
Map: 100%|██████████| 2000/2000 [00:02<00:00, 674.60 examples/s]


In [13]:
# Batch collator for seq2seq training, built from the model and its tokenizer.
dataCollator = DataCollatorForSeq2Seq(model = model_base, tokenizer = model_tokenizer)

In [14]:
# Training configuration passed to the Seq2SeqTrainer.
trainingArgs = Seq2SeqTrainingArguments(
    output_dir = "./abstractiveSummary-trainer",  # where the trainer writes its checkpoints
    do_train = True, 
    do_eval = True,
    learning_rate = 2e-5,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    fp16 = True,  # NOTE(review): mixed precision — presumably requires a CUDA device; confirm
    # Saving and evaluating on the same schedule ("epoch") lets the trainer reload
    # the best-scoring checkpoint once training ends.
    load_best_model_at_end=True,
    save_strategy="epoch",
    eval_strategy="epoch",
    # Same key that compute_fn returns; a higher score counts as better.
    metric_for_best_model = "rouge_calculation",
    greater_is_better=True,
    # Evaluate on generated text; 160 equals the max_length used for the labels in tokenizerFn.
    predict_with_generate=True,
    generation_max_length=160
)

In [15]:
def compute_fn(eval_pred) : 
    """Compute the model-selection metric from generated and reference token ids.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays. Positions
            holding -100 are treated as padding in both arrays.

    Returns:
        `{"rouge_calculation": score}` where `score` is the mean over all
        examples of the average of the ROUGE-1, ROUGE-2 and ROUGE-L F-measures.
        Returns 0.0 when there are no examples to score.
    """
    predictions, labels = eval_pred
    # Some model outputs arrive as a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple) : 
        predictions = predictions[0]

    # -100 is not a valid token id, so swap it for the pad id before decoding.
    pad_id = model_tokenizer.pad_token_id
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_predictions = model_tokenizer.batch_decode(predictions, skip_special_tokens = True)
    decoded_labels = model_tokenizer.batch_decode(labels, skip_special_tokens = True)

    rougeScorer = rouge_scorer.RougeScorer(rouge_types=["rouge1","rouge2","rougeL"])
    hasil_list = []  # per-example averaged score ("hasil" = result)
    for predicted_text, reference_text in zip(decoded_predictions, decoded_labels) : 
        # Score each pair once (all three variants come back together) and pass the
        # arguments in the scorer's (target, prediction) order. The F-measure is
        # symmetric in the two texts, so the values match the previous ordering.
        scores = rougeScorer.score(reference_text, predicted_text)
        hasil_list.append(np.mean([scores[name].fmeasure for name in ("rouge1", "rouge2", "rougeL")]))

    if not hasil_list : 
        return {"rouge_calculation": 0.0}
    return {"rouge_calculation": float(np.mean(hasil_list))}

In [16]:
# Wire the model, tokenized splits, collator and metric into the trainer.
# `processing_class` is the current keyword for handing over the tokenizer; the
# `tokenizer` keyword is deprecated and triggers a warning on this call.
# NOTE(review): evaluation (and therefore best-checkpoint selection) runs on the
# test split; the dev split is listed at the top of the notebook but never loaded
# in the visible cells — confirm whether it should be used here instead.
trainer = Seq2SeqTrainer(
    args = trainingArgs,
    model = model_base,
    processing_class = model_tokenizer,
    train_dataset = dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    compute_metrics = compute_fn,
    data_collator = dataCollator
)

  trainer = Seq2SeqTrainer(


In [None]:
# Fine-tune the model, then run one more evaluation pass on the trainer's eval_dataset.
trainer.train()
trainer.evaluate()

In [None]:
# NOTE(review): the "Testing and Evaluation" cells load a tokenizer and model from the
# "abstractiveModel" directory. This commented-out call is the only place in the visible
# notebook that would write that directory, so on a fresh run it must already exist — confirm.
# trainer.save_model("abstractiveModel")

## Testing and Evaluation

In [3]:
# Re-list the test folder, keeping only the first 50 entries returned by os.listdir.
# NOTE(review): this rebinds canonical_test, and no later visible cell reads it —
# confirm whether this cell is still needed.
canonical_test = os.listdir(os.path.join(os.path.abspath(os.getcwd()),"./liputan6_data/canonical/test"))[:50]

In [9]:
# Shared scorer for the evaluation loop: ROUGE-L, ROUGE-1 and ROUGE-2.
rougeScorer = rouge_scorer.RougeScorer(rouge_types=["rougeL","rouge1","rouge2"])

In [25]:
# Load the tokenizer and model from the local "abstractiveModel" directory.
# NOTE(review): presumably this is the fine-tuned model written by trainer.save_model — confirm.
model_tokenizer_saved = T5Tokenizer.from_pretrained("abstractiveModel")
model_base_saved = T5ForConditionalGeneration.from_pretrained("abstractiveModel")

In [None]:
def do_summarization(batch) : 
    """Generate summaries for a batch of articles with the saved T5 model.

    Args:
        batch: mapping with a "text" entry holding a list of article strings
            (this is what a batched `Dataset.map` call passes in).

    Returns:
        `{"summary_predicted": [...]}` with one decoded summary per article,
        special tokens removed.
    """
    inputs =  model_tokenizer_saved(batch["text"], padding = "max_length", truncation = True, max_length = 384, return_tensors = "pt")
    # The tokenizer returns CPU tensors; place them on whatever device the model
    # lives on so generation also works when the model has been moved to a GPU.
    inputs = inputs.to(model_base_saved.device)
    with torch.no_grad() : 
        generated_ids = model_base_saved.generate(**inputs, max_length = 160)
    summarized_text = model_tokenizer_saved.batch_decode(generated_ids, skip_special_tokens=True)
    return {"summary_predicted":summarized_text}

In [70]:
# Working handle on the (untokenized) test split. Despite the name, this statement
# only binds the existing object — it does not copy it.
datasetCopy = dataset_full["test"]

In [71]:
# Run do_summarization over the test split, 128 rows per call; the returned
# "summary_predicted" values are stored as a new column.
# NOTE(review): the recorded output of this cell contains KeysView dumps of the
# tokenizer output, which the current do_summarization does not print — the
# output looks stale; confirm and clear it.
datasetCopy = datasetCopy.map(do_summarization, batched = True, batch_size = 128)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,  290,   15,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])})


Map:   6%|▋         | 128/2000 [00:19<04:46,  6.53 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  13%|█▎        | 256/2000 [00:38<04:23,  6.61 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  19%|█▉        | 384/2000 [00:57<04:02,  6.67 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,   18,  170,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])})


Map:  26%|██▌       | 512/2000 [01:16<03:41,  6.72 examples/s]

KeysView({'input_ids': tensor([[ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ...,     0,     0,     0],
        ...,
        [ 5694,   323,     3,  ..., 10912,  6323,     1],
        [ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ...,     1,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0]])})


Map:  32%|███▏      | 640/2000 [01:35<03:22,  6.73 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  38%|███▊      | 768/2000 [01:54<03:03,  6.72 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  45%|████▍     | 896/2000 [02:13<02:44,  6.70 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,   18,   19,    1],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,   14, 2363,    1],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  51%|█████     | 1024/2000 [02:33<02:26,  6.67 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  58%|█████▊    | 1152/2000 [02:51<02:06,  6.73 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  64%|██████▍   | 1280/2000 [03:10<01:45,  6.81 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ..., 4274, 9133,    1],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    7,  695,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])})


Map:  70%|███████   | 1408/2000 [03:29<01:27,  6.80 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ..., 7902,   13,    1],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  77%|███████▋  | 1536/2000 [03:47<01:07,  6.84 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  83%|████████▎ | 1664/2000 [04:06<00:49,  6.85 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ..., 3239,   17,    1],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  90%|████████▉ | 1792/2000 [04:25<00:30,  6.82 examples/s]

KeysView({'input_ids': tensor([[5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        ...,
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0],
        [5694,  323,    3,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map:  96%|█████████▌| 1920/2000 [04:44<00:11,  6.75 examples/s]

KeysView({'input_ids': tensor([[ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ..., 16780,   330,     1],
        ...,
        [ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ...,     0,     0,     0],
        [ 5694,   323,     3,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


Map: 100%|██████████| 2000/2000 [04:58<00:00,  6.71 examples/s]


In [84]:
# Per-example F-measures of the T5 predictions against the reference summaries.
r1_list = []
r2_list = []
rl_list = []
for predicted, reference in zip(datasetCopy["summary_predicted"][:], datasetCopy["summary"][:]) : 
    # Score each pair once (all three variants come back together) and pass the
    # arguments in the scorer's (target, prediction) order. The F-measure is
    # symmetric in the two texts, so the reported numbers are unchanged.
    scores = rougeScorer.score(reference, predicted)
    r1_list.append(scores["rouge1"].fmeasure)
    r2_list.append(scores["rouge2"].fmeasure)
    rl_list.append(scores["rougeL"].fmeasure)

In [85]:
# Report the mean F-measure of each ROUGE variant over the scored examples
# ("Hasil ROUGE" = "ROUGE results").
print("Hasil ROUGE: ")
print(f"Rouge1 : {mean(r1_list)}")
print(f"Rouge2 : {mean(r2_list)}")
print(f"RougeL : {mean(rl_list)}")

Hasil ROUGE: 
Rouge1 : 0.3792692577633349
Rouge2 : 0.20946316813588345
RougeL : 0.31032809118845567


## Abstractive Summarization : Encoder + Decoder

In [1]:
from transformers import EncoderDecoderModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Hub identifier of the encoder-decoder summarization checkpoint.
# NOTE(review): both names hold the same id, and decoder_id is not read by any
# visible cell — confirm whether it is needed.
encoder_id = "cahya/bert2gpt-indonesian-summarization"
decoder_id = "cahya/bert2gpt-indonesian-summarization"

In [27]:
# Rebinds model_tokenizer (the T5 tokenizer in the first section) to the tokenizer of
# the encoder-decoder checkpoint.
# NOTE(review): the recorded warning for this cell advises against AutoTokenizer when
# encoder and decoder config classes differ and recommends the model-specific
# tokenizer class — confirm the loaded tokenizer is the intended one.
model_tokenizer = AutoTokenizer.from_pretrained(encoder_id)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The encoder model config class: <class 'transformers.models.bert.configuration_bert.BertConfig'> is different from the decoder model config class: <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'>. It is not recommended to use the `AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder specific tokenizer classes.


In [30]:
# Rebinds model_base (the T5 model in the first section) to the pretrained
# encoder-decoder model.
model_base = EncoderDecoderModel.from_pretrained(encoder_id)

In [31]:
# Map the tokenizer's special tokens onto the generation-related config fields:
# decoding starts with [CLS], pads with the pad token, and stops at [SEP].
model_base.config.decoder_start_token_id = model_tokenizer.cls_token_id
model_base.config.pad_token_id = model_tokenizer.pad_token_id
model_base.config.eos_token_id = model_tokenizer.sep_token_id
# NOTE(review): unlike the three lines above, this sets an attribute on the model
# object itself rather than on model_base.config — confirm whether
# model_base.config.vocab_size was intended and whether this line has any effect.
model_base.vocab_size = model_tokenizer.vocab_size

In [37]:
def do_summarization(batch) : 
    """Generate summaries for a batch of articles with the encoder-decoder model.

    This definition replaces the earlier `do_summarization` that used the saved
    T5 model; it reads the current `model_tokenizer` and `model_base` globals.

    Args:
        batch: mapping with a "text" entry holding a list of article strings
            (this is what a batched `Dataset.map` call passes in).

    Returns:
        `{"summary_predicted": [...]}` with one decoded summary per article,
        special tokens removed.
    """
    inputs =  model_tokenizer(batch["text"], padding = "max_length", truncation = True, max_length = 384, return_tensors = "pt")
    # The tokenizer returns CPU tensors; place them on whatever device the model
    # lives on so generation also works when the model has been moved to a GPU.
    inputs = inputs.to(model_base.device)
    with torch.no_grad() : 
        generated_ids = model_base.generate(**inputs, max_length = 160)
    summarized_text = model_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return {"summary_predicted":summarized_text}

In [50]:
# Start again from the untokenized test split (this binds the existing object; no copy is made).
datasetCopy = dataset_full["test"]

In [51]:
# Keep only the first 100 rows, so the scores below are computed on 100 examples
# rather than on the full test split.
datasetCopy = datasetCopy.select(range(100))

In [53]:
# Run do_summarization over the selected rows, 32 per call; the returned
# "summary_predicted" values are stored as a new column.
datasetCopy = datasetCopy.map(do_summarization, batched = True, batch_size = 32)

Map: 100%|██████████| 100/100 [02:51<00:00,  1.72s/ examples]


In [59]:
# Scorer for the evaluation loop: ROUGE-L, ROUGE-1 and ROUGE-2.
rougeScorer = rouge_scorer.RougeScorer(rouge_types=["rougeL","rouge1","rouge2"])

In [60]:
# Per-example F-measures of the encoder-decoder predictions against the reference summaries.
r1_list = []
r2_list = []
rl_list = []
for predicted, reference in zip(datasetCopy["summary_predicted"][:], datasetCopy["summary"][:]) : 
    # Score each pair once (all three variants come back together) and pass the
    # arguments in the scorer's (target, prediction) order. The F-measure is
    # symmetric in the two texts, so the reported numbers are unchanged.
    scores = rougeScorer.score(reference, predicted)
    r1_list.append(scores["rouge1"].fmeasure)
    r2_list.append(scores["rouge2"].fmeasure)
    rl_list.append(scores["rougeL"].fmeasure)

In [61]:
# Report the mean F-measure of each ROUGE variant over the scored examples
# ("Hasil ROUGE" = "ROUGE results").
print("Hasil ROUGE: ")
print(f"Rouge1 : {mean(r1_list)}")
print(f"Rouge2 : {mean(r2_list)}")
print(f"RougeL : {mean(rl_list)}")

Hasil ROUGE: 
Rouge1 : 0.4259107944700765
Rouge2 : 0.2486188432290399
RougeL : 0.35874525263007095
