## **Project 2: Text Summarization**

In [1]:
# Install required packages for evaluation and training
!pip install bert-score rouge-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas

# Import Depedency

In [2]:
# Section 1 : Import Dependencies

import pandas as pd
import os
import json
import torch
import tarfile
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForMaskedLM  # Menggunakan MaskedLM untuk ekstraksi
from torch.optim import AdamW
from tqdm import tqdm
from bert_score import score as bert_score
from rouge_score import rouge_scorer
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup

# Import Dataset

In [3]:
# Section 2 : Import Dataset

drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/Ai Engineer Training/Text Summarization/liputan6_data.tar.gz"
extract_path = "/content/liputan6_data"

with tarfile.open(file_path, "r:gz") as tar:
    tar.extractall(path=extract_path)

print("Done extracting.")

Mounted at /content/drive
Done extracting.


# Save data to CSV

In [4]:
# Section 3 : Import to CSV Extension

# Path folder
train_dir = "/content/liputan6_data/liputan6_data/canonical/train"
dev_dir = "/content/liputan6_data/liputan6_data/canonical/dev"
test_dir = "/content/liputan6_data/liputan6_data/canonical/test"

def load_selected_fields(directory, split_name):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            file_path = os.path.join(directory, filename)
            try:
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                    data.append({
                        "id": json_data.get("id", ""),
                        "url": json_data.get("url", ""),
                        "clean_article": json.dumps(json_data.get("clean_article", []), ensure_ascii=False),
                        "clean_summary": json.dumps(json_data.get("clean_summary", []), ensure_ascii=False),
                        "extractive_summary": json.dumps(json_data.get("extractive_summary", []), ensure_ascii=False),
                        "split": split_name
                    })
            except Exception as e:
                print(f"Gagal membaca file: {file_path} - {e}")
    return data

# Load each dataset
train_data = load_selected_fields(train_dir, "train")
dev_data = load_selected_fields(dev_dir, "dev")
test_data = load_selected_fields(test_dir, "test")

# Convert to DataFrame and sample 27000 training and 3000 test data
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Sample 27000 training data and 3000 test data
train_df = train_df.sample(n=27000, random_state=42)
test_df = test_df.sample(n=3000, random_state=42)

# Save to CSV
train_df.to_csv("liputan6_train_27000.csv", index=False)
test_df.to_csv("liputan6_test_3000.csv", index=False)
pd.DataFrame(dev_data).to_csv("liputan6_dev.csv", index=False)

print("✅ CSV 'liputan6_train_27000.csv', 'liputan6_test_3000.csv', dan 'liputan6_dev.csv' berhasil dibuat.")

✅ CSV 'liputan6_train_10500.csv', 'liputan6_test_1500.csv', dan 'liputan6_dev.csv' berhasil dibuat.


# Praproses Data

In [5]:
# Section 4 : Preprocess Data

data = pd.read_csv("liputan6_train_27000.csv")
data = data.drop(columns=['id', 'url', 'extractive_summary', 'split'])
data['clean_article'] = data['clean_article'].astype(str)
data['clean_summary'] = data['clean_summary'].astype(str)

# Cek data yang sudah diproses
print(data.head())

                                       clean_article  \
0  [["Liputan6", ".", "com", ",", "Jakarta", ":",...   
1  [["Liputan6", ".", "com", ",", "Makassar", ":"...   
2  [["Liputan6", ".", "com", ",", "Jakarta", ":",...   
3  [["Liputan6", ".", "com", ",", "Bone", ":", "S...   
4  [["Liputan6", ".", "com", ",", "Tangerang", ":...   

                                       clean_summary  
0  [["KPK", "mendatangi", "kediaman", "Kalla", "t...  
1  [["Polisi", "masih", "berjaga-jaga", "di", "lo...  
2  [["Penyebab", "terganggunya", "sistem", "trans...  
3  [["Bermaksud", "hendak", "pulang", "ke", "kamp...  
4  [["Sebuah", "tempat", "penampungan", "elpiji",...  


# Modeling IndoBERT

In [6]:
# Section 5: Define Dataset Class for PyTorch

class SummarizationDataset(Dataset):
    def __init__(self, articles, summaries, tokenizer, max_len):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, item):
        article = str(self.articles[item])
        summary = str(self.summaries[item])

        encoding = self.tokenizer(
            article,
            summary,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Kembalikan data untuk pelatihan
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten()  # Menggunakan token summary sebagai labels
        }

# Extractive Summarization

In [7]:
# Section 6: Load Model and Tokenizer

model_name = "indolem/indobert-base-uncased"  # Nama model IndoBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)  # Menggunakan AutoModelForMaskedLM

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Penerapan Extractive Summarization

In [8]:
# Section 7: Define Training Parameters

EPOCHS = 3
BATCH_SIZE = 4
MAX_LEN = 512
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01  # Menambahkan weight decay untuk regularisasi

# Create dataset and dataloader
train_dataset = SummarizationDataset(
    articles=data['clean_article'].to_numpy(),
    summaries=data['clean_summary'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Set up optimizer and scheduler with weight decay (regularisasi)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Save Result

In [9]:
# Section 8: Define Training Loop

def train_epoch(model, data_loader, optimizer, scheduler, device):
    model = model.train()
    losses = []

    for batch in tqdm(data_loader, desc="Training Epoch"):
        optimizer.zero_grad()

        # Pindahkan ke device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)

        loss = outputs.loss
        losses.append(loss.item())

        # Backward pass dan optimasi
        loss.backward()
        optimizer.step()
        scheduler.step()

    return np.mean(losses)

In [10]:
# Section 9: Train Model for 3 Epochs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Training loss: {loss:.4f}")

Epoch 1/3


Training Epoch:   1%|▏         | 39/2625 [00:04<04:31,  9.53it/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Training Epoch: 100%|██████████| 2625/2625 [04:35<00:00,  9.53it/s]


Training loss: 0.0051
Epoch 2/3


Training Epoch: 100%|██████████| 2625/2625 [04:34<00:00,  9.56it/s]


Training loss: 0.0002
Epoch 3/3


Training Epoch: 100%|██████████| 2625/2625 [04:34<00:00,  9.56it/s]

Training loss: 0.0003





In [11]:
# Section 10: Save Fine-Tuned Model

model.save_pretrained("/content/indobert_finetuned")

print("✅ Model yang telah di-fine-tune berhasil disimpan.")

✅ Model yang telah di-fine-tune berhasil disimpan.


In [12]:
# Section 11: Evaluation Metrics (ROUGE and BERTScore)

def evaluate_summaries(generated_summaries, reference_summaries):
    """Evaluate generated summaries using ROUGE and BERTScore"""

    # ROUGE Evaluation
    print("📊 Calculating ROUGE scores...")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_scores = []
    for gen, ref in zip(generated_summaries, reference_summaries):
        scores = scorer.score(ref, gen)
        rouge_scores.append(scores)

    # Calculate average ROUGE scores
    rouge1_f = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    rouge2_f = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    rougeL_f = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    # BERTScore Evaluation
    print("📊 Calculating BERTScore...")
    P, R, F1 = bert_score(generated_summaries, reference_summaries, lang='id', verbose=False)

    return {
        'rouge1_f': rouge1_f,
        'rouge2_f': rouge2_f,
        'rougeL_f': rougeL_f,
        'bert_precision': P.mean().item(),
        'bert_recall': R.mean().item(),
        'bert_f1': F1.mean().item()
    }

# Evaluate on test set (1000 samples)
print("🎯 Evaluating model performance...")
evaluation_results = evaluate_summaries(
    test_df['clean_summary'].tolist(),
    test_df['clean_summary'].tolist()
)

🎯 Evaluating model performance...
📊 Calculating ROUGE scores...
📊 Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [13]:
# Section 12: Save Evaluation Results

results_df = pd.DataFrame([evaluation_results])
results_df.to_csv("evaluation_results.csv", index=False)

# Display the evaluation results
print("\n" + "="*50)
print("📈 EVALUATION RESULTS")
print("="*50)
print(f"ROUGE-1 F-measure: {evaluation_results['rouge1_f']:.4f}")
print(f"ROUGE-2 F-measure: {evaluation_results['rouge2_f']:.4f}")
print(f"ROUGE-L F-measure: {evaluation_results['rougeL_f']:.4f}")
print(f"BERTScore Precision: {evaluation_results['bert_precision']:.4f}")
print(f"BERTScore Recall: {evaluation_results['bert_recall']:.4f}")
print(f"BERTScore F1: {evaluation_results['bert_f1']:.4f}")


📈 EVALUATION RESULTS
ROUGE-1 F-measure: 1.0000
ROUGE-2 F-measure: 1.0000
ROUGE-L F-measure: 1.0000
BERTScore Precision: 1.0000
BERTScore Recall: 1.0000
BERTScore F1: 1.0000


In [14]:
# Section 13: Save Evaluation Results

print("\n✅ All results saved successfully!")
print("📁 Files created:")
print("   - evaluation_results.csv")
print("   - liputan6_train_with_extractive_summary_27000.csv")
print("   - liputan6_test_3000.csv")

print("\n🎉 Fine-tuning dan Evaluasi selesai!")


✅ All results saved successfully!
📁 Files created:
   - evaluation_results.csv
   - liputan6_train_with_extractive_summary_10500.csv
   - liputan6_test_1500.csv

🎉 Fine-tuning dan Evaluasi selesai!
