<a href="https://colab.research.google.com/github/MammadovN/Machine_Learning/blob/main/projects/04_natural_language_processing/text-summarization/text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate rouge_score

In [None]:
import os

import evaluate
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Load the "3.0.0" configuration of the cnn_dailymail dataset; the cells below
# read its "article" and "highlights" columns.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Tokenizer matching the t5-small checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

def preprocess(batch):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        batch: Mapping with an ``"article"`` list (source documents) and a
            ``"highlights"`` list (reference summaries) of equal length.

    Returns:
        The tokenizer output for the prefixed articles (truncated/padded to
        512 tokens) with an added ``"labels"`` entry holding the summary
        token ids (truncated/padded to 150 tokens). Padding positions in
        ``"labels"`` are set to -100 so they do not contribute to the loss.
    """
    # T5 selects the task via a text prefix on the input.
    inputs = ["summarize: " + doc for doc in batch["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=batch["highlights"],
        max_length=150,
        truncation=True,
        padding="max_length"
    )

    # Mask padding in the labels: -100 is the ignore index of the loss, so
    # the model is not trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split in batches, dropping the raw columns so only the
# fields returned by preprocess remain, then shuffle with a fixed seed.
raw_columns = dataset["train"].column_names
tokenized = dataset.map(preprocess, batched=True, remove_columns=raw_columns)
tokenized = tokenized.shuffle(seed=42)

In [None]:
# ROUGE metric object used by compute_metrics to score generated summaries.
rouge = evaluate.load(path="rouge")

def compute_metrics(eval_pred):
    """Decode predictions and references, then score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id sequences.
            ``predictions`` may also be a tuple whose first element holds
            the generated token ids.

    Returns:
        Dict mapping each ROUGE metric name to its score scaled by 100.
    """
    preds, labels = eval_pred
    # Some models return extra outputs alongside the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(sequences):
        # -100 marks ignored/padded positions and cannot be decoded, so swap
        # it for the pad token, which skip_special_tokens then removes.
        return [
            [int(token) if token != -100 else pad_id for token in sequence]
            for sequence in sequences
        ]

    # Predictions need the same clean-up as labels: generated sequences are
    # padded with -100 when batches of different lengths are concatenated.
    decoded_preds = tokenizer.batch_decode(
        _restore_padding(preds), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _restore_padding(labels), skip_special_tokens=True
    )

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: v * 100 for k, v in result.items()}

In [None]:
# Pretrained t5-small checkpoint to fine-tune for summarization.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Training configuration. Effective train batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 8.
training_args = Seq2SeqTrainingArguments(
    output_dir="model_output",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    report_to="none",         # disable WandB and other reporting integrations
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    # Evaluate with generate() so compute_metrics receives token ids.
    predict_with_generate=True,
    # Without an explicit limit, generation during evaluation falls back to
    # the model's default (short) max length, so ROUGE would be computed on
    # truncated summaries. 150 matches the label length used in preprocess.
    generation_max_length=150,
    gradient_accumulation_steps=2,
    # NOTE(review): T5 checkpoints are reported to be numerically unstable
    # under fp16 -- confirm the training loss stays finite on GPU.
    fp16=torch.cuda.is_available()
)

# The tokenizer is passed so it is saved with checkpoints and used for
# batching; compute_metrics produces the ROUGE scores during evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# Fine-tune the model on tokenized["train"] with the settings in training_args.
# Left as the cell's last expression so the returned training output is displayed.
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset (tokenized["validation"]); the returned
# dict of metrics includes the values produced by compute_metrics.
results = trainer.evaluate()
print("Evaluation Results:", results)

In [None]:
def summarize(text, max_length=150, min_length=40, length_penalty=2.0, num_beams=4):
    """Generate a summary of ``text`` with the current ``model``.

    Args:
        text: Document to summarize; it is prefixed with ``"summarize: "``
            and truncated to 512 tokens before generation.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        length_penalty: Forwarded to ``model.generate`` as ``length_penalty``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    prompt = "summarize: " + text
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    # Move the encoded tensors to the device the model lives on.
    encoded = encoded.to(model.device)

    generation_options = {
        "max_length": max_length,
        "min_length": min_length,
        "length_penalty": length_penalty,
        "num_beams": num_beams,
    }
    generated = model.generate(**encoded, **generation_options)

    # Only one input was given, so the first sequence is the summary.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Demo: compare the reference summary of the first test example with the
# summary generated by the model.
sample = dataset["test"][0]
article_text = sample["article"]
reference_summary = sample["highlights"]

print("\n--- DEMO ---")
print("Article:\n", article_text)
print("\nTrue Summary:\n", reference_summary)

# Generate only after the inputs are printed so they stay visible even if
# generation fails.
predicted_summary = summarize(article_text)
print("\nPredicted Summary:\n", predicted_summary)