In [None]:
!pip install -q transformers datasets evaluate accelerate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

In [None]:
import evaluate
import numpy as np
import torch
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


'cpu'

In [None]:
raw_datasets = load_dataset("cnn_dailymail", "3.0.0")

# Tiny, reproducibly shuffled subsets for quick training.
# `small_val` (validation split) is the subset the tokenization cell maps into
# the evaluation dataset; `small_test` stays available as a held-out subset and
# is not consumed by the cells visible in this notebook.
small_train = raw_datasets["train"].shuffle(seed=42).select(range(1000))
small_val = raw_datasets["validation"].shuffle(seed=42).select(range(200))
small_test = raw_datasets["test"].shuffle(seed=42).select(range(200))
small_train[0]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

In [None]:
# Checkpoint name shared by the tokenizer and the model.
model_name = "t5-small"

# Load the tokenizer, then the seq2seq weights, and move the model onto the
# device chosen earlier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
max_input_length = 512    # article length
max_target_length = 128   # summary length
task_prefix = "summarize: "  # same prefix that summarize() prepends at inference


def preprocess_function(examples):
    """Tokenize articles and their reference summaries for seq2seq training.

    Args:
        examples: mapping with an "article" and a "highlights" entry. Each is
            either a list of strings (the `map(..., batched=True)` case) or a
            single string.

    Returns:
        The tokenizer output for the prefixed, truncated articles, with an
        extra "labels" entry holding the token ids of the truncated summaries.
    """
    articles = examples["article"]

    # Prepend the task prefix so training inputs look like the inputs that
    # summarize() builds at inference time.
    if isinstance(articles, str):
        prefixed_articles = task_prefix + articles
    else:
        prefixed_articles = [task_prefix + article for article in articles]

    # Tokenize the article (input text)
    model_inputs = tokenizer(
        prefixed_articles,
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize the summary (target text)
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )

    # Attach labels to inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


In [None]:
# Tokenize each subset in batches, then expose the result as PyTorch tensors
# for the Trainer.
tokenized_train = small_train.map(preprocess_function, batched=True).with_format("torch")
tokenized_val = small_val.map(preprocess_function, batched=True).with_format("torch")


In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


In [None]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair of token-id arrays. Positions
            equal to -100 are treated as padding.

    Returns:
        dict mapping each ROUGE key to its score as a percentage (0-100),
        rounded to two decimals.
    """
    preds, labels = eval_pred

    # Predictions can arrive wrapped in a tuple; the first element is used.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id, so swap it for the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computing ROUGE
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Converting to percentages (0–100)
    return {k: round(v * 100, 2) for k, v in result.items()}


In [None]:
batch_size = 4  # smaller because seq2seq uses more memory

training_args = TrainingArguments(
    output_dir="t5-summarizer",
    evaluation_strategy="epoch",   # evaluating every epoch
    save_strategy="epoch",         # saving checkpoints every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    report_to="none",
    predict_with_generate=True,
    generation_max_length=128,     # max summary length
    generation_num_beams=4,        # BEAM SEARCH
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
# Evaluate with the trainer (no dataset argument is passed here) and display
# the returned metrics as the cell output.
metrics = trainer.evaluate()
metrics


In [None]:
def summarize(text, max_new_tokens=80, num_beams=4):
    """Generate a summary of `text` with the notebook's model and tokenizer.

    Args:
        text: the article to summarize, as a single string.
        max_new_tokens: upper bound on the number of generated tokens.
        num_beams: beam width used for beam search.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    # adding T5 task prefix (T5 is text-to-text)
    input_text = "summarize: " + text

    # Tokenize, truncating long articles to 512 tokens. The tensors go to the
    # device the model currently lives on, which can differ from the global
    # `device` string if the model has been moved since it was loaded.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)

    # generating summary using beam search; no gradients needed for inference
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True,
        )

    # decoding the first (only) generated sequence to a string
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
