
# Project 03 — **Task #3: Encoder–Decoder (T5/BART) Summarization**



In [14]:

!pip -q install --upgrade transformers datasets evaluate rouge_score sentencepiece accelerate gradio


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import torch
from datetime import datetime

# ------------------ Configuration ------------------
MODEL_NAME = "t5-small"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128
VAL_MAX_TARGET_LENGTH = 128
TRUNCATION = True


TRAIN_SAMPLES = 20000
VAL_SAMPLES = 1000

BATCH_SIZE = 2
EVAL_STEPS = 500
LOGGING_STEPS = 100
NUM_EPOCHS = 3
USE_FP16 = torch.cuda.is_available()
SEED = 42

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Model:", MODEL_NAME)

Torch: 2.8.0+cu126
CUDA available: True
Model: t5-small


In [16]:

dataset = load_dataset("cnn_dailymail", "3.0.0")

if TRAIN_SAMPLES is not None:
    dataset["train"] = dataset["train"].select(range(min(TRAIN_SAMPLES, len(dataset["train"]))))
if VAL_SAMPLES is not None:
    dataset["validation"] = dataset["validation"].select(range(min(VAL_SAMPLES, len(dataset["validation"]))))

dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [17]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
TASK_PREFIX = "summarize: "

def preprocess_function(batch):
    inputs = [TASK_PREFIX + doc for doc in batch["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=TRUNCATION
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["highlights"],
            max_length=MAX_TARGET_LENGTH,
            truncation=TRUNCATION
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_datasets


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    preds = preds.astype(np.int32)
    preds[preds >= tokenizer.vocab_size] = tokenizer.pad_token_id
    preds[preds < 0] = tokenizer.pad_token_id

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    result = {k: round(v * 100, 2) for k, v in result.items()}  # percentages
    return result

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [19]:
from datetime import datetime
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

run_name = f"t5_summarization_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
print(f"Run name: {run_name}")

args = Seq2SeqTrainingArguments(
    output_dir="./t5-cnn-summarization",
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    save_steps=EVAL_STEPS,
    save_total_limit=2,
    learning_rate=2e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    fp16=USE_FP16,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    seed=SEED
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer

Run name: t5_summarization_2025-11-12_23-23-57


  trainer = Seq2SeqTrainer(


<transformers.trainer_seq2seq.Seq2SeqTrainer at 0x7ec26012ec30>

In [20]:

train_result = trainer.train()
train_result


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,2.0958,2.161655,24.74,9.45,20.01,20.0
1000,2.1004,2.140749,24.19,8.96,19.53,19.51
1500,2.1001,2.129545,24.31,8.99,19.65,19.64
2000,2.0449,2.143523,24.78,9.32,19.97,19.97
2500,2.0552,2.131845,24.78,9.54,20.04,20.04
3000,1.9485,2.14481,24.29,9.1,19.76,19.74
3500,1.9379,2.13675,24.45,9.09,19.86,19.86


KeyboardInterrupt: 

In [None]:
metrics = trainer.evaluate(max_length=VAL_MAX_TARGET_LENGTH, num_beams=4)
metrics

In [None]:

def show_examples(n=3, max_source_chars=600):
    samples = dataset["validation"].select(range(min(n, len(dataset["validation"]))))
    inputs = [TASK_PREFIX + art for art in samples["article"]]
    inputs_tokenized = tokenizer(inputs, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LENGTH).to(model.device)
    with torch.no_grad():
        generated = model.generate(
            **inputs_tokenized,
            max_length=MAX_TARGET_LENGTH,
            num_beams=4
        )
    preds = tokenizer.batch_decode(generated, skip_special_tokens=True)

    for i, (art, gold, pred) in enumerate(zip(samples["article"], samples["highlights"], preds), 1):
        print("="*120)
        print(f"[Example {i}]")
        print("- Article (truncated) -")
        print(art[:max_source_chars] + ("..." if len(art) > max_source_chars else ""))
        print("\n- Reference Summary -")
        print(gold.strip())
        print("\n- Model Summary -")
        print(pred.strip())

show_examples(n=3)


In [21]:

trainer.save_model("./t5-cnn-summarization/best")
tokenizer.save_pretrained("./t5-cnn-summarization/best")
print("Saved to ./t5-cnn-summarization/best")


Saved to ./t5-cnn-summarization/best


In [22]:
import gradio as gr

pipe_model = trainer.model
pipe_tokenizer = tokenizer

def summarize(text):
    if not text.strip():
        return ""
    inp = TASK_PREFIX + text
    toks = pipe_tokenizer(inp, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LENGTH).to(pipe_model.device)
    with torch.no_grad():
        out_ids = pipe_model.generate(
            **toks,
            max_length=MAX_TARGET_LENGTH,
            num_beams=4
        )
    return pipe_tokenizer.decode(out_ids[0], skip_special_tokens=True)

demo = gr.Interface(fn=summarize, inputs=gr.Textbox(lines=5, autoscroll=False), outputs=gr.Textbox(lines=10, autoscroll=False), title="T5 Summarizer (Task #3)")
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ef4b2e2a005d055382.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [23]:
import os

output_zip_path = "best_model.zip"
folder_to_zip = "./t5-cnn-summarization/best"

# Check if the folder exists before zipping
if os.path.exists(folder_to_zip):
    # Use shutil to make a zip archive
    # The base_name is the path without the extension, so the output will be best_model.zip
    # The format is 'zip'
    # The root_dir is the directory where the folder_to_zip is located
    # The base_dir is the folder_to_zip itself
    import shutil
    shutil.make_archive(os.path.splitext(output_zip_path)[0], 'zip', os.path.dirname(folder_to_zip), os.path.basename(folder_to_zip))
    print(f"Successfully created {output_zip_path} containing the '{folder_to_zip}' folder.")
    print("You can now download this file from the Colab file browser (left-hand sidebar).")
else:
    print(f"Error: The folder '{folder_to_zip}' does not exist. Please ensure the model was saved correctly.")

Successfully created best_model.zip containing the './t5-cnn-summarization/best' folder.
You can now download this file from the Colab file browser (left-hand sidebar).
