## PDF Summarization

In [None]:
!pip install transformers datasets accelerate -U
!pip install evaluate rouge-score

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.1 pya

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import numpy as np

# Settings shared by the preprocessing and inference cells.
PREFIX = "summarize: "    # task prefix prepended to every source text
MAX_INPUT_LENGTH = 512    # token cap applied to the source text
MAX_TARGET_LENGTH = 128   # token cap applied to the reference summary

# Pretrained checkpoint that the notebook fine-tunes.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Dialogue/summary pairs with train, validation and test splits.
raw_datasets = load_dataset("knkarthick/samsum")
print("Dataset successfully loaded!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Dataset successfully loaded!


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of dialogues and their reference summaries.

    Args:
        examples: Batch mapping with a "dialogue" list (source texts) and a
            "summary" list (target texts), as supplied by ``Dataset.map``
            when called with ``batched=True``.

    Returns:
        The tokenizer output for the prefixed dialogues, with an added
        "labels" entry holding the token ids of the summaries.
    """
    inputs = [PREFIX + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager. The targets are tokenized in
    # a separate call so they keep their own (shorter) length cap.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Take a small, reproducibly shuffled slice of the train and validation
# splits so that fine-tuning finishes quickly.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(subset_size))
    for split_name, subset_size in (("train", 500), ("validation", 50))
)
print(f"Using {len(small_train_dataset)} samples for training.")

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Using 500 samples for training.


In [None]:
# ROUGE scorer used by `compute_metrics` at every evaluation pass.
metric = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        Dict of the ROUGE scores plus ``gen_len`` (mean number of non-pad
        tokens per prediction), each rounded to four decimals.
    """
    generated_ids, reference_ids = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 marks positions to ignore; swap in the pad id before decoding.
    generated_ids = np.where(generated_ids != -100, generated_ids, pad_id)
    reference_ids = np.where(reference_ids != -100, reference_ids, pad_id)

    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Length of each generated sequence, not counting padding.
    non_pad_counts = [np.count_nonzero(row != pad_id) for row in generated_ids]
    scores["gen_len"] = np.mean(non_pad_counts)

    rounded = {}
    for score_name, score_value in scores.items():
        rounded[score_name] = round(score_value, 4)
    return rounded

import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarization_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # fp16 mixed precision is only valid on a GPU; requesting it on a
    # CPU-only runtime makes argument construction fail, so enable it
    # only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    report_to="none",
    # Evaluate with generate() so that ROUGE is computed on generated text.
    predict_with_generate=True,
)

In [None]:
# Collator that assembles tokenized examples into batches for the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument; passing `tokenizer=` triggers a deprecation warning.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n" + "="*50)
print("  Starting T5 Summarization Fine-Tuning...")
print("  Watch the Loss and ROUGE metrics update below.")
print("="*50 + "\n")

trainer.train()

# Persist the model and tokenizer before announcing completion, so the
# "now saved" message below is only printed once the files exist.
trainer.save_model("./final_t5_summarizer")
tokenizer.save_pretrained("./final_t5_summarizer")

print("\n" + "="*50)
print("Fine-Tuning Complete! Model is now saved.")
print("="*50)

  trainer = Seq2SeqTrainer(



  Starting T5 Summarization Fine-Tuning...
  Watch the Loss and ROUGE metrics update below.



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.964162,0.3139,0.1172,0.2612,0.2619,17.84
2,1.993700,1.93141,0.335,0.1358,0.2794,0.2792,17.7
3,1.993700,1.92649,0.342,0.1451,0.2849,0.2845,17.92


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Fine-Tuning Complete! Model is now saved.


('./final_t5_summarizer/tokenizer_config.json',
 './final_t5_summarizer/special_tokens_map.json',
 './final_t5_summarizer/spiece.model',
 './final_t5_summarizer/added_tokens.json',
 './final_t5_summarizer/tokenizer.json')

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.6


In [None]:
from google.colab import files
import fitz

print("Please upload your PDF file now:")
uploaded = files.upload()

# The upload result is keyed by file name; only the first entry is used.
if not uploaded:
    print("No file uploaded. Please re-run the cell and upload a PDF.")
    PDF_FILE_NAME = None
else:
    PDF_FILE_NAME = next(iter(uploaded))
    print(f"File '{PDF_FILE_NAME}' detected and uploaded.")

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF to read. A falsy value (``None`` or
            ``""``) short-circuits to ``None``.

    Returns:
        The page texts joined by newlines with surrounding whitespace
        stripped, or ``None`` when no path was given or the file could not
        be read (the error is printed rather than raised).
    """
    if not pdf_path:
        return None
    try:
        # The context manager closes the document even when extracting a
        # page raises, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            pages = [page.get_text() for page in doc]
    except Exception as e:
        # Best-effort by design: report the problem and let the caller
        # handle the missing text.
        print(f"Error reading PDF: {e}")
        return None
    return "\n".join(pages).strip()

# Extract only when a file was uploaded; otherwise there is no text to use.
long_text = extract_text_from_pdf(PDF_FILE_NAME) if PDF_FILE_NAME else None

if PDF_FILE_NAME:
    if not long_text:
        print("Could not extract text. Check file name and format.")
    else:
        print(f"Successfully extracted {len(long_text)} characters.")

Please upload your PDF file now:


Saving 9. Introduction to Economics and Demand_UHU005.pdf to 9. Introduction to Economics and Demand_UHU005.pdf

✅ File '9. Introduction to Economics and Demand_UHU005.pdf' detected and uploaded.
Successfully extracted 4633 characters.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_PATH = "./final_t5_summarizer"
# Use the GPU when there is one and fall back to CPU otherwise. A
# hard-coded "cuda" fails on CPU-only runtimes, and that failure would be
# reported below as a missing fine-tuned model.
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
    PREFIX = "summarize: "
    MAX_INPUT_LENGTH = 512
    print(f"Model '{MODEL_PATH}' loaded successfully.")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you ran the fine-tuning cell and it completed successfully.")
    # Downstream cells check for None to skip summarization.
    model = None


✅ Model './final_t5_summarizer' loaded successfully.


In [None]:
def generate_summary(text, model, tokenizer, max_input=MAX_INPUT_LENGTH):
    """Summarize ``text`` with the given seq2seq model using beam search.

    The input is truncated to ``max_input`` tokens, so only the beginning
    of a long document contributes to the summary.

    Args:
        text: Source text to summarize.
        model: Loaded seq2seq model, or a falsy value if loading failed.
        tokenizer: Tokenizer matching ``model``.
        max_input: Maximum number of input tokens kept after truncation.

    Returns:
        The decoded summary string, or a fixed explanatory message when
        ``model`` or ``text`` is missing.
    """
    if not model or not text:
        return "Model not loaded or no text extracted."

    inputs = tokenizer(
        PREFIX + text,
        return_tensors="pt",
        max_length=max_input,
        truncation=True,
    ).to(model.device)

    # Unpack the whole encoding so that the attention mask reaches
    # generate() together with the input ids.
    summary_ids = model.generate(
        **inputs,
        num_beams=4,
        max_length=150,
        min_length=30,
        early_stopping=True,
    )

    # One input was encoded, so take the first returned sequence explicitly
    # rather than relying on squeeze() to drop the batch axis.
    return tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Handle the two failure cases first, then summarize.
if not model:
    print("\nCannot run summarization: Model failed to load.")
elif not long_text:
    print("\nCannot run summarization: No text was successfully extracted from the PDF.")
else:
    final_summary = generate_summary(long_text, model, tokenizer)
    rule = "="*70
    print("\n" + rule)
    print(f"       SUMMARY FOR THE DOCUMENT: {PDF_FILE_NAME}")
    print(rule)
    print(final_summary)
    print(rule)


       SUMMARY FOR THE DOCUMENT: 9. Introduction to Economics and Demand_UHU005.pdf
Economics is a social science that studies how people and societies use resources to produce, distribute, and consume goods and services. It also examines how people make decisions about allocating resources.
