In [2]:
#Imports & basic config
import os

import evaluate
import nltk
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Sentence-tokenizer data used by nltk's sent_tokenize. Newer NLTK releases
# look up the "punkt_tab" resource instead of the pickled "punkt" one, so both
# are fetched to keep the notebook runnable across NLTK versions.
nltk.download("punkt")
nltk.download("punkt_tab")

# Model & training config
MODEL_NAME = "t5-small"  # can later try "t5-base" on a bigger GPU
MAX_INPUT_LENGTH = 1024   # max source tokens kept after truncation
MAX_TARGET_LENGTH = 256   # max summary tokens kept after truncation

# Directory where the fine-tuned T5 checkpoints and tokenizer are written
OUTPUT_DIR = "models/t5-billsum"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ROUGE scorer shared by the metric functions below
rouge = evaluate.load("rouge")


[nltk_data] Downloading package punkt to /Users/pavan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Load the BillSum dataset (all splits) through `datasets.load_dataset`.
billsum = load_dataset("billsum")

In [8]:
# `load_dataset` is imported in the first cell and `billsum` is loaded in the
# previous cell, so neither is repeated here.

# Convert the train/test splits into pandas DataFrames for quick inspection
train_df = billsum["train"].to_pandas()
test_df = billsum["test"].to_pandas()

train_df.head()


Unnamed: 0,text,summary,title
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...,A bill to limit the civil liability of busines...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...,Human Rights Information Act
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...,Jackie Robinson Commemorative Coin Act
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...,To amend the Internal Revenue Code to provide ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...,Native American Energy Act


In [9]:
# Preview the first rows of the test split DataFrame
test_df.head()

Unnamed: 0,text,summary,title
0,SECTION 1. ENVIRONMENTAL INFRASTRUCTURE.\n\n ...,Amends the Water Resources Development Act of ...,To make technical corrections to the Water Res...
1,That this Act may be cited as the ``Federal Fo...,Federal Forage Fee Act of 1993 - Subjects graz...,Federal Forage Fee Act of 1993
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,. Merchant Marine of World War II Congression...,Merchant Marine of World War II Congressional ...
3,SECTION 1. SHORT TITLE.\n\n This Act may be...,Small Business Modernization Act of 2004 - Ame...,To amend the Internal Revenue Code of 1986 to ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Fair Access to Investment Research Act of 2016...,Fair Access to Investment Research Act of 2016


In [10]:
# Split handles for the full BillSum dataset
train_ds = billsum["train"]      # 18,949 documents
test_ds = billsum["test"]        # 3,269 documents
ca_test_ds = billsum["ca_test"]  # 1,237 documents

print("Train size:", len(train_ds))
print("Test size:", len(test_ds))
print("CA Test size:", len(ca_test_ds))

# Show a truncated preview of the first record rather than dumping a whole
# bill into the cell output.
print({field: str(value)[:200] for field, value in train_ds[0].items()})

Train size: 18949
Test size: 3269
CA Test size: 1237
{'text': "SECTION 1. LIABILITY OF BUSINESS ENTITIES PROVIDING USE OF FACILITIES \n              TO NONPROFIT ORGANIZATIONS.\n\n    (a) Definitions.--In this section:\n            (1) Business entity.--The term ``business entity'' means a \n        firm, corporation, association, partnership, consortium, joint \n        venture, or other form of enterprise.\n            (2) Facility.--The term ``facility'' means any real \n        property, including any building, improvement, or appurtenance.\n            (3) Gross negligence.--The term ``gross negligence'' means \n        voluntary and conscious conduct by a person with knowledge (at \n        the time of the conduct) that the conduct is likely to be \n        harmful to the health or well-being of another person.\n            (4) Intentional misconduct.--The term ``intentional \n        misconduct'' means conduct by a person with knowledge (at the \n        time of the conduct) tha

In [11]:
#Sentence count stats
from nltk.tokenize import sent_tokenize

def count_sentences(text: str) -> int:
    """Return the number of sentences that `sent_tokenize` finds in `text`."""
    sentences = sent_tokenize(text)
    return len(sentences)

# NOTE: sentence-tokenizing every document takes a while; run it once for the report.
# Only the "text" column is read, so the summary/title columns are never
# materialised while iterating.
train_sentence_counts = [count_sentences(doc) for doc in train_ds["text"]]
test_sentence_counts = [count_sentences(doc) for doc in test_ds["text"]]
ca_sentence_counts = [count_sentences(doc) for doc in ca_test_ds["text"]]

total_train_sentences = sum(train_sentence_counts)
total_test_sentences = sum(test_sentence_counts)
total_ca_sentences = sum(ca_sentence_counts)

print("Total sentences (TRAIN):   ", total_train_sentences)
print("Total sentences (TEST):    ", total_test_sentences)
print("Total sentences (CA_TEST): ", total_ca_sentences)
print("TOTAL sentences (ALL):     ",
      total_train_sentences + total_test_sentences + total_ca_sentences)

print("\nAverage sentences per train doc:",
      np.mean(train_sentence_counts))


Total sentences (TRAIN):    877056
Total sentences (TEST):     149829
Total sentences (CA_TEST):  64694
TOTAL sentences (ALL):      1091579

Average sentences per train doc: 46.285081006913295


In [12]:
# Hold out 5% of the training split as a validation set (fixed seed)
train_valid = train_ds.train_test_split(test_size=0.05, seed=42)
train_dataset, valid_dataset = train_valid["train"], train_valid["test"]

print(f"Train dataset: {len(train_dataset)}")
print(f"Validation dataset: {len(valid_dataset)}")


Train dataset: 18001
Validation dataset: 948


In [13]:
# Tokenizer & preprocessing function
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(examples):
    """Tokenize a batch of examples for T5 summarization.

    Reads the "text" and "summary" columns of `examples`, prepends the
    "summarize: " task prefix to every document, and truncates sources to
    MAX_INPUT_LENGTH and summaries to MAX_TARGET_LENGTH tokens.

    Returns the tokenizer output for the sources with the summary token ids
    stored under "labels".
    """
    # T5 convention: every source document carries the task prefix
    prefixed_docs = ["summarize: " + doc for doc in examples["text"]]

    encoded = tokenizer(prefixed_docs, max_length=MAX_INPUT_LENGTH, truncation=True)
    encoded_summaries = tokenizer(
        examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    encoded["labels"] = encoded_summaries["input_ids"]
    return encoded


def _tokenize_for_t5(split):
    """Run `preprocess_function` over `split`, dropping its original columns."""
    # batched=True lets the tokenizer process many examples per call
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )


tokenized_train = _tokenize_for_t5(train_dataset)
tokenized_valid = _tokenize_for_t5(valid_dataset)
tokenized_test = _tokenize_for_t5(test_ds)

print(tokenized_train)
print(tokenized_valid)


Map: 100%|██████████| 948/948 [00:00<00:00, 1062.48 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18001
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 948
})





In [14]:
# Data collator & ROUGE metric function
# The collator's `model` argument expects a model *instance* (it is only used
# to derive decoder_input_ids from the labels). The checkpoint-name string that
# was passed before has no such method, so it had no effect; it is dropped.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from each decoded prediction and label.

    Returns a `(preds, labels)` tuple of two new lists.
    """
    return [text.strip() for text in preds], [text.strip() for text in labels]

def compute_metrics(eval_pred, tok=None):
    """Compute ROUGE scores (as percentages) for generated summaries.

    Args:
        eval_pred: `(predictions, labels)` pair of token-id arrays as handed
            over by the trainer.
        tok: tokenizer used to decode the ids. Defaults to the notebook-level
            T5 `tokenizer`; pass another one (e.g. with `functools.partial`)
            when scoring a model with a different vocabulary.

    Returns:
        dict mapping each ROUGE metric name to its score * 100, rounded to
        two decimals.
    """
    if tok is None:
        tok = tokenizer

    preds, labels = eval_pred
    # Predictions may arrive as a tuple; the first element holds the ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded, so swap in the pad id
    preds = np.where(preds != -100, preds, tok.pad_token_id)
    decoded_preds = tok.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tok.pad_token_id)
    decoded_labels = tok.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Convert to percentage
    return {k: round(v * 100, 2) for k, v in result.items()}


In [15]:
# Load model & define Trainer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# You can tune these based on the available GPU
batch_size = 4
num_train_epochs = 2  # For full training you can increase later
learning_rate = 3e-4

# Mixed precision is only enabled when a CUDA GPU is present, so the notebook
# also runs on CPU / Apple-silicon machines. bf16 is preferred over fp16 when
# the GPU supports it.
# NOTE(review): T5 checkpoints have a history of fp16 overflow - confirm the
# loss stays finite if the fp16 path is taken.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    # Generate validation summaries up to the same length used for the test
    # evaluation, so per-epoch ROUGE is comparable with the test ROUGE.
    generation_max_length=MAX_TARGET_LENGTH,
    bf16=use_bf16,
    fp16=use_cuda and not use_bf16,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",  # disable WandB etc unless you want it
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    # `tokenizer=` is deprecated for the trainer; `processing_class` replaces it
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [None]:
# Train
# Fine-tune the T5 model with the arguments configured in `training_args`.
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on test set + save model
# metric_key_prefix="test" keeps these scores from being reported under the
# default "eval_*" keys that the validation runs use.
test_metrics = trainer.evaluate(
    tokenized_test,
    metric_key_prefix="test",
    max_length=MAX_TARGET_LENGTH,
)
print("Test metrics:", test_metrics)

# Save model & tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved to: {OUTPUT_DIR}")


Used BillSum full dataset (N ≈ 23k documents, ≈ 1.09M sentences).

Fine-tuned T5-small for abstractive summarization.

Report ROUGE-1, ROUGE-2, ROUGE-L on validation & test.

Describe:

Preprocessing pipeline (prefix, truncation, max lengths)

Hyperparameters: LR, epochs, batch size

Comparison: base T5 vs fine-tuned (you can generate a few before/after examples manually)

## BART MODEL:

In [16]:
# BART config & tokenizer
from transformers import BartTokenizerFast, BartForConditionalGeneration

# Checkpoint name and sequence-length limits for BART
BART_MODEL_NAME = "facebook/bart-base"
BART_MAX_INPUT_LENGTH = 1024
BART_MAX_TARGET_LENGTH = 256

# Directory that receives the fine-tuned BART checkpoints
BART_OUTPUT_DIR = "models/bart-billsum"
os.makedirs(BART_OUTPUT_DIR, exist_ok=True)

bart_tokenizer = BartTokenizerFast.from_pretrained(BART_MODEL_NAME)


In [17]:
# Preprocessing function for BART
def preprocess_bart_function(examples):
    """Tokenize a batch of examples for BART summarization.

    Reads the "text" and "summary" columns of `examples` and truncates sources
    to BART_MAX_INPUT_LENGTH and summaries to BART_MAX_TARGET_LENGTH tokens.
    No task prefix is added for BART.

    Returns the tokenizer output for the sources with the summary token ids
    stored under "labels".
    """
    inputs = examples["text"]
    targets = examples["summary"]

    model_inputs = bart_tokenizer(
        inputs,
        max_length=BART_MAX_INPUT_LENGTH,
        truncation=True,
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the label side.
    labels = bart_tokenizer(
        text_target=targets,
        max_length=BART_MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def _tokenize_for_bart(split):
    """Run `preprocess_bart_function` over `split`, dropping its original columns."""
    return split.map(
        preprocess_bart_function,
        batched=True,
        remove_columns=split.column_names,
    )


tokenized_train_bart = _tokenize_for_bart(train_dataset)
tokenized_valid_bart = _tokenize_for_bart(valid_dataset)
tokenized_test_bart = _tokenize_for_bart(test_ds)

print(tokenized_train_bart)
print(tokenized_valid_bart)


Map: 100%|██████████| 18001/18001 [00:19<00:00, 925.88 examples/s]
Map: 100%|██████████| 948/948 [00:00<00:00, 1112.87 examples/s]
Map: 100%|██████████| 3269/3269 [00:03<00:00, 1069.63 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18001
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 948
})





In [18]:
# Data collator for BART
# The collator's `model` argument expects a model *instance*; the
# checkpoint-name string that was passed before had no effect and is dropped.
bart_data_collator = DataCollatorForSeq2Seq(tokenizer=bart_tokenizer)


In [20]:
# Load BART model & define Trainer
def compute_metrics_bart(eval_pred):
    """Compute ROUGE scores (as percentages) for BART generations.

    Mirrors `compute_metrics` but decodes with `bart_tokenizer`: the shared
    function decodes with the T5 `tokenizer`, whose vocabulary does not match
    the token ids produced by BART.
    """
    preds, labels = eval_pred
    # Predictions may arrive as a tuple; the first element holds the ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded, so swap in the pad id
    pad_id = bart_tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds, decoded_labels = postprocess_text(
        bart_tokenizer.batch_decode(preds, skip_special_tokens=True),
        bart_tokenizer.batch_decode(labels, skip_special_tokens=True),
    )

    scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    return {name: round(value * 100, 2) for name, value in scores.items()}


bart_model = BartForConditionalGeneration.from_pretrained(BART_MODEL_NAME)

bart_batch_size = 4          # you can lower to 2 if you get OOM
bart_num_train_epochs = 2    # same as T5 so comparison is fair
# NOTE(review): 3e-4 mirrors the T5 setting; BART fine-tuning commonly uses a
# much smaller learning rate (order of 1e-5 to 5e-5) - confirm training is stable.
bart_learning_rate = 3e-4

# Mixed precision is only enabled when a CUDA GPU is present; bf16 is
# preferred over fp16 when the GPU supports it.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

bart_training_args = Seq2SeqTrainingArguments(
    output_dir=BART_OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=bart_learning_rate,
    per_device_train_batch_size=bart_batch_size,
    per_device_eval_batch_size=bart_batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=bart_num_train_epochs,
    predict_with_generate=True,
    # Generate validation summaries up to the same length used for the test
    # evaluation, so per-epoch ROUGE is comparable with the test ROUGE.
    generation_max_length=BART_MAX_TARGET_LENGTH,
    bf16=use_bf16,
    fp16=use_cuda and not use_bf16,
    logging_dir="./logs_bart",
    logging_steps=50,
    report_to="none",
)

bart_trainer = Seq2SeqTrainer(
    model=bart_model,
    args=bart_training_args,
    train_dataset=tokenized_train_bart,
    eval_dataset=tokenized_valid_bart,
    # `tokenizer=` is deprecated for the trainer; `processing_class` replaces it
    processing_class=bart_tokenizer,
    data_collator=bart_data_collator,
    compute_metrics=compute_metrics_bart,  # same ROUGE metric, BART decoding
)


  bart_trainer = Seq2SeqTrainer(


In [None]:
# Train BART
# Fine-tune the BART model with the arguments configured in `bart_training_args`.
bart_trainer.train()

In [None]:
# Evaluate BART on test set + save model
# metric_key_prefix="test" keeps these scores from being reported under the
# default "eval_*" keys that the validation runs use.
bart_test_metrics = bart_trainer.evaluate(
    tokenized_test_bart,
    metric_key_prefix="test",
    max_length=BART_MAX_TARGET_LENGTH,
)
print("BART Test metrics:", bart_test_metrics)

# Save model & tokenizer
bart_trainer.save_model(BART_OUTPUT_DIR)
bart_tokenizer.save_pretrained(BART_OUTPUT_DIR)

print(f"BART model saved to: {BART_OUTPUT_DIR}")


Two models:

t5-small fine-tuned on BillSum

facebook/bart-base fine-tuned on BillSum

In your writeup:

Experimental Setup section:

Describe both models and why you chose them (encoder–decoder transformers for summarization).

Mention hyperparameters: epochs, LR, batch size, max lengths.

Dataset section:

Use numbers from the sentence-count cell to show you used a large dataset (≥ 50,000 sentences).

Results / Discussion:

Present ROUGE scores from:

test_metrics (T5)

bart_test_metrics (BART)

Compare:

Which performs better on ROUGE-1 / ROUGE-2 / ROUGE-L

Any qualitative differences: e.g., T5 more concise, BART more verbose or faithful.

Conclusion:

Briefly mention which model you ultimately used in the live app (your current backend uses T5; you can note BART as an alternative/enhancement).