In [1]:
# 01: Imports & setup

from datasets import load_dataset
import pandas as pd
import nltk

# Download the Punkt sentence-tokenizer data that sent_tokenize relies on
# (nltk.download skips packages that are already up to date).
# Recent NLTK releases (3.9+) load the "punkt_tab" resource instead of the
# pickled "punkt" package, so fetch both to keep the notebook runnable on
# either side of that change.
nltk.download("punkt_tab")
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/pavan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# 02: Load full BillSum dataset

# One load_dataset call per split, in the order train / test / ca_test
train_ds, test_ds, ca_test_ds = (
    load_dataset("billsum", split=split_name)
    for split_name in ("train", "test", "ca_test")
)

# Report how many rows each split holds
for size_label, split_ds in (
    ("Train size:", train_ds),
    ("Test size:", test_ds),
    ("CA Test size:", ca_test_ds),
):
    print(size_label, len(split_ds))


Train size: 18949
Test size: 3269
CA Test size: 1237


In [3]:
# 03: Convert train split to a pandas DataFrame

# Materialise the train split as a DataFrame (one row per document)
train_df = pd.DataFrame(data=train_ds)

# Quick sanity check: shape is (num_docs, num_columns)
print("Train DataFrame shape:", train_df.shape)
print("Columns:", train_df.columns.tolist())

# Preview the first five rows
train_df.head(5)


Train DataFrame shape: (18949, 3)
Columns: ['text', 'summary', 'title']


Unnamed: 0,text,summary,title
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...,A bill to limit the civil liability of busines...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...,Human Rights Information Act
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...,Jackie Robinson Commemorative Coin Act
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...,To amend the Internal Revenue Code to provide ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...,Native American Energy Act


In [4]:
# 04: Count sentences per document in train split

from nltk.tokenize import sent_tokenize

def count_sentences(text: str) -> int:
    """Count the sentences in ``text`` using NLTK's ``sent_tokenize``.

    Args:
        text: Document text to split into sentences. Missing values
            (``None`` or a float NaN, which pandas uses for gaps in a
            text column) are accepted and treated as empty documents.

    Returns:
        The number of sentences returned by ``sent_tokenize``; 0 when the
        text is missing.
    """
    # sent_tokenize expects a string, so a single missing cell would abort a
    # whole ``Series.apply`` pass; count such cells as zero sentences instead.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(sent_tokenize(text))

# Attach a per-document sentence count to the train DataFrame
train_df["sentence_count"] = train_df["text"].apply(count_sentences)
print("Train shape (after adding sentence_count):", train_df.shape)

# Corpus-level total for the train split
total_sentences_train = train_df.loc[:, "sentence_count"].sum()
print("Total sentences in TRAIN split:", total_sentences_train)

# Preview the first five rows, with the count shown next to the title
train_df[["title", "sentence_count", "text", "summary"]].head(5)


Train shape (after adding sentence_count): (18949, 4)
Total sentences in TRAIN split: 877056


Unnamed: 0,title,sentence_count,text,summary
0,A bill to limit the civil liability of busines...,15,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...
1,Human Rights Information Act,87,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...
2,Jackie Robinson Commemorative Coin Act,60,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...
3,To amend the Internal Revenue Code to provide ...,31,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...
4,Native American Energy Act,126,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...


In [5]:
# 05: Sentence stats for all splits (train + test + ca_test)

def total_sentences_for_split(ds, text_column: str = "text"):
    """Count sentences for every document in a dataset split.

    Args:
        ds: Any input that ``pd.DataFrame`` accepts and that produces a
            ``text_column`` column.
        text_column: Name of the column holding the document text.
            Defaults to ``"text"``, the column this notebook tokenizes.

    Returns:
        A ``(total, df)`` tuple: the sum of the per-document sentence
        counts, and the split as a DataFrame with an added
        ``sentence_count`` column.

    Raises:
        KeyError: If ``text_column`` is not a column of the converted frame.
    """
    df = pd.DataFrame(ds)
    if text_column not in df.columns:
        # Same exception type a plain df[text_column] lookup would raise,
        # but with a message that names the columns actually present.
        raise KeyError(
            f"column {text_column!r} not found; available columns: {list(df.columns)}"
        )
    df["sentence_count"] = df[text_column].apply(count_sentences)
    return df["sentence_count"].sum(), df

# Only the totals are needed here, so index the returned (total, df) tuple
# instead of unpacking the DataFrame into `_`: in IPython/Jupyter `_` is the
# "last output" variable, and assigning to it shadows that feature while
# keeping the last split's DataFrame alive in memory.
total_train_sentences = total_sentences_for_split(train_ds)[0]
total_test_sentences = total_sentences_for_split(test_ds)[0]
total_ca_test_sentences = total_sentences_for_split(ca_test_ds)[0]

print("Total sentences (TRAIN):   ", total_train_sentences)
print("Total sentences (TEST):    ", total_test_sentences)
print("Total sentences (CA_TEST): ", total_ca_test_sentences)
print("TOTAL sentences (ALL):     ",
      total_train_sentences + total_test_sentences + total_ca_test_sentences)


Total sentences (TRAIN):    877056
Total sentences (TEST):     149829
Total sentences (CA_TEST):  64694
TOTAL sentences (ALL):      1091579
