In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset


In [2]:
ds = load_dataset("FiscalNote/billsum")
df_billsum = pd.DataFrame(ds['train'])
sample_df = df_billsum.sample(n=100, random_state=42).reset_index(drop=True)
sample_df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

data/ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Unnamed: 0,text,summary,title
0,SECTION 1. SHORT TITLE.\n\n This Act may be...,Medicare Prescription Drug Integrity Act of 20...,Medicare Prescription Drug Integrity Act of 2013
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Back to Work Tax Credit - Amends the Internal ...,A bill to amend the Internal Revenue Code of 1...
2,SECTION 1. UTILIZING EVIDENCE FROM CLINICAL EX...,"This bill amends the Federal Food, Drug, and C...","To amend the Federal Food, Drug, and Cosmetic ..."
3,SECTION 1. SHORT TITLE.\n\n This Act may be...,Local Zoning and Property Rights Protection Ac...,Local Zoning and Property Rights Protection Ac...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Paterson Great Falls National Historical Park ...,A bill to establish the Paterson Great Falls N...
...,...,...,...
95,SECTION 1. TREATMENT OF INTEREST EXPENSE OF QU...,Amends Internal Revenue Code provisions concer...,A bill to amend the Internal Revenue Code of 1...
96,SECTION 1. SHORT TITLE.\n\n This Act may be...,No Child Left Behind Improvement Act of 2005 -...,To amend the accountability provisions of the ...
97,SECTION 1. SHORT TITLE.\n\n This Act may be...,Green Communities Act - Directs the Secretary ...,To direct the Secretary of Commerce to make gr...
98,SECTION 1. SHORT TITLE.\n\n This Act may be...,Irene and Lee Tax Relief Storm Recovery Act of...,Irene and Lee Tax Relief Storm Recovery Act of...


In [3]:
sample_df.rename(columns={'text': 'source', 'summary': 'target'}, inplace=True)


In [4]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length = 512)
    decodings = tokenizer(labels, truncation=True, padding=True, max_length = 256)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=val_dataset,            # evaluation dataset
      tokenizer=tokenizer
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      tokenizer=tokenizer
    )

  return trainer



In [6]:
train_texts, train_labels = (list(sample_df['source'])), (list(sample_df['target']))
model_name = 'nsi319/legal-pegasus'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkaranbdave007[0m ([33mkaranbdave[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,6.5156
200,5.8368




TrainOutput(global_step=200, training_loss=6.176185607910156, metrics={'train_runtime': 429.113, 'train_samples_per_second': 0.466, 'train_steps_per_second': 0.466, 'total_flos': 288946441420800.0, 'train_loss': 6.176185607910156, 'epoch': 2.0})

In [8]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b9f1b7ab5a64b215cccd6330667b558937b7e5f84ef75182ad50209a43ef3fa2
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [13]:
from rouge_score import rouge_scorer


In [9]:
def generate_summaries(trainer, texts, device='cuda' if torch.cuda.is_available() else 'cpu', max_length=256, num_beams=4):
    """
    Generate summaries for a list of input texts using the fine-tuned model from trainer.
    """
    model = trainer.model  # Access the fine-tuned model from trainer
    tokenizer = trainer.tokenizer  # Access the tokenizer from trainer
    model.eval()  # Set model to evaluation mode
    model.to(device)

    generated_summaries = []

    for text in texts:
        # Tokenize input text
        inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Generate summary
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True
        )

        # Decode the generated summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        generated_summaries.append(summary)

    return generated_summaries

def evaluate_rouge(generated_summaries, reference_summaries):
    """
    Compute ROUGE scores for generated summaries against reference summaries using rouge_score.
    """
    # Initialize rouge_scorer with desired ROUGE metrics
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Compute ROUGE scores for each pair of generated and reference summaries
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
        scores = scorer.score(ref_summary, gen_summary)
        for key in rouge_scores:
            rouge_scores[key].append({
                'precision': scores[key].precision,
                'recall': scores[key].recall,
                'fmeasure': scores[key].fmeasure
            })

    # Aggregate scores (e.g., compute average for each metric)
    aggregated_scores = {}
    for key in rouge_scores:
        precision = sum(score['precision'] for score in rouge_scores[key]) / len(rouge_scores[key])
        recall = sum(score['recall'] for score in rouge_scores[key]) / len(rouge_scores[key])
        fmeasure = sum(score['fmeasure'] for score in rouge_scores[key]) / len(rouge_scores[key])
        aggregated_scores[key] = {'precision': precision, 'recall': recall, 'fmeasure': fmeasure}

    return aggregated_scores

In [10]:
df = pd.DataFrame(ds['test'])
sample_test = df.sample(n=3, random_state=42).reset_index(drop=True)
sample_test.rename(columns={'text': 'source', 'summary': 'target'}, inplace=True)
test_texts = list(sample_test['source'])
test_labels = list(sample_test['target'])

In [11]:
generated_summaries = generate_summaries(trainer, test_texts)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [15]:
# Print generated summaries and reference summaries
print("Generated Summaries vs Reference Summaries:")
for i, (gen_summary, ref_summary) in enumerate(zip(generated_summaries, test_labels)):
    print(f"\nExample {i+1}:")
    print(f"Generated Summary: {gen_summary}")
    print(f"Reference Summary: {ref_summary}")

# Compute ROUGE scores
rouge_scores = evaluate_rouge(generated_summaries, test_labels)

# Print ROUGE scores
print("\nROUGE Scores:")
for key, value in rouge_scores.items():
    print(f"{key}:")
    print(f"  Precision: {value['precision']:.4f}")
    print(f"  Recall: {value['recall']:.4f}")
    print(f"  F1 Score: {value['fmeasure']:.4f}")

Generated Summaries vs Reference Summaries:

Example 1:

Directs the Secretary to: (1) review the safety of all public railway-highway grade crossings in the United States; and (2) compile and submit to Congress, based on such review, a list of the 5,000 railway-highway grade crossings most in need of safety improvements.


Directs the Secretary to: (1) analyze all laws for preventing trespassing and vandalism on railroad property; and (2) develop model legislation providing for civil and criminal penalties for individuals who violate grade crossing signs, signals, or gates.

Requires the Secretary to inspect annually at least 2% of all highway-rail grade crossings in the 10 states with the highest rates of collisions at such crossings.

Requires the Secretary to investigate, and report to Congress, all fatal accidents in the United States (including fatal railroad accidents) that occur on or after enactment of this Act.

Example 2:
Generated Summary: The Surface Mining Control and Rec

In [None]:
import os
if not os.path.exists('./ouput_model/'):
    os.makedirs('./ouput_model/')
trainer.model.save_pretrained("./ouput_model/")
!zip -r ouput_model.zip ./ouput_model/