In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch



  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [5]:
## Load data, but because my M1 Mac was too slow, I only used 100 examples for this demo
billsum = load_dataset("billsum", split="ca_test")
# print billsum size
print("billsum size: ", len(billsum))

# only keep 100 examples for this demo
billsum = billsum.select(range(100))

print("billsum size: ", len(billsum))
billsum = billsum.train_test_split(test_size=0.2)
billsum["train"][0]

billsum size:  1237
billsum size:  100


{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10820 of the Corporations Code is amended to read:\n10820.\n(a) “Health care service plan,” as used in this section means a corporation that is a health care service plan defined in the Knox-Keene Health Care Service Plan Act of 1975 (Chapter 2.2 (commencing with Section 1340) of Division 2 of the Health and Safety Code), other than a corporation that is exempted from that act by subdivision (c) of Section 1343 of the Health and Safety Code.\n(b) A health care service plan may be formed under or subject to Part 2 (commencing with Section 5110) of this division or Part 3 (commencing with Section 7110) of this division.\nSEC. 2.\nSection 1343 of the Health and Safety Code is amended to read:\n1343.\n(a) This chapter shall apply to health care service plans and specialized health care service plan contracts as defined in subdivisions (f) and (o) of Section 1345.\n(b) The director may by the adopt

In [6]:
## t5-small as checkpoint
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
# add prefix and preprocess function
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [8]:
# preprocess data use map function can speed up the process
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [9]:
# Seq2Seq models are usually trained with a batch size of 16 or 32,
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
# evaluate function use rouge metric
rouge = evaluate.load("rouge")

In [11]:
# compute metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
# load teh Seq2Seq model
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [13]:
# Because I am using M1 Mac, so set the device to mps, if you are using GPU, you can set it to cuda
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)
device

device(type='mps')

In [14]:
## trainer setup
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
# start training
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  if unfinished_sequences.max() == 0:


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 4.400156497955322, 'eval_rouge1': 0.1333, 'eval_rouge2': 0.0378, 'eval_rougeL': 0.1094, 'eval_rougeLsum': 0.109, 'eval_gen_len': 19.0, 'eval_runtime': 5.5328, 'eval_samples_per_second': 3.615, 'eval_steps_per_second': 0.542, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.8224728107452393, 'eval_rouge1': 0.1325, 'eval_rouge2': 0.0351, 'eval_rougeL': 0.1085, 'eval_rougeLsum': 0.1081, 'eval_gen_len': 19.0, 'eval_runtime': 3.533, 'eval_samples_per_second': 5.661, 'eval_steps_per_second': 0.849, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.5342764854431152, 'eval_rouge1': 0.1343, 'eval_rouge2': 0.0361, 'eval_rougeL': 0.1109, 'eval_rougeLsum': 0.1109, 'eval_gen_len': 19.0, 'eval_runtime': 3.545, 'eval_samples_per_second': 5.642, 'eval_steps_per_second': 0.846, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.391953229904175, 'eval_rouge1': 0.1253, 'eval_rouge2': 0.0307, 'eval_rougeL': 0.1069, 'eval_rougeLsum': 0.1067, 'eval_gen_len': 19.0, 'eval_runtime': 3.4996, 'eval_samples_per_second': 5.715, 'eval_steps_per_second': 0.857, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.28490948677063, 'eval_rouge1': 0.1239, 'eval_rouge2': 0.0275, 'eval_rougeL': 0.1028, 'eval_rougeLsum': 0.103, 'eval_gen_len': 19.0, 'eval_runtime': 3.5545, 'eval_samples_per_second': 5.627, 'eval_steps_per_second': 0.844, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.2041313648223877, 'eval_rouge1': 0.1227, 'eval_rouge2': 0.0237, 'eval_rougeL': 0.1015, 'eval_rougeLsum': 0.1016, 'eval_gen_len': 19.0, 'eval_runtime': 3.5236, 'eval_samples_per_second': 5.676, 'eval_steps_per_second': 0.851, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.143918037414551, 'eval_rouge1': 0.1234, 'eval_rouge2': 0.0218, 'eval_rougeL': 0.1022, 'eval_rougeLsum': 0.1023, 'eval_gen_len': 19.0, 'eval_runtime': 3.5146, 'eval_samples_per_second': 5.691, 'eval_steps_per_second': 0.854, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.097933769226074, 'eval_rouge1': 0.1286, 'eval_rouge2': 0.026, 'eval_rougeL': 0.1057, 'eval_rougeLsum': 0.106, 'eval_gen_len': 19.0, 'eval_runtime': 3.5316, 'eval_samples_per_second': 5.663, 'eval_steps_per_second': 0.849, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.0624492168426514, 'eval_rouge1': 0.1298, 'eval_rouge2': 0.0289, 'eval_rougeL': 0.1048, 'eval_rougeLsum': 0.105, 'eval_gen_len': 19.0, 'eval_runtime': 3.6247, 'eval_samples_per_second': 5.518, 'eval_steps_per_second': 0.828, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.0350544452667236, 'eval_rouge1': 0.1286, 'eval_rouge2': 0.0299, 'eval_rougeL': 0.105, 'eval_rougeLsum': 0.1053, 'eval_gen_len': 19.0, 'eval_runtime': 3.5485, 'eval_samples_per_second': 5.636, 'eval_steps_per_second': 0.845, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.013465404510498, 'eval_rouge1': 0.1292, 'eval_rouge2': 0.0288, 'eval_rougeL': 0.1066, 'eval_rougeLsum': 0.1068, 'eval_gen_len': 19.0, 'eval_runtime': 3.497, 'eval_samples_per_second': 5.719, 'eval_steps_per_second': 0.858, 'epoch': 11.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.995612621307373, 'eval_rouge1': 0.1148, 'eval_rouge2': 0.0195, 'eval_rougeL': 0.0942, 'eval_rougeLsum': 0.0938, 'eval_gen_len': 19.0, 'eval_runtime': 3.4929, 'eval_samples_per_second': 5.726, 'eval_steps_per_second': 0.859, 'epoch': 12.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.9813499450683594, 'eval_rouge1': 0.1167, 'eval_rouge2': 0.0195, 'eval_rougeL': 0.0943, 'eval_rougeLsum': 0.0939, 'eval_gen_len': 19.0, 'eval_runtime': 3.527, 'eval_samples_per_second': 5.671, 'eval_steps_per_second': 0.851, 'epoch': 13.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.9697329998016357, 'eval_rouge1': 0.1129, 'eval_rouge2': 0.0204, 'eval_rougeL': 0.0935, 'eval_rougeLsum': 0.093, 'eval_gen_len': 19.0, 'eval_runtime': 3.4875, 'eval_samples_per_second': 5.735, 'eval_steps_per_second': 0.86, 'epoch': 14.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.960644483566284, 'eval_rouge1': 0.1129, 'eval_rouge2': 0.0204, 'eval_rougeL': 0.0935, 'eval_rougeLsum': 0.093, 'eval_gen_len': 19.0, 'eval_runtime': 3.5006, 'eval_samples_per_second': 5.713, 'eval_steps_per_second': 0.857, 'epoch': 15.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.953394651412964, 'eval_rouge1': 0.1125, 'eval_rouge2': 0.0198, 'eval_rougeL': 0.0934, 'eval_rougeLsum': 0.0931, 'eval_gen_len': 19.0, 'eval_runtime': 3.5039, 'eval_samples_per_second': 5.708, 'eval_steps_per_second': 0.856, 'epoch': 16.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.9477615356445312, 'eval_rouge1': 0.1117, 'eval_rouge2': 0.0199, 'eval_rougeL': 0.0955, 'eval_rougeLsum': 0.0951, 'eval_gen_len': 19.0, 'eval_runtime': 3.5111, 'eval_samples_per_second': 5.696, 'eval_steps_per_second': 0.854, 'epoch': 17.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.94362735748291, 'eval_rouge1': 0.1117, 'eval_rouge2': 0.0199, 'eval_rougeL': 0.0955, 'eval_rougeLsum': 0.0951, 'eval_gen_len': 19.0, 'eval_runtime': 3.5118, 'eval_samples_per_second': 5.695, 'eval_steps_per_second': 0.854, 'epoch': 18.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.9411141872406006, 'eval_rouge1': 0.1117, 'eval_rouge2': 0.0199, 'eval_rougeL': 0.0955, 'eval_rougeLsum': 0.0951, 'eval_gen_len': 19.0, 'eval_runtime': 3.4982, 'eval_samples_per_second': 5.717, 'eval_steps_per_second': 0.858, 'epoch': 19.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.9403185844421387, 'eval_rouge1': 0.1117, 'eval_rouge2': 0.0199, 'eval_rougeL': 0.0955, 'eval_rougeLsum': 0.0951, 'eval_gen_len': 19.0, 'eval_runtime': 3.5438, 'eval_samples_per_second': 5.644, 'eval_steps_per_second': 0.847, 'epoch': 20.0}
{'train_runtime': 473.0661, 'train_samples_per_second': 3.382, 'train_steps_per_second': 0.423, 'train_loss': 3.460495910644531, 'epoch': 20.0}


TrainOutput(global_step=200, training_loss=3.460495910644531, metrics={'train_runtime': 473.0661, 'train_samples_per_second': 3.382, 'train_steps_per_second': 0.423, 'train_loss': 3.460495910644531, 'epoch': 20.0})

In [1]:
# That just for push the model to hub after fine-tuning
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1701042979.daruiruidekuailetong.local.19163.0:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

'https://huggingface.co/KRayRay/my_awesome_billsum_model/tree/main/'

In [17]:
# to try this model, you can use the following code
from transformers import pipeline

text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

summarizer = pipeline("summarization", model="KRayRay/my_awesome_billsum_model")

print(summarizer(text))

  _torch_pytree._register_pytree_node(


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs . it's the most aggressive action on tackling the climate crisis in American history . no one making under $400,000 per year will pay a penny more in taxes."}]
