In [1]:
from TALib import TALib

In [2]:
# Instantiate the course helper class. NOTE(review): the visible cells call the
# helpers on the class itself (`TALib.<name>`), so this instance appears unused
# here — confirm whether the constructor has required side effects.
ta_lib = TALib()

## Loading the BillSum dataset

In [3]:
from datasets  import load_dataset

# Load only the "train" split of the BillSum dataset; it is divided into
# train/validation portions in a later cell.
billsum = load_dataset("billsum", split="train")

In [4]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq , AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


# Tokenizer and seq2seq model are both loaded with `from_pretrained`, using the
# checkpoint identifiers exposed as class attributes on TALib.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=TALib.TK_ckpt,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=TALib.CHECKPOINT,
)

In [5]:
# Build the preprocessing callable from the tokenizer; it is passed to
# `Dataset.map(..., batched=True)` in the following cells.
preprocess_function = TALib.preprocess_function_pass_tokenizer(tokenizer)

In [6]:
# Hold out 20% of the training split for evaluation. A fixed seed makes the
# split identical across Restart & Run All, so metrics are comparable between
# runs (42 matches the `seed` given to the training arguments).
# NOTE: this rebinds `billsum` from a Dataset to a DatasetDict, so the cell is
# not re-runnable without reloading the dataset first.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

# Tokenize both resulting splits ("train" and "test") in batches.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map:   0%|          | 0/3790 [00:00<?, ? examples/s]

## Data collator and evaluation metric

In [7]:
from transformers import DataCollatorForSeq2Seq

# Dynamically pads inputs and labels per batch. `model` must be the loaded
# model object (not the checkpoint name): the collator only prepares
# `decoder_input_ids` from the labels when the object it is given exposes
# `prepare_decoder_input_ids_from_labels`, which a plain string never does.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [8]:
import evaluate

# Load the ROUGE metric from the `evaluate` library.
# NOTE(review): `rouge` is not referenced in the visible cells (metrics come
# from `TALib.compute_metrics_pass_tokenizer`) — confirm whether it is needed.
rouge = evaluate.load("rouge")

In [9]:
from rich import print

In [10]:
# Display the model architecture. `print` here is rich's `print`, imported in
# the previous cell, which shadows the builtin from that point on.
print(model)

## TA's trainer for fine-tuning T5-small

In [11]:
# Hyperparameters handed to the Seq2SeqTrainer, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- output / checkpointing ---
    output_dir="TA_billsum_model",
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    num_train_epochs=4,
    per_device_train_batch_size=2,
    fp16=True,  # mixed-precision training (native AMP)
    seed=42,
    # --- evaluation / logging ---
    evaluation_strategy="epoch",
    per_device_eval_batch_size=2,
    predict_with_generate=True,  # evaluate/predict via generate() so text metrics can be computed
    logging_steps=10,
)

# Metric callback built by TALib from the tokenizer; the trainer invokes it
# during evaluation.
compute_metrics = TALib.compute_metrics_pass_tokenizer(tokenizer)

# Wire the model, arguments, data and metric callback together. The "test"
# key below is the held-out part of the train/test split made earlier, not
# the official BillSum test split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
# Ask TALib for the model's parameter ratio; the result is printed in the next cell.
# NOTE(review): what the ratio is measured against is not visible here — confirm in TALib.
ratio = TALib.show_param_ratio(model=model)


In [13]:
# Show the value computed in the previous cell (uses rich's `print`, imported earlier).
print(ratio)

## Prediction Part

In [14]:
# Official BillSum test split, kept in raw form because it is reused later as
# the reference for scoring.
billsum_test = load_dataset(path="billsum", split="test")

# Same batched preprocessing that was applied to the training data.
tokenized_billsum_test = billsum_test.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

In [15]:
# Evaluate on the tokenized BillSum test split; the returned metrics dict
# (loss, ROUGE scores, generation length, runtime) is the cell output.
trainer.evaluate(tokenized_billsum_test)



{'eval_loss': 1.4963345527648926,
 'eval_rouge1': 0.241,
 'eval_rouge2': 0.1962,
 'eval_rougeL': 0.2333,
 'eval_rougeLsum': 0.2334,
 'eval_gen_len': 18.9997,
 'eval_runtime': 729.3526,
 'eval_samples_per_second': 4.482,
 'eval_steps_per_second': 2.242}

In [16]:
# Run prediction over the same tokenized test split that was evaluated above;
# the first element of `results` (the predicted token ids) is decoded in the
# next cell.
results = trainer.predict(tokenized_billsum_test)

In [17]:
# Predictions gathered across batches can be padded with -100 when batches
# differ in length; -100 is not a valid token id and makes `batch_decode`
# fail. Swap it for the pad token id first, working on a copy so that
# `results` itself is left untouched.
predicted_ids = results.predictions.copy()
predicted_ids[predicted_ids == -100] = tokenizer.pad_token_id

# Turn the generated token ids back into text, dropping special tokens
# (padding, end-of-sequence).
decoded_prediction = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)


In [23]:
# Hand the decoded summaries to TALib for export under the given file name
# (presumably written as a Kaggle submission CSV — confirm in TALib).
# NOTE(review): this cell's execution count (23) is out of sequence with the
# preceding cell (17); re-run the notebook top to bottom to confirm it does
# not depend on hidden state.
TALib.dump_to_kaggle_format(decoded_prediction , 'full_model_sample_submission.csv')

Unnamed: 0,ID,Predict
0,0,Amends the Water Resources Development Act of ...
1,1,Federal Forage Fee Act of 1993 - Requires all ...
2,2,Merchant Marine of World War II Congressional ...
3,3,Small Business Tax Modernization Act of 2004 -...
4,4,Fair Access to Investment Research Act of 2016...
...,...,...
3264,3264,Public Servant Priority Placement Act of 1995 ...
3265,3265,Sportmanship in Hunting Act of 2008 - Amends t...
3266,3266,Helping College Students Cross the Finish Line...
3267,3267,Texas National Forests Improvement Act of 2000...


### Calculating ROUGE-Lsum with a built-in Python function

In [None]:
# Score the decoded predictions against the raw BillSum test split via TALib.
# NOTE(review): the heading says ROUGE-Lsum, but the metric computed by
# `run_score` is not visible here — confirm in TALib. This cell has no
# execution count, i.e. it was not run in the saved notebook state.
final_score = TALib.run_score(predict=decoded_prediction,label=billsum_test)

In [None]:
# Show the score computed in the previous cell (uses rich's `print`, imported earlier).
print(final_score)