# Fine-tune 6 selected models on the PubMed dataset and save them to the Hugging Face Hub

The names used to save fine-tuned models on HF:

short-form models:
- bart-pubmed-20k (original: facebook/bart-large-cnn)

- distilbart-pubmed-20k (original: philschmid/distilbart-cnn-12-6-samsum)

- pegasus-pubmed-20k (original: tuner007/pegasus_summarizer)

long-form models:

- primera-pubmed-20k (original: allenai/PRIMERA)

- led-pubmed-20k (original: pszemraj/led-base-book-summary)

- longformer-pubmed-20k (original: hyesunyun/update-summarization-bart-large-longformer)

models finetuned on bottom-truncated dataset:

- bart-pubmed-20k-bottom-tokens

- distilbart-pubmed-20k-bottom-tokens

- pegasus-pubmed-20k-bottom-tokens

---
**Note:** All the datasets, generated summaries, and evaluation results can be found in my shared Google Drive folder: https://drive.google.com/drive/folders/1sNoJxaShjifrt_AqyG5_sZYGxHknqfOM?usp=sharing



In [None]:
from google.colab import drive

# Mount Google Drive; the CSV files read later live under this mount point
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from huggingface_hub import notebook_login

# --- Model selection ---------------------------------------------------------
# To fine-tune a different model, replace both values with one of the
# (fine-tuned name, original Hugging Face checkpoint) pairs listed at the top
# of this notebook.
huggingface_model = "pszemraj/led-base-book-summary"
folder_name = "led-pubmed-20k"

# --- Sequence-length limits (in tokens) --------------------------------------
article_max_len = 4096   # tokenizer max_length applied to the articles
abstract_max_len = 400   # tokenizer max_length applied to the abstracts (rough max of dataset)

# Not referenced by any other cell visible in this notebook; presumably the
# subset size behind the "20k" suffix -- TODO confirm.
scaledown = 20000

# Log in to the Hugging Face Hub from inside the notebook
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install wandb



In [None]:
import wandb

# Start a W&B run for this fine-tuning session, named after the model folder
wandb.init(
    name=folder_name,
    project="FineTune-TextSummarize",
)

[34m[1mwandb[0m: Currently logged in as: [33mairolg1111[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
!pip install ipython-autotime
%load_ext autotime
!pip install datasets
!pip install accelerate -U

time: 698 µs (started: 2024-04-30 18:35:29 +00:00)


# Load dataset

The selected PubMed subset is stored on my Google Drive. It can be reproduced by running the BART fine-tuning notebook, or downloaded from my shared folder: https://drive.google.com/drive/folders/1sNoJxaShjifrt_AqyG5_sZYGxHknqfOM?usp=sharing

In [None]:
import torch
import matplotlib.pyplot as plt
from transformers import pipeline
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

time: 8.52 s (started: 2024-04-30 18:36:31 +00:00)


In [None]:
# Read the train / test / validation CSV exports from the mounted Drive
token_df, token_df_test, token_df_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/pubmed/token_df.csv',
        '/content/drive/My Drive/pubmed/token_df_test.csv',
        '/content/drive/My Drive/pubmed/token_df_val.csv',
    )
)

# Same three splits for the "token1024" exports (turned into datasets in a later cell)
token1024_df, token1024_df_test, token1024_df_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/pubmed/token1024_df.csv',
        '/content/drive/My Drive/pubmed/token1024_df_test.csv',
        '/content/drive/My Drive/pubmed/token1024_df_val.csv',
    )
)

# Wrap each pandas frame in a Hugging Face Dataset
dataset_train, dataset_test, dataset_val = (
    Dataset.from_pandas(frame)
    for frame in (token_df, token_df_test, token_df_val)
)

# Bundle the splits under the conventional split names
dataset_dict = DatasetDict(
    {
        'train': dataset_train,
        'test': dataset_test,
        'validation': dataset_val,
    }
)

# Alias used by the rest of the notebook; shown as the cell output
dataset_med = dataset_dict
dataset_med

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'article_len', 'abstract_len', 'article', 'abstract', 'bottom_article'],
        num_rows: 10700
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'article_len', 'abstract_len', 'article', 'abstract', 'bottom_article'],
        num_rows: 1125
    })
    validation: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'article', 'abstract', 'bottom_article'],
        num_rows: 1107
    })
})

time: 10.5 s (started: 2024-04-30 18:36:39 +00:00)


In [None]:
# Wrap the "token1024" pandas frames in Hugging Face Datasets
dataset1024_train, dataset1024_test, dataset1024_val = (
    Dataset.from_pandas(frame)
    for frame in (token1024_df, token1024_df_test, token1024_df_val)
)

# Bundle the splits under the conventional split names
dataset1024_dict = DatasetDict(
    {
        'train': dataset1024_train,
        'test': dataset1024_test,
        'validation': dataset1024_val,
    }
)

# Alias for the bundled splits; shown as the cell output
dataset1024_med = dataset1024_dict
dataset1024_med

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'article', 'abstract'],
        num_rows: 642
    })
    test: Dataset({
        features: ['Unnamed: 0', 'article', 'abstract'],
        num_rows: 74
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'article', 'abstract'],
        num_rows: 56
    })
})

time: 33.3 ms (started: 2024-04-30 18:36:50 +00:00)


# Preparing dataset

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

time: 48.7 ms (started: 2024-04-30 18:37:32 +00:00)


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint chosen in the configuration cell
model_ckpt = huggingface_model

# Matching tokenizer for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the seq2seq model, then move it to the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model = model.to(device)

time: 955 ms (started: 2024-04-30 18:37:32 +00:00)


# Fine-Tuning pretrained model
(first with 4096 tokens)

In [None]:
# tokenize the dataset

def convert_examples_to_features(example_batch):
    """Tokenize a batch of article/abstract pairs into seq2seq training features.

    Uses the module-level ``tokenizer``, ``article_max_len`` and
    ``abstract_max_len``.

    Args:
        example_batch: Mapping with an "article" and an "abstract" entry, each a
            list of strings (one batch, as passed by ``map(..., batched=True)``).

    Returns:
        dict with three lists of per-example token lists:
        "input_ids" and "attention_mask" for the articles (padded/truncated to
        ``article_max_len``) and "labels" for the abstracts (padded/truncated to
        ``abstract_max_len``). Padding positions in "labels" are set to -100 so
        that they are ignored by the training loss.
    """
    input_encodings = tokenizer(
        example_batch["article"],
        max_length=article_max_len,
        padding="max_length",
        truncation=True,
    )

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(
        text_target=example_batch["abstract"],
        max_length=abstract_max_len,
        padding="max_length",
        truncation=True,
    )

    # The abstracts are padded to a fixed length here, so the padding must be
    # masked explicitly: -100 marks label positions the loss should skip.
    # The attention mask (1 = real token, 0 = padding) identifies them.
    labels = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(
            target_encodings["input_ids"], target_encodings["attention_mask"]
        )
    ]

    # NOTE(review): for LED checkpoints the Hugging Face documentation suggests
    # also supplying a global_attention_mask; not added here -- confirm whether
    # it is wanted for the long-form models.
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels,
    }

time: 1.32 ms (started: 2024-04-30 18:38:45 +00:00)


In [None]:
# Tokenize every split, one batch of examples per call
dataset_med_pt = dataset_med.map(
    function=convert_examples_to_features,
    batched=True,
)

Map:   0%|          | 0/10700 [00:00<?, ? examples/s]



Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1107 [00:00<?, ? examples/s]

time: 36.9 s (started: 2024-04-30 18:38:45 +00:00)


In [None]:
# Columns returned as torch tensors when the tokenized datasets are indexed
columns = [
    "input_ids",
    "labels",
    "attention_mask",
]
dataset_med_pt.set_format("torch", columns=columns)

time: 2.15 ms (started: 2024-04-30 18:39:22 +00:00)


In [None]:
from transformers import DataCollatorForSeq2Seq
# Seq2seq batch collator built from the tokenizer and model loaded above
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

time: 931 µs (started: 2024-04-30 18:39:22 +00:00)


In [None]:
from transformers import TrainingArguments, Trainer
# debug: https://discuss.huggingface.co/t/trainingargument-does-not-work-on-colab/43372
# Hyper-parameters for the fine-tuning run; checkpoints and logs go to a local
# directory named after the model (folder_name).
training_args = TrainingArguments(
    output_dir=folder_name,
    num_train_epochs=1,
    warmup_steps=500,
    # One sequence per forward pass, gradients accumulated over 16 passes
    # before each optimizer step.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # Integer step count (float values are documented as ratios in [0, 1)).
    # Far above the 668 optimizer steps this run recorded, so no intermediate
    # checkpoint is written.
    save_steps=1_000_000,
    push_to_hub=True,
    report_to="wandb",
)

time: 32.1 ms (started: 2024-04-30 18:39:22 +00:00)


In [None]:
# Assemble the Trainer: train on the "train" split, evaluate on "validation"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_med_pt["train"],
    eval_dataset=dataset_med_pt["validation"],
    data_collator=seq2seq_data_collator,
    tokenizer=tokenizer,
)

time: 124 ms (started: 2024-04-30 18:39:22 +00:00)


In [None]:
# Run the fine-tuning loop; as the cell's last expression, the returned
# TrainOutput (global step, training loss, runtime metrics) is displayed.
trainer.train()

Step,Training Loss,Validation Loss
500,1.2197,1.106588


TrainOutput(global_step=668, training_loss=2.1758214499422177, metrics={'train_runtime': 5144.1821, 'train_samples_per_second': 2.08, 'train_steps_per_second': 0.13, 'total_flos': 2.885978927451341e+16, 'train_loss': 2.1758214499422177, 'epoch': 0.9988785046728972})

time: 1h 25min 44s (started: 2024-04-30 18:39:22 +00:00)


# Store finetuned models

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub with a descriptive commit message
commit_msg = f"Training done for {folder_name}"
trainer.push_to_hub(commit_message=commit_msg)

time: 25.7 s (started: 2024-04-30 20:05:07 +00:00)
