# Fine-Tuning an LLM for Summarization Task

In [None]:
!pip install transformers datasets evaluate transformers[torch]

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting accelerate>=0.21.0 (from transformers)
  Downloa

# 1. Loading the pre-trained model and tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

# 2. Loading the Dataset

In [None]:
!pip install py7zr

Collecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.15.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.3/412.3 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyp

In [None]:
# Loading the dataset
from datasets import load_dataset

dataset = load_dataset("samsum")
dataset

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

# 3. Trying out the model without Fine-Tuning it

In [None]:
sample = dataset["test"][0]["dialogue"]
label = dataset["test"][0]["summary"]

def generate_summary(input, llm):
  input_prompt = f"""
                  Summarize the following conversation.

                  {input}

                  Summary:
                  """
  input_ids = tokenizer(sample, return_tensors = "pt")
  tokenized_output = llm.generate(input_ids["input_ids"], min_length = 30, max_length = 200)
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens = True)

  return output

output = generate_summary(sample, model)
print("Sample")
print(sample)
print("--------------------")
print("Model Generated Summary:")
print(output)
print("Correct Summary:")
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
--------------------
Model Generated Summary:
Hannah: Hey, do you have Betty's number? Amanda: Lemme check. Hannah: Ask Larry. Amanda: He called her last time we were at the park together.
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


- We can see the base model is not very good at summarizing our text

### Preparing the dataset

In [None]:
def tokenize_inputs(example):
  start_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary: "
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
  example["input_ids"] = tokenizer(prompt, padding = "max_length", truncation = True, return_tensors = "pt").input_ids
  example["labels"] = tokenizer(example["summary"], padding = "max_length", truncation = True, return_tensors = "pt").input_ids

  return example

tokenizer.pad_token = tokenizer.eos_token # This model requires us to set the token
tokenized_datasets = dataset.map(tokenize_inputs, batched = True)
tokenized_datasets = tokenized_datasets.remove_columns(["id", "dialogue", "summary"]) # We are removing the old columns and only keeping the 2 new ones
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices = True) # Filtering our dataset and keeping only every one hundreth example (just a way to shorten the dataset and not using all the data for speed)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets["train"].shape) # Shape of the train dataset
print(tokenized_datasets["validation"].shape) # Shape of the validation dataset
print(tokenized_datasets["test"].shape) # Shape of the test dataset

(148, 2)
(9, 2)
(9, 2)


In [None]:
tokenized_datasets["train"][0].keys()

dict_keys(['input_ids', 'labels'])

- As we can see we now only have the 2 newly added columns

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 4. Fine-Tuning the model

In [None]:
from transformers import Trainer, TrainingArguments

# Customising our training settings
training_args = TrainingArguments(output_dir = "./bart-cnn-samsum-finetuned", # Local directory
                                  hub_model_id = "Kaspemart/bart-cnn-samsum-finetuned", # Identifier on the hub
                                  learning_rate = 1e-5,
                                  num_train_epochs = 1,
                                  weight_decay = 0.01,
                                  auto_find_batch_size = True,
                                  evaluation_strategy = "epoch",
                                  logging_steps = 10)

# Specifying the model, tokenizer, training arguments, training and validation sets
trainer = Trainer(model = model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = tokenized_datasets["train"],
                  eval_dataset = tokenized_datasets["validation"])

In [None]:
# Executing the training loop
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1207,0.134328


TrainOutput(global_step=74, training_loss=0.21491804316237167, metrics={'train_runtime': 158.3861, 'train_samples_per_second': 0.934, 'train_steps_per_second': 0.467, 'total_flos': 320731481112576.0, 'train_loss': 0.21491804316237167, 'epoch': 1.0})

In [None]:
# Pushing our model to the Hugging Face hub
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

events.out.tfevents.1709498650.5167fcca9ce6.360.0:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Kaspemart/bart-cnn-samsum-finetuned/commit/5f0ee0256c3f131d3b08a2ea50c8ec5444b87f43', commit_message='End of training', commit_description='', oid='5f0ee0256c3f131d3b08a2ea50c8ec5444b87f43', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Locally saving the model
trainer.save_model("bart-samsum-model")

# 5. Loading and re-testing our model

In [None]:
# Loading my model from the Hugging Face hub repository
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("Kaspemart/bart-cnn-samsum-finetuned")

output = generate_summary(sample, llm = loaded_model)

print("Sample")
print(sample)
print("------------------------")
print("Summary:")
print(output)
print("Ground Truth Summary:")
print(label)

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
------------------------
Summary:
Hannah asks Amanda for Betty's number. Amanda can't find it, so she asks Larry to call her. Hannah and Larry don't know each other very well.
Ground Truth Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


- We can see now it performed a little bit better but the summary is still partly wrong

# 6. Improving the model further - Parameter Efficient Fine-Tuning (PEFT)

Traditional fine-tuning of pre-trained language models (PLMs) requires updating all of the model's parameters, which is computationally expensive and requires massive amounts of data.

Parameter-Efficient Fine-Tuning (PEFT) works by only updating a small subset of the model's most influential parameters, making it much more efficient.

In [None]:
! pip install peft

Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.9.0


### Loading my model and tokenizer again

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Kaspemart/bart-cnn-samsum-finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("Kaspemart/bart-cnn-samsum-finetuned")

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/278 [00:00<?, ?B/s]

- **peft** is a library that was created by Hugging Face
- **Lora** is a parameter-efficient fine-tuning (PEFT) technique

In [None]:
# Creating the PEFT model using Lora
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r = 32, # Rank of the matrices that are used in Lora
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.SEQ_2_SEQ_LM # We are telling Lora that this is a sequence to sequence model
)

In [None]:
# Creating the PEFT model
peft_model = get_peft_model(model, peft_config = lora_config)

In [None]:
# Connecting to Hugging Face hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import Trainer, TrainingArguments

# Customising our training settings
peft_training_args = TrainingArguments(output_dir = "./bart-cnn-samsum-peft", # Local directory
                                      hub_model_id = "Kaspemart/bart-cnn-samsum-peft", # Identifier on the hub
                                      learning_rate = 1e-5,
                                      num_train_epochs = 1,
                                      weight_decay = 0.01,
                                      auto_find_batch_size = True,
                                      evaluation_strategy = "epoch",
                                      logging_steps = 10)

# Specifying the model, training arguments, training and validation sets
peft_trainer = Trainer(model = peft_model,
                       args = peft_training_args,
                       train_dataset = tokenized_datasets["train"],
                       eval_dataset = tokenized_datasets["validation"])

In [None]:
# Running the fine-tuning
peft_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1056,0.134742


TrainOutput(global_step=148, training_loss=0.09231781798440057, metrics={'train_runtime': 125.5469, 'train_samples_per_second': 1.179, 'train_steps_per_second': 1.179, 'total_flos': 329414344704000.0, 'train_loss': 0.09231781798440057, 'epoch': 1.0})

### Saving PEFT Adapter

- When you are pushing the PEFT model, you are not pushing the full model, instead you are pushing the PEFT Adapter

- The PEFT Adapter is like an adapter that you plug on top of your model that has been trained and fine-tuned to your data

In [None]:
peft_trainer.push_to_hub()

RuntimeError: 
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'base_model.model.model.encoder.embed_tokens.weight', 'base_model.model.model.shared.weight', 'base_model.model.model.decoder.embed_tokens.weight', 'base_model.model.lm_head.weight'}].
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            

### Reloading the model and testing

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# We firstly have to pull back our base model we created (the fine-tuned model)
tokenizer = AutoTokenizer.from_pretrained("Kaspemart/bart-cnn-samsum-finetuned")
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("Kaspemart/bart-cnn-samsum-finetuned")

# Then we have to attach the PEFT Adapter to the base model
loaded_peft_model = PeftModel.from_pretrained(peft_model_base, "Kaspemart/bart-cnn-samsum-peft", is_trainable = False)

### Trying the PEFT model out

In [None]:
sample = dataset["test"][0]["dialogue"]
label = dataset["test"][0]["summary"]

output = generate_summary(sample, llm = loaded_peft_model)

print("Sample")
print(sample)
print("------------------------")
print("Summary:")
print(output)
print("Ground Truth Summary:")
print(label)

- As we can see there are still some mistakes but it performs a little bit better than before