#Instruction Fine-Tuning Large Language Models for Summarization

In [2]:
!pip install torch torchdata # --quiet
!pip install transformers datasets evaluate rouge_score loralib peft

Collecting torchdata
  Downloading torchdata-0.10.1-py3-none-any.whl.metadata (6.3 kB)
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Down

In [4]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp310-cp310-linux_x86_64.whl.metadata (27 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.8.87 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux1_x86_64.whl (13.1 MB)
[2

#import the main packages

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

##1.2. Load Dataset and LLM

In [2]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
dataset

README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


##1.3. Test the Model with Zero Shot Inferencing

In [5]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

#2. Perform Full Fine-Tuning

##2.1. Preprocess the Dialog-Summary Dataset

In [6]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [7]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


##2.2. Fine-Tune the Model with the Preprocessed Dataset

In [8]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [9]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,49.0


TrainOutput(global_step=1, training_loss=49.0, metrics={'train_runtime': 43.8582, 'train_samples_per_second': 0.182, 'train_steps_per_second': 0.023, 'total_flos': 5478058819584.0, 'train_loss': 49.0, 'epoch': 0.0625})

In [10]:
# Save the trained model
trained_model_dir = "./trained_model"
trainer.save_model(trained_model_dir)

# Load the trained model
trained_model = AutoModelForSeq2SeqLM.from_pretrained(trained_model_dir)

2.3. Evaluate the Model Qualitatively (Human Evaluation)

In [11]:
# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Ensure that input_ids and the models are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = input_ids.to(device)
original_model.to(device)
trained_model.to(device)

# Generate outputs using the original model before training
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)
original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# Generate outputs using the trained model
trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=generation_config)
trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)


human_baseline_summary = summary
dash_line = '-' * 50  # Assuming dash_line is a line separator
print(dash_line)
print(f'BASELINE HUMAN SUMMARY: \n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL: \n{original_model_text_output}')
print(dash_line)
print(f'TRAINED MODEL: \n{trained_model_text_output}')

--------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
--------------------------------------------------
ORIGINAL MODEL: 
##Person1: Have you considered upgrading your system? ##Person2: Yes, but you might want to add a painting program to your software. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and banners. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and banners. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and banners. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and
--------------------------------------------------
TRAINED MODEL: 
#Person1#: I'm thinking of upgrading my computer.


2.4. Evaluate the Model Quantitatively (with ROUGE Metric)

In [12]:
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [13]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']
original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
  prompt = f"""
  summarize the following conversation
  {dialogue}
  Summary:

  """
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids
  # Ensure that input_ids and the models are on the same device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  input_ids = input_ids.to(device)

  original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
  original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
  original_model_summaries.append(original_model_text_output)

  instruct_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
  instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
  instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))
df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,All employees are required to report to the of...,Employees are required to use instant messaging.
1,In order to prevent employees from wasting tim...,#Person1#: I need to take a dictation.,Employees are required to use instant messaging.
2,Ms. Dawson takes a dictation for #Person1# abo...,Employees are required to use instant messaging.,Employees are required to use instant messaging.
3,#Person2# arrives late because of traffic jam....,People are complaining about the congestion in...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,#Person1#: I'm still stuck in traffic. I'm not...,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,#Person1: I got stuck in traffic again. #Perso...,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,#Porn2: What are they doing?,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,#Person1#: #Person2#: #Person3#: #Person4: #Pe...,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,#Person1: Your birthday is coming. #Person2: I...,Brian's birthday is coming up.


In [15]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [16]:
from rouge import Rouge
rouge = Rouge()


original_model_results = rouge.get_scores(
    original_model_summaries,
    human_baseline_summaries[0:len(original_model_summaries)],
    )

instruct_model_results = rouge.get_scores(
    instruct_model_summaries,
    human_baseline_summaries[0:len(instruct_model_summaries)],
    )

print('Original Model:')
print(original_model_results)

print('Instruct Model:')
print(instruct_model_results)

Original Model:
[{'rouge-1': {'r': 0.08, 'p': 0.2, 'f': 0.11428571020408178}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.08, 'p': 0.2, 'f': 0.11428571020408178}}, {'rouge-1': {'r': 0.06451612903225806, 'p': 0.2857142857142857, 'f': 0.10526315488919677}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.06451612903225806, 'p': 0.2857142857142857, 'f': 0.10526315488919677}}, {'rouge-1': {'r': 0.041666666666666664, 'p': 0.14285714285714285, 'f': 0.0645161255359003}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.041666666666666664, 'p': 0.14285714285714285, 'f': 0.0645161255359003}}, {'rouge-1': {'r': 0.05263157894736842, 'p': 0.125, 'f': 0.07407406990397829}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.05263157894736842, 'p': 0.125, 'f': 0.07407406990397829}}, {'rouge-1': {'r': 0.0625, 'p': 0.08333333333333333, 'f': 0.0714285665306126}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0625, 'p': 0.08333333333333

#3. Perform Parameter Efficient Fine-Tuning (PEFT)

##3.1. Setup the PEFT/LoRA model for Fine-Tuning

In [17]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)


In [18]:
peft_model = get_peft_model(original_model,
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


##3.2. Train PEFT Adapter

In [19]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
    )

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    )

In [20]:
peft_trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,49.75


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/spiece.model',
 './peft-dialogue-summary-checkpoint-local/added_tokens.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [22]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

##3.3. Evaluate the Model Qualitatively (Human Evaluation)

In [23]:
index = 200
dialogue = dataset['test'][index]['dialogue']
base_line_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Ensure that input_ids and the models are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = input_ids.to(device)
original_model.to(device)
trained_model.to(device)
peft_model.to(device)

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
print(original_model_text_output)

instruct_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY: \n{base_line_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL: \n{original_model_text_output}')
print(dash_line)
print(f'TRAINED MODEL: \n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: \n{peft_model_text_output}')

#Person1: You're considering upgrading your system.
--------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
--------------------------------------------------
ORIGINAL MODEL: 
#Person1: You're considering upgrading your system.
--------------------------------------------------
TRAINED MODEL: 
#Person1#: I'm thinking of upgrading my computer.
--------------------------------------------------
PEFT MODEL: 
#Person1#: I'm thinking of upgrading my computer.


##3.4. Evaluate the Model Quantitatively (with ROUGE Metric)

In [24]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']
original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
  prompt = f"""
  summarize the following conversation
  {dialogue}
  Summary:

  """
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids

  # Ensure that input_ids and the models are on the same device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  input_ids = input_ids.to(device)

  human_baseline_text_output = human_baseline_summaries[idx]

  original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
  original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
  original_model_summaries.append(original_model_text_output)

  instruct_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
  instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
  instruct_model_summaries.append(instruct_model_text_output)

  peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
  peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
  peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,Memo to all employees.,Employees who use Instant Messages will be pla...
1,In order to prevent employees from wasting tim...,Employees are being asked to take a dictation ...,#Person1#: I need you to take a dictation for me.
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1#: I need you to take a dictation for ...,Employees should be given an intra-office memo.
3,#Person2# arrives late because of traffic jam....,...,#Person1: I'm finally here. #Person2: I'm just...
4,#Person2# decides to follow #Person1#'s sugges...,People are complaining about the traffic in th...,The driver of the car isn't sure if it's a goo...
5,#Person2# complains to #Person1# about the tra...,#Person1: I'm here! #Person2: I'm here! #Perso...,#Pretty bad traffic james. #Person1#: I'm not ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorce.
7,#Person1# tells Kate that Masha and Hero are g...,#Person1: Masha and Hero are divorced. #Person...,#Person1: #Person2: What do you mean? #Person1...
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,Brian's birthday is today.,"Brian, thank you for a great party."


In [25]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.2092895988931674, 'rouge2': 0.07296060616308389, 'rougeL': 0.17692409156611866, 'rougeLsum': 0.1797396022146629}
INSTRUCT MODEL:
{'rouge1': 0.22330533696695526, 'rouge2': 0.06834104656355296, 'rougeL': 0.18610786455047496, 'rougeLsum': 0.186560620357268}
PEFT MODEL:
{'rouge1': 0.30672486010310906, 'rouge2': 0.11452117888715811, 'rougeL': 0.2590989701386418, 'rougeLsum': 0.26179696695086574}


In [26]:
print("Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE
rouge1: 9.74%
rouge2: 4.16%
rougeL: 8.22%
rougeLsum: 8.21%
