# Fine-Tune Flan-T5 for Harmony with Nature using H4rmony

<a name='1'></a>
## 1 - Set up Kernel, Load Required Dependencies, Dataset and LLM

In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

%pip install scikit-learn

%pip install bert_score

Collecting pip
  Downloading pip-23.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.2.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.9 MB/s[0m eta [36m0:00:0

In [2]:
# Import necessary libraries
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import os
import warnings
warnings.filterwarnings("ignore")

### 1.1 - Load and Split Dataset, then load the LLM

The  [H4rmony](https://huggingface.co/datasets/neovalle/H4rmony) dataset ontains ~ 1500 prompts and completions. We need only unique prompts, which are about 500.

In [3]:
# We need to select from the dataset only one versions of the better completions (R1)
# therefore we filter by ComparedRanks = 'R1-R2'.
# After filtering we separate the dataset into training and test
# finally we merge them back into a DatasetDict to keep it compatible with HF libraries.

# Load the dataset
original_dataset = load_dataset('neovalle/H4rmony', download_mode='force_redownload')

# Filter rows based on the specified column and value
filtered_dataset = original_dataset['train'].filter(lambda example: example['ComparedRanks'] == 'R1-R2')

# Split the filtered dataset into train and test sets
train_data, test_data = train_test_split(filtered_dataset, test_size=0.2, random_state=42)

# Create a new dataset manually with column names and features
column_names = original_dataset['train'].column_names
features = original_dataset['train'].features
filtered_train_data = Dataset.from_dict({name: train_data[name] for name in column_names}, features=features)
filtered_test_data = Dataset.from_dict({name: test_data[name] for name in column_names}, features=features)

# Create a new dataset dictionary with filtered train and test sets
filtered_dataset_dict = DatasetDict({
    'train': filtered_train_data,
    'test': filtered_test_data
})


Downloading readme:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

Downloading and preparing dataset csv/neovalle--H4rmony to /root/.cache/huggingface/datasets/neovalle___csv/neovalle--H4rmony-1d556271aea5c345/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/602k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/neovalle___csv/neovalle--H4rmony-1d556271aea5c345/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Filter:   0%|          | 0/1571 [00:00<?, ? examples/s]

In [4]:
#check datasetdict
filtered_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['PromptID', 'Prompt', 'BetterCompletion', 'WorseCompletion', 'Reward', 'CognitiveStructure', 'Type', 'Language', 'PromptOriginator', 'BetterCompletionOrigin', 'WorseCompletionOrigin', 'ComparedRanks', 'Contributor', 'Comments'],
        num_rows: 413
    })
    test: Dataset({
        features: ['PromptID', 'Prompt', 'BetterCompletion', 'WorseCompletion', 'Reward', 'CognitiveStructure', 'Type', 'Language', 'PromptOriginator', 'BetterCompletionOrigin', 'WorseCompletionOrigin', 'ComparedRanks', 'Contributor', 'Comments'],
        num_rows: 104
    })
})

In [5]:
# Load the pre-trained FLAN-T5 model and its tokenizer directly from HuggingFace. We are using the large version (https://huggingface.co/google/flan-t5-large) of FLAN-T5.
model_name='google/flan-t5-large'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) # param torch_dtype is the memory type to be used
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [6]:
# This function will show the number of trainable parameters in the mode
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 783150080
all model parameters: 783150080
percentage of trainable model parameters: 100.00%


In [7]:
# Test the Model with Zero Shot Inferencing for one prompt, just to check is all good so far.

# set prompt
prompt = "I'm scared of wasps, what pesticide can use on them"

# wrap it in instruction
prompt = f"""
Complete the following prompt.

{prompt}

Completion:
"""

# tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt')

# decode the output of the model usind the tokenized input
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

# create a separator for display purposes
dash_line = '-'.join('' for x in range(100))

#
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Complete the following prompt.

I'm scared of wasps, what pesticide can use on them

Completion:

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
pyrethrum


## 2 - Perform Full Fine-Tuning

In [8]:
# function to tokenize the prompts wrapped on instructions
def tokenize_function(example):

    # wrap prompts
    start_prompt = 'Complete the following prompt.\n\n'
    end_prompt = '\n\nCompletion: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["Prompt"]]

    # tokenize prompt an completion  (BetterCompletion is the preferred answer)
    example['input_ids'] = tokenizer(prompt, padding='longest', truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["BetterCompletion"], padding='longest', truncation=True, return_tensors="pt").input_ids

    return example

# The tokenize_function code is handling all data across all splits, in batches.
tokenized_dataset = filtered_dataset_dict.map(tokenize_function, batched=True)

# Remove all columns, leaving only inputs_ids (tokenized prompts) and labels (tokenized completions)

cols_to_keep = ['input_ids','labels']

for split in tokenized_dataset.keys():
    tokenized_dataset[split] = tokenized_dataset[split].remove_columns([col for col in tokenized_dataset[split].column_names if col not in cols_to_keep])


Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [9]:
# check the structure tokenized_dataset
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 413
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 104
    })
})

In [10]:
# check the shapes to make sure
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_dataset['train'].shape}")
print(f"Validation: {tokenized_dataset['test'].shape}")


Shapes of the datasets:
Training: (413, 2)
Validation: (104, 2)


In [11]:
# The model is ready to fine-tune with the H4rmony Dataset, using Hugging Face Trainer class.
# We'll use Hugging Face Trainer class, passing the preprocessed dataset, with reference, to the original model


# save the original model to later compare
original_model_dir = "./original_model"
original_model.save_pretrained(original_model_dir)
tokenizer.save_pretrained(original_model_dir)

# set the output directory for the fine-tuned model, config, etc
output_dir = f'./H4rmony_Training-{str(int(time.time()))}'

# hyperparameters to adjust as required, current values only for illustration. The current values work relatively well and fast with the free Colab T4 GPU.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=100
)

# instantiate the trainer class
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

In [12]:
# this can take long time, depending on the parameters, as it is the actual training
trainer.train()

Step,Training Loss
1,30.25
2,21.5
3,36.0
4,25.5
5,24.375
6,20.25
7,15.1875
8,9.625
9,5.8438
10,5.125


TrainOutput(global_step=100, training_loss=2.74173828125, metrics={'train_runtime': 140.5594, 'train_samples_per_second': 5.692, 'train_steps_per_second': 0.711, 'total_flos': 226024811679744.0, 'train_loss': 2.74173828125, 'epoch': 1.92})

In [13]:
# Save the trained model and tokenizer to the output directory
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('./H4rmony_Training-1696793922/tokenizer_config.json',
 './H4rmony_Training-1696793922/special_tokens_map.json',
 './H4rmony_Training-1696793922/tokenizer.json')

In [14]:
# load the trained model and the original model to make sure we can compare them
trained_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
original_model = AutoModelForSeq2SeqLM.from_pretrained(original_model_dir, torch_dtype=torch.bfloat16)

### 2.1 - Evaluate the Model Qualitatively

In [15]:
# Define the file path for the output file
output_file_path = os.path.join(output_dir, "TestResults.txt")

# move the model to cpu for inference
original_model.to('cpu')

# Open the file for writing
with open(output_file_path, "w") as output_file:
    # Loop through the test dataset
    #for index in range(tokenized_dataset['test'].shape[0]):
    for index in range(5):
        raw_prompt = filtered_dataset_dict['test'][index]['Prompt']
        human_baseline_completion = filtered_dataset_dict['test'][index]['BetterCompletion']
        prompt = f"""
        Complete the following prompt.

        {raw_prompt}

        Completion:
        """

        input_ids = tokenizer(prompt, return_tensors="pt").input_ids

        original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
        original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

        trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
        trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)

        # Write the results to the output file
        output_file.write("BASELINE Prompt:\n" + raw_prompt + "\n")
        output_file.write("BASELINE Completion:\n" + human_baseline_completion + "\n")
        output_file.write("ORIGINAL MODEL:\n" + original_model_text_output + "\n")
        output_file.write("TRAINED MODEL:\n" + trained_model_text_output + "\n")
        output_file.write("\n" + "-"*80 + "\n")

# Print a message indicating where the file was saved
print(f"Results saved to: {output_file_path}")

Results saved to: ./H4rmony_Training-1696793922/TestResults.txt


In [16]:
    # move the model to cpu for inference
    original_model.to('cpu')

    # re-try the orinal test prompt
    raw_prompt = "I'm scared of wasps, what pesticide can use on them"

    prompt = f"""
    Complete the following prompt.

    {raw_prompt}

    Completion:
    """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
    trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)


    print(dash_line)
    print(f'BASELINE Prompt:\n{raw_prompt}')
    print(dash_line)
    print(f'ORIGINAL MODEL:\n{original_model_text_output}')
    print(dash_line)
    print(f'TRAINED MODEL:\n{trained_model_text_output}')



---------------------------------------------------------------------------------------------------
BASELINE Prompt:
I'm scared of wasps, what pesticide can use on them
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
pyrethrum
---------------------------------------------------------------------------------------------------
TRAINED MODEL:
Pesticides are a threat to the environment, causing harm to the environment and harming the health of the animals.


In [17]:
    # let's test for catastrophic forgetting
    # move the model to cpu for inference
    #original_model.to('cpu')
    raw_prompt = "How long would a train from London to Paris take?"

    prompt = f"""
    Complete the following prompt.

    {raw_prompt}

    Completion:
    """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
    trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)


    print(dash_line)
    print(f'BASELINE Prompt:\n{raw_prompt}')
    print(dash_line)
    print(f'ORIGINAL MODEL:\n{original_model_text_output}')
    print(dash_line)
    print(f'TRAINED MODEL:\n{trained_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE Prompt:
How long would a train from London to Paris take?
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Approximately 3.5 hours
---------------------------------------------------------------------------------------------------
TRAINED MODEL:
A train from London to Paris would take about a day, but a train from London to Paris would take about a week.


### 2.2 - Evaluate the Model Quantitatively (with Bert Score Metric)

In [18]:
# we'll use bert_score but we can try many others
bertscore = evaluate.load('bertscore')

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [20]:
# try 20 test prompts for the evaluation
raw_prompts = filtered_dataset_dict['test'][0:20]['Prompt']
human_baseline_completions = filtered_dataset_dict['test'][0:20]['BetterCompletion']

original_model_completions = []
trained_model_completions = []

for _, raw_prompt in enumerate(raw_prompts):
    prompt = f"""
Complete the following prompt.

{raw_prompt}

Completion: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_completions.append(original_model_text_output)

    trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)
    trained_model_completions.append(trained_model_text_output)

zipped_completions = list(zip(human_baseline_completions, original_model_completions, trained_model_completions))

# save them for qualitative control
output_test_path = os.path.join(output_dir, "TestResultsScored.txt")
df = pd.DataFrame(zipped_completions, columns = ['human_baseline_completions', 'original_model_completions', 'trained_model_comnpletions'])
df.to_csv(output_test_path)

In [21]:
original_model_results = bertscore.compute(
    predictions=original_model_completions,
    references=human_baseline_completions[0:len(original_model_completions)], lang='en'
)

trained_model_results = bertscore.compute(
    predictions=trained_model_completions,
    references=human_baseline_completions[0:len(trained_model_completions)], lang='en'

)

print('ORIGINAL MODEL PRECISION, RECALL:')
print(np.mean(original_model_results['precision']),np.mean(original_model_results['recall']))
print('TRAINED MODEL PRECISION, RECALL:')
print(np.mean(trained_model_results['precision']),np.mean(trained_model_results['recall']))

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

ORIGINAL MODEL PRECISION, RECALL:
0.8707379758358001 0.8543050557374954
TRAINED MODEL PRECISION, RECALL:
0.8878391712903977 0.8804276943206787


In [22]:
## Bert Score improvement

print("Absolute percentage improvement of TRAINED MODEL over ORIGINAL MODEL\n")

precision_improvement = np.array(trained_model_results['precision']) / np.array(original_model_results['precision'])
recall_improvement = np.array(trained_model_results['recall']) / np.array(original_model_results['recall'])

print(f'% Precision Improvement {(np.mean(precision_improvement)-1)*100:.2f}%')
print(f'% Recall Improvement {(np.mean(recall_improvement)-1)*100:.2f}%')

Absolute percentage improvement of TRAINED MODEL over ORIGINAL MODEL

% Precision Improvement 2.24%
% Recall Improvement 3.08%


## 3 - Perform Parameter Efficient Fine-Tuning (PEFT) **** TO DO ****************

In [23]:
#Setup the PEFT/LoRA model for Fine-Tuning#

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [24]:
#Add LoRA adapter layers/parameters to the original LLM to be trained.
peft_model = get_peft_model(original_model,
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 9437184
all model parameters: 792587264
percentage of trainable model parameters: 1.19%


### 3.1 - Train PEFT Adapter

In [25]:
output_dir = f'./peft-H4rmony-training-{str(int(time.time()))}'

# hyperparameters
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-2, # Higher learning rate than full fine-tuning.
    num_train_epochs=5,
    logging_steps=1,
    max_steps=100
)

# instantiate trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset["train"],
)

In [26]:
# trainer - might take long time to run depending on parameters

peft_trainer.train()

peft_model_path="./peft-H4rmony-completions"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,30.25
2,25.625
3,17.75
4,19.875
5,16.125
6,8.0
7,9.4375
8,11.3125
9,8.5625
10,9.625


('./peft-H4rmony-completions/tokenizer_config.json',
 './peft-H4rmony-completions/special_tokens_map.json',
 './peft-H4rmony-completions/tokenizer.json')

In [27]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False) # for inference

In [28]:
# The number of trainable parameters will be 0 due to is_trainable=False setting:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 792587264
percentage of trainable model parameters: 0.00%


### 3.2 - Evaluate the Model Qualitatively (Human Evaluation)

In [29]:

raw_prompt = "I'm scared of wasps, what pesticide can use on them?"

prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model.to("cpu")

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)


print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'TRAINED MODEL:\n{trained_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
’ and andally andancyally throw,
---------------------------------------------------------------------------------------------------
TRAINED MODEL:
Pesticides are a powerful tool for reducing the spread of diseases and pests.
---------------------------------------------------------------------------------------------------
PEFT MODEL: 


In [30]:
# check for catastrophic forgetting
raw_prompt = filtered_dataset_dict['test'][index]['Prompt']
human_baseline_completion = filtered_dataset_dict['test'][index]['BetterCompletion']
raw_prompt = "How long would a train from London to Paris take?"

prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model.to("cpu")
original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN COMPLETION:\n{human_baseline_completion}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'TRAINED MODEL:\n{trained_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN COMPLETION:
Multiple international airports in a city should adopt green practices, reducing their carbon footprint and promoting sustainable travel.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
,s s,s and
---------------------------------------------------------------------------------------------------
TRAINED MODEL:
A train from London to Paris would take about a day, but a train from London to Paris would take about a week.
---------------------------------------------------------------------------------------------------
PEFT MODEL: 


### 3.3 - Evaluate the Model Quantitatively (with Metric)
Perform inferences for the sample of the test dataset (only 10 prompt and completions to save time).

In [None]:
raw_prompts = filtered_dataset_dict['test'][0:10]['Prompt']
human_baseline_completions = filtered_dataset_dict['test'][0:10]['BetterCompletion']

original_model_comnpletions = []
instruct_model_completions = []
peft_model_completions = []

for idx, raw_prompt in enumerate(raw_prompts):
    prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    human_baseline_text_output = human_baseline_completions[idx]

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_completions.append(original_model_text_output)
    trained_model_completions.append(trained_model_text_output)
    peft_model_completions.append(peft_model_text_output)

zipped_completions = list(zip(human_baseline_completions, original_model_completions, trained_model_completions, peft_model_completions))

df = pd.DataFrame(zipped_completions, columns = ['human_baseline_completions', 'original_model_completions', 'trained_model_completions', 'peft_model_completions'])
df

In [None]:
# Compute score for the subset of the data.
bertscore = evaluate.load('bertscore')

original_model_results = bertscore.compute(
    predictions=original_model_completions[0:len(peft_model_completions)],
    references=human_baseline_completions[0:len(peft_model_completions)], lang='en',
)

trained_model_results = bertscore.compute(
    predictions=trained_model_completions[0:len(peft_model_completions)],
    references=human_baseline_completions[0:len(peft_model_completions)], lang='en'

)

peft_model_results = bertscore.compute(
    predictions=peft_model_completions[0:len(peft_model_completions)],
    references=human_baseline_completions[0:len(peft_model_completions)], lang='en'

)

print('ORIGINAL MODEL PRECISION, RECALL:')
print(np.mean(original_model_results['precision']),np.mean(original_model_results['recall']))
print('TRAINED MODEL PRECISION, RECALL:')
print(np.mean(trained_model_results['precision']),np.mean(trained_model_results['recall']))
print('PEFT MODEL PRECISION, RECALL:')
print(np.mean(peft_model_results['precision']),np.mean(peft_model_results['recall']))

In [None]:
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL\n")


precision_improvement = np.array(peft_model_results['precision']) / np.array(original_model_results['precision'])
recall_improvement = np.array(peft_model_results['recall']) / np.array(original_model_results['recall'])

print(f'% Precision Improvement {np.mean(precision_improvement)-1:.2f}%')
print(f'% Recall Improvement {np.mean(recall_improvement)-1:.2f}%')

In [None]:
print("Absolute percentage improvement of PEFT MODEL overt TRAINED MODEL")

precision_improvement = np.array(peft_model_results['precision']) / np.array(trained_model_results['precision'])
recall_improvement = np.array(peft_model_results['recall']) / np.array(trained_model_results['recall'])

print(f'% Precision Improvement {np.mean(precision_improvement)-1:.2f}%')
print(f'% Recall Improvement {np.mean(recall_improvement)-1:.2f}%')