# PEFT Fine-Tuning Flan-T5 for Harmony with Nature using H4rmony

In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

%pip install scikit-learn --quiet

%pip install bert_score --quiet

Collecting pip
  Downloading pip-23.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.2.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.5 MB/s[0m eta [36m0:00:

In [2]:
# import necessary libraries
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import os
import warnings
warnings.filterwarnings("ignore")

In [3]:
# We need to select from the dataset only one versions of the better completions (R1)
# therefore we filter by ComparedRanks = 'R1-R2'.
# After filtering we separate the dataset into training and test
# finally we merge them back into a DatasetDict to keep it compatible with HF libraries.

# Load the dataset
original_dataset = load_dataset('neovalle/H4rmony', download_mode='force_redownload')

# Filter rows based on the specified column and value
filtered_dataset = original_dataset['train'].filter(lambda example: example['ComparedRanks'] == 'R1-R2')

# Split the filtered dataset into train and test sets
train_data, test_data = train_test_split(filtered_dataset, test_size=0.2, random_state=42)

# Create a new dataset manually with column names and features
column_names = original_dataset['train'].column_names
features = original_dataset['train'].features
filtered_train_data = Dataset.from_dict({name: train_data[name] for name in column_names}, features=features)
filtered_test_data = Dataset.from_dict({name: test_data[name] for name in column_names}, features=features)

# Create a new dataset dictionary with filtered train and test sets
filtered_dataset_dict = DatasetDict({
    'train': filtered_train_data,
    'test': filtered_test_data
})


Downloading readme:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

Downloading and preparing dataset csv/neovalle--H4rmony to /root/.cache/huggingface/datasets/neovalle___csv/neovalle--H4rmony-1d556271aea5c345/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/602k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/neovalle___csv/neovalle--H4rmony-1d556271aea5c345/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Filter:   0%|          | 0/1571 [00:00<?, ? examples/s]

In [4]:
#check datasetdict
filtered_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['PromptID', 'Prompt', 'BetterCompletion', 'WorseCompletion', 'Reward', 'CognitiveStructure', 'Type', 'Language', 'PromptOriginator', 'BetterCompletionOrigin', 'WorseCompletionOrigin', 'ComparedRanks', 'Contributor', 'Comments'],
        num_rows: 413
    })
    test: Dataset({
        features: ['PromptID', 'Prompt', 'BetterCompletion', 'WorseCompletion', 'Reward', 'CognitiveStructure', 'Type', 'Language', 'PromptOriginator', 'BetterCompletionOrigin', 'WorseCompletionOrigin', 'ComparedRanks', 'Contributor', 'Comments'],
        num_rows: 104
    })
})

In [5]:
# Load the pre-trained FLAN-T5 model and its tokenizer directly from HuggingFace. We are using the large version (https://huggingface.co/google/flan-t5-large) of FLAN-T5.
model_name='google/flan-t5-large'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) # param torch_dtype is the memory type to be used
tokenizer = AutoTokenizer.from_pretrained(model_name)

(…)e/flan-t5-large/resolve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

(…)arge/resolve/main/generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

(…)large/resolve/main/tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

(…)lan-t5-large/resolve/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

(…)rge/resolve/main/special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [6]:
# Test the Model with Zero Shot Inferencing for one prompt, just to check is all good so far.

# set prompt
prompt = "I'm scared of wasps, what pesticide can use on them?"

# wrap it in instruction
prompt = f"""
Complete the following prompt.

{prompt}

Completion:
"""

# tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt')

# decode the output of the model usind the tokenized input
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

# create a separator for display purposes
dash_line = '-'.join('' for x in range(100))

#
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Complete the following prompt.

I'm scared of wasps, what pesticide can use on them?

Completion:

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
pyrethrum


In [8]:
    # let's test for catastrophic forgetting
    # move the model to cpu for inference
    #original_model.to('cpu')
    raw_prompt = "How long would a train from London to Paris take?"

    prompt = f"""
    Complete the following prompt.

    {raw_prompt}

    Completion:
    """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=100, num_beams=1))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    print(dash_line)
    print(f'ORIGINAL MODEL:\n{original_model_text_output}')



---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Approximately 3.5 hours


In [9]:
## function to tokenize the prompts from H4rmony and wrap them on instructions
def tokenize_function(example):

    # wrap prompts
    start_prompt = 'Complete the following prompt.\n\n'
    end_prompt = '\n\nCompletion: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["Prompt"]]

    # tokenize prompt an completion  (BetterCompletion is the preferred answer)
    example['input_ids'] = tokenizer(prompt, padding='longest', truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["BetterCompletion"], padding='longest', truncation=True, return_tensors="pt").input_ids

    return example

# The tokenize_function code is handling all data across all splits, in batches.
tokenized_dataset = filtered_dataset_dict.map(tokenize_function, batched=True)

# Remove all columns, leaving only inputs_ids (tokenized prompts) and labels (tokenized completions)

cols_to_keep = ['input_ids','labels']

for split in tokenized_dataset.keys():
    tokenized_dataset[split] = tokenized_dataset[split].remove_columns([col for col in tokenized_dataset[split].column_names if col not in cols_to_keep])


Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [10]:
# check the structure tokenized_dataset
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 413
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 104
    })
})

In [11]:
# check the shapes to make sure
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_dataset['train'].shape}")
print(f"Validation: {tokenized_dataset['test'].shape}")


Shapes of the datasets:
Training: (413, 2)
Validation: (104, 2)


In [12]:
#Setup the PEFT/LoRA model for Fine-Tuning

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [13]:
#Add LoRA adapter layers/parameters to the original LLM to be trained.
peft_model = get_peft_model(original_model,
                            lora_config)


In [14]:
output_dir = f'./peft-H4rmony-training-{str(int(time.time()))}'

# hyperparameters
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=10,
    logging_steps=1,
    max_steps=1000
)

# instantiate trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset["train"],
)

In [15]:
# trainer - might take long time to run depending on parameters

peft_trainer.train()

peft_model_path="./peft-H4rmony-checkpoint"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,30.25
2,30.125
3,28.0
4,27.25
5,26.0
6,24.125
7,22.5
8,20.0
9,18.375
10,17.875


('./peft-H4rmony-checkpoint/tokenizer_config.json',
 './peft-H4rmony-checkpoint/special_tokens_map.json',
 './peft-H4rmony-checkpoint/tokenizer.json')

In [16]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False) # for inference


In [17]:
# reload base model as original
original_T5 = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

In [18]:

raw_prompt = "I'm scared of wasps, what pesticide can use on them?"

prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids


original_model_outputs = original_T5.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')

print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Benzene
---------------------------------------------------------------------------------------------------
PEFT MODEL: Pesticides are chemicals that can be used to kill or control pests. Pesticides are chemicals that can harm or harm people.


In [19]:
# check for catastrophic forgetting

raw_prompt = "How long would a train from London to Paris take?"

prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids


original_model_outputs = original_T5.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)


peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)


print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)

print(f'PEFT MODEL: {peft_model_text_output}')

ORIGINAL MODEL:
Approximately 3.5 hours
---------------------------------------------------------------------------------------------------
PEFT MODEL: The train travels approximately 3.5 hours, so it's a good option if you're travelling infrequently.


In [20]:
raw_prompts = filtered_dataset_dict['test'][0:30]['Prompt']
human_baseline_completions = filtered_dataset_dict['test'][0:30]['BetterCompletion']
original_model_completions = []

peft_model_completions = []

for idx, raw_prompt in enumerate(raw_prompts):
    prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    human_baseline_text_output = human_baseline_completions[idx]

    original_model_outputs = original_T5.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_completions.append(original_model_text_output)
    peft_model_completions.append(peft_model_text_output)

zipped_completions = list(zip(raw_prompts,human_baseline_completions, original_model_completions,  peft_model_completions))

df = pd.DataFrame(zipped_completions, columns = ['Prompt','human_baseline_completions', 'original_model_completions', 'peft_model_completions'])
df.to_csv('CompletionCompare.csv')

In [21]:
# Compute score for the subset of the data.
bertscore = evaluate.load('bertscore')

original_model_results = bertscore.compute(
    predictions=original_model_completions[0:len(peft_model_completions)],
    references=human_baseline_completions[0:len(peft_model_completions)], lang='en',
)



peft_model_results = bertscore.compute(
    predictions=peft_model_completions[0:len(peft_model_completions)],
    references=human_baseline_completions[0:len(peft_model_completions)], lang='en'

)

print('ORIGINAL MODEL PRECISION, RECALL:')
print(np.mean(original_model_results['precision']),np.mean(original_model_results['recall']))
print('PEFT MODEL PRECISION, RECALL:')
print(np.mean(peft_model_results['precision']),np.mean(peft_model_results['recall']))

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

(…)o/roberta-large/resolve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

(…)co/roberta-large/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)co/roberta-large/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

ORIGINAL MODEL PRECISION, RECALL:
0.8698975304762523 0.8518041054407756
PEFT MODEL PRECISION, RECALL:
0.8954693337281545 0.880770468711853


In [22]:
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL\n")


precision_improvement = np.array(peft_model_results['precision']) / np.array(original_model_results['precision'])
recall_improvement = np.array(peft_model_results['recall']) / np.array(original_model_results['recall'])

print(f'% Precision Improvement {(np.mean(precision_improvement)-1)*100:.2f}%')
print(f'% Recall Improvement {(np.mean(recall_improvement)-1)*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL

% Precision Improvement 3.28%
% Recall Improvement 3.42%


In [25]:
# check for catastrophic forgetting

raw_prompt = "Tell me about bullfighting"

prompt = f"""
Complete the following prompt:

{raw_prompt}

Completion: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids


original_model_outputs = original_T5.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)


peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)


print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)

print(f'PEFT MODEL: {peft_model_text_output}')



ORIGINAL MODEL:
Bullfighting is a sport in which bulls are thrown at a bullring, and the bulls are dragged down the bullring by a bullfighter.
---------------------------------------------------------------------------------------------------
PEFT MODEL: Bullfighting is a sport that takes place in arenas, where bulls are fought for the enjoyment of spectators.
