# Reinforcement learning to detoxify fine tuned instruct google/flan-T5 peft model to align with human values.

This is continuation of previous notebook, due to lack of GPU resources task(text summarization) is splitted across two notebooks.

Objective is to build reward and PPO model to detoxifying fine tuned peft model.

Used Meta AI roberta hate speech model to evaluate toxicity, based on toxicity value PPO model will update RL policy.

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning
from trl import PPOConfig, PPOTrainer, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate
import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
model_name='google/flan-t5-base'
huggingface_dataset_name="knkarthick/dialogsum"
dataset_original=load_dataset(huggingface_dataset_name)

Downloading readme:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def print_number_of_trainable_model_parameters(model):
    " Function returns total model parameters and trainable parameters."
    
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [5]:
# data preprocessing: 

device="cuda:0" if torch.cuda.is_available() else "cpu"

def build_dataset(model_name, dataset_name, input_min_text_length, input_max_text_length):
    "Preprocess the dataset and split it into train and test parts"
    
    dataset = load_dataset(dataset_name, split="train")
    
    # Filter the dialogues of length between input_min_text_length and input_max_text_length characters.
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

    # Prepare tokenizer. Setting device_map="auto" allows to switch between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map=device)
    
    def tokenize(sample):
        
        # Wrap each dialogue with the instruction.
        prompt = f"""
                 Summarize the following conversation.

                 {sample["dialogue"]}

                 Summary:
                 """
        
        sample["input_ids"] = tokenizer.encode(prompt)
        
        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")
    
    # Split the dataset into train and test parts.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

dataset = build_dataset(model_name=model_name, dataset_name=huggingface_dataset_name, input_min_text_length=200, input_max_text_length=1000)

print(dataset)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


# PPO model:

In [6]:
# Create PPO model :

lora_config = LoraConfig(r=32, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type=TaskType.SEQ_2_SEQ_LM)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
peft_model = PeftModel.from_pretrained(model, '/kaggle/input/peft-model/peft_model', lora_config=lora_config, torch_dtype=torch.bfloat16, device_map=device, is_trainable=True)
peft_model.to(device)
print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model, torch_dtype=torch.bfloat16, device_map=device, is_trainable=True)                                                             
print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

# Creating reference PPO model to measure toxicity before and after detoxification.
ref_model = create_reference_model(ppo_model)
ref_model.to(device)
print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

PEFT model parameters to be updated:

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%

PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 3539713
all model parameters: 251117569
percentage of trainable model parameters: 1.41%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)
Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 251117569
percentage of trainable model parameters: 0.00%



# Reward Model:

In [7]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map=device)
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map="auto")

toxicity_evaluator = evaluate.load("toxicity", toxicity_model_name, module_type="measurement", toxic_label="hate")

def evaluate_toxicity(model, toxicity_evaluator, tokenizer, dataset, num_samples):
    "Function evaluate toxicity of model completions"
    
    max_new_tokens=100

    toxicities = []
    input_texts = []
    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample["query"]

        if i > num_samples:
            break
            
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)
        
        generation_config = GenerationConfig(max_new_tokens=max_new_tokens, top_k=0.0, top_p=1.0, do_sample=True)

        response_token_ids = model.generate(input_ids=input_ids, generation_config=generation_config)
        
        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)
        
        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])

        toxicities.extend(toxicity_score["toxicity"])

    # Compute mean & std using np.
    mean = np.mean(toxicities)
    std = np.std(toxicities)
        
    return mean, std


tokenizer = AutoTokenizer.from_pretrained(model_name, device_map=device)

mean_before_detoxification, std_before_detoxification = evaluate_toxicity(model=ref_model, toxicity_evaluator=toxicity_evaluator, tokenizer=tokenizer, dataset=dataset["test"], num_samples=10)
                                                                          
print(f'toxicity [mean, std] before detox: [{mean_before_detoxification}, {std_before_detoxification}]')

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

11it [00:19,  1.78s/it]

toxicity [mean, std] before detox: [0.04130103446500884, 0.045098608991112565]





# Update RL policy using PPO:

In [8]:
# RL policy updation using PPO:

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(model_name=model_name, learning_rate=learning_rate, ppo_epochs=max_ppo_epochs, mini_batch_size=mini_batch_size, batch_size=batch_size)
ppo_trainer = PPOTrainer(config=config, model=ppo_model, ref_model=ref_model, tokenizer=tokenizer, dataset=dataset["train"], data_collator=collator)

output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

not_hate_index=0
sentiment_pipe = pipeline("sentiment-analysis",  model=toxicity_model_name)

generation_kwargs = {"min_length": 5, "top_k": 0.0, "top_p": 1.0, "do_sample": True}
  
reward_kwargs = {"top_k": None, "function_to_apply": "none", "batch_size": 16}

max_ppo_steps = 10

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break   
    
    prompt_tensors = batch["input_ids"]

    # Get response from FLAN-T5/PEFT LLM.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()        
        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])
        
    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]    
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    # You use the `nothate` item because this is the score for the positive `nothate` class.
    reward_tensors = [torch.tensor(reward[not_hate_index]["score"]) for reward in rewards]    

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)
    
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print('-'.join('' for x in range(100)))
    
    
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model, toxicity_evaluator=toxicity_evaluator, tokenizer=tokenizer, dataset=dataset["test"], num_samples=100)
                                              
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

0it [00:00, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
1it [00:29, 29.72s/it]

objective/kl: 29.292028427124023
ppo/returns/mean: -0.5290176868438721
ppo/policy/advantages_mean: 2.507438701115916e-09
---------------------------------------------------------------------------------------------------


2it [01:03, 32.15s/it]

objective/kl: 30.621055603027344
ppo/returns/mean: -0.5418417453765869
ppo/policy/advantages_mean: -9.766496944507708e-09
---------------------------------------------------------------------------------------------------


3it [01:40, 34.47s/it]

objective/kl: 32.923484802246094
ppo/returns/mean: -0.7347954511642456
ppo/policy/advantages_mean: 9.122857136389939e-09
---------------------------------------------------------------------------------------------------


4it [02:14, 34.32s/it]

objective/kl: 28.05332374572754
ppo/returns/mean: -0.37403279542922974
ppo/policy/advantages_mean: 9.219272012472857e-10
---------------------------------------------------------------------------------------------------


5it [02:48, 33.95s/it]

objective/kl: 31.806900024414062
ppo/returns/mean: -0.6588640213012695
ppo/policy/advantages_mean: 1.4863599240300118e-09
---------------------------------------------------------------------------------------------------


6it [03:19, 33.18s/it]

objective/kl: 24.307518005371094
ppo/returns/mean: -0.18032237887382507
ppo/policy/advantages_mean: -5.815087078531178e-09
---------------------------------------------------------------------------------------------------


7it [03:54, 33.59s/it]

objective/kl: 28.29378890991211
ppo/returns/mean: -0.5968725085258484
ppo/policy/advantages_mean: -5.079876963520746e-09
---------------------------------------------------------------------------------------------------


8it [04:27, 33.42s/it]

objective/kl: 28.12511444091797
ppo/returns/mean: -0.36000287532806396
ppo/policy/advantages_mean: 8.678624041635885e-09
---------------------------------------------------------------------------------------------------


9it [05:07, 35.66s/it]

objective/kl: 27.942882537841797
ppo/returns/mean: -0.5314969420433044
ppo/policy/advantages_mean: -7.167169968624876e-09
---------------------------------------------------------------------------------------------------


10it [05:40, 34.09s/it]


objective/kl: 23.903675079345703
ppo/returns/mean: -0.33554959297180176
ppo/policy/advantages_mean: 1.0602102307188943e-08
---------------------------------------------------------------------------------------------------


101it [02:24,  1.43s/it]

toxicity [mean, std] after detox: [0.023523962623837387, 0.03528463366608066]





# Conclusion:

Before detoxification mean and std was: [0.04130103446500884, 0.045098608991112565]

After detoxification mean and std is: [0.023523962623837387, 0.03528463366608066]