## Setup

In [2]:
!pip install transformers trl wandb einops

[0m

In [None]:
# Only on first run.

import os
from accelerate.utils import write_basic_config

write_basic_config()  # Write a config file
os._exit(00)  # Restart the notebook

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import torch
import os

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'

torch.distributed.init_process_group(backend='nccl', world_size=1, rank=0)

### Configuration

In [2]:
from trl import PPOConfig

config = PPOConfig(
            model_name="susnato/phi-1_5_dev",
            reward_model="text-classification:Hello-SimpleAI/chatgpt-detector-roberta",
            learning_rate=1.41e-5,
            log_with="wandb", 
            mini_batch_size=8, # really important for memory
            batch_size=128,
            gradient_accumulation_steps=1,
            early_stopping=False,
            target_kl=6.0,
            kl_penalty="kl",
            seed=0,
            use_score_scaling=False,
            use_score_norm=False,
            score_clip=None,
        )

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16, "max_length": 512, "truncation": True}

  from .autonotebook import tqdm as notebook_tqdm


## Load data and models

In [85]:
from transformers import AutoTokenizer
from datasets import load_dataset

def build_dataset(config, dataset_name="LDJnr/Pure-Dove", max_length=300):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset(dataset_name, split="train")
    querys = [ds_item.get('conversation')[0].get('input') for ds_item in ds]
    ds = ds.add_column('query', querys)

    def prepare_dataset(ds_item):
        ds_item['query'] += '. Give detailed and long answers.'
        ds_item["query_ids"] = tokenizer.encode(ds_item["query"], padding='max_length', max_length=max_length)
        return ds_item
    
    ds = ds.map(prepare_dataset, batched=False)
    ds = ds.filter(lambda x: len(x["query_ids"]) <= max_length, batched=False)
    ds = ds.remove_columns(['source', 'conversation'])
    ds.set_format(type="torch")
    
    return ds

In [86]:
dataset = build_dataset(config)

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [98]:
dict((key, [d[key] for d in dataset]) for key in dataset[0])

{'query': ["explain Tuckman's stages of group development. Give detailed and long answers.",
  'Write an angry rant as Cave Johnson about all the best guys being straight. Give detailed and long answers.',
  "Assume we have a band-limited signal and an ideal sampling function (Dirac Comb) with frequency twice that of the band-limited signal. We know our signal is band-limited because we filtered it with an ideal brick-wall filter with a pass-band equal to the band-width of the signal; thus the signal is strictly band-limited. Using LaTeX to beautify your mathematical expressions, what is the spectrum of the sampled signal from a purely mathematical point of view? I'm not interested in practical or real scenarios.. Give detailed and long answers.",
  'How is the success of a chief underwriter measured within an insurance company? What quantitative metrics might they be trying to optimize for?. Give detailed and long answers.',
  'I want you to combine your knowledge about performing a r

In [70]:
from transformers import default_data_collator

default_data_collator(dataset)

{'query_ids': tensor([[20676,   391,   309,  ..., 50256, 50256, 50256],
         [16594,   281,  7954,  ..., 50256, 50256, 50256],
         [ 8021,  2454,   356,  ..., 50256, 50256, 50256],
         ...,
         [ 2061,   561,  1645,  ..., 50256, 50256, 50256],
         [16594,   257,  1621,  ..., 50256, 50256, 50256],
         [ 7266, 22766,   287,  ..., 50256, 50256, 50256]])}

### Load pre-trained models

In [6]:
import torch
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
inputs = tokenizer('''
    Question: Write an English essay with at least 200 words.
    Answer:
''', return_tensors="pt", return_attention_mask=False)

In [8]:
# test = model.generate(**inputs, **generation_kwargs)
test = model(**inputs)

KeyboardInterrupt: 

In [None]:
tokenizer.batch_decode(test)

### Initialize PPOTrainer
The `PPOTrainer` takes care of device placement and optimization later on:

In [87]:
from trl import PPOTrainer
from transformers import default_data_collator

ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator) # data_collator=collator

### Load BERT classifier
We load a BERT classifier fine-tuned on the IMDB dataset.

In [10]:
import torch
from transformers import AutoTokenizer, pipeline
from trl.import_utils import is_xpu_available

device = ppo_trainer.accelerator.device
task, model_name = config.reward_model.split(":")

device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

task, model_name = config.reward_model.split(":")
classifier_pipe = pipeline(task, model=model_name, device=device)

In [11]:
# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
if classifier_pipe.tokenizer.pad_token_id is None:
    classifier_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id

if classifier_pipe.model.config.pad_token_id is None:
    classifier_pipe.model.config.pad_token_id = tokenizer.pad_token_id

The model outputs are the logits for the negative and positive class. We will use the logits for positive class as a reward signal for the language model.

In [None]:
text = "this movie was really bad!!"
test = classifier_pipe(text, **sent_kwargs)

In [None]:
test

In [None]:
text = "this movie was really good!!"
classifier_pipe(text, **sent_kwargs)

### Generation settings
For the response generation we just use sampling and make sure top-k and nucleus sampling are turned off as well as a minimal length.

In [12]:
generation_kwargs = {
    "min_length": 128,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 256,
}

In [88]:
batch = next(iter(ppo_trainer.dataloader))
query_tensors = batch["query_ids"]

KeyError: 'query_ids'

In [100]:
batch.keys()

dict_keys(['query'])

In [None]:
# Get response from gpt2
response_tensors, ref_response_tensors = ppo_trainer.generate(
    query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
)
batch["response"] = tokenizer.batch_decode(response_tensors)
batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

In [None]:
# Compute sentiment score
texts = [q + r for q, r in zip(batch["query"], batch["response"])]
pipe_outputs = classifier_pipe(texts, **sent_kwargs)
rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
ref_pipe_outputs = classifier_pipe(ref_texts, **sent_kwargs)
ref_rewards = [torch.tensor(output[0]["score"]) for output in ref_pipe_outputs]
batch["ref_rewards"] = ref_rewards

In [None]:
# Run PPO step
stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])

## Optimize model

### Training loop

The training loop consists of the following main steps:
1. Get the query responses from the policy network (GPT-2)
2. Get sentiments for query/responses from BERT
3. Optimize policy with PPO using the (query, response, reward) triplet

**Training time**

This step takes **~2h** on a V100 GPU with the above specified settings.

In [None]:
import torch
from tqdm import tqdm
tqdm.pandas()

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from gpt2
    response_tensors, ref_response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
    )
    batch["response"] = tokenizer.batch_decode(response_tensors)
    batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

    # Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = classifier_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
    ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
    ref_pipe_outputs = classifier_pipe(ref_texts, **sent_kwargs)
    ref_rewards = [torch.tensor(output[0]["score"]) for output in ref_pipe_outputs]
    batch["ref_rewards"] = ref_rewards

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])

### Training progress
If you are tracking the training progress with Weights&Biases you should see a plot similar to the one below. Check out the interactive sample report on wandb.ai: [link](https://app.wandb.ai/huggingface/trl-showcase/runs/1jtvxb1m/).

<div style="text-align: center">
<img src='https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/gpt2_tuning_progress.png' width='800'>
<p style="text-align: center;"> <b>Figure:</b> Reward mean and distribution evolution during training. </p>
</div>

One can observe how the model starts to generate more positive outputs after a few optimisation steps.

> Note: Investigating the KL-divergence will probably show that at this point the model has not converged to the target KL-divergence, yet. To get there would require longer training or starting with a higher initial coefficient.

## Model inspection
Let's inspect some examples from the IMDB dataset. We can use `model_ref` to compare the tuned model `model` against the model before optimisation.

In [None]:
import pandas as pd

#### get a batch from the dataset
bs = 16
game_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    output = ref_model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(output)
    output = model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results

Looking at the reward mean/median of the generated sequences we observe a significant difference.

In [None]:
print("mean:")
display(df_results[["rewards (before)", "rewards (after)"]].mean())
print()
print("median:")
display(df_results[["rewards (before)", "rewards (after)"]].median())

## Save model
Finally, we save the model and push it to the Hugging Face for later usage.

In [None]:
model.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=True)
tokenizer.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=True)