## Setup

In [1]:
import gc
import torch

torch.cuda.empty_cache()
gc.collect()

0

In [None]:
# einops: phi & qwen
# sentencepiece: llama
# tiktoken & transformers_stream_generator: qwen
!pip install transformers trl wandb einops sentencepiece tiktoken transformers_stream_generator==0.0.4

In [None]:
# when using quantized models

!pip install auto-gptq optimum

In [None]:
# Only on first run.

import os
from accelerate.utils import write_basic_config

write_basic_config()  # Write a config file
os._exit(00)  # Restart the notebook

In [1]:
import torch
import os

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'

torch.distributed.init_process_group(backend='nccl', world_size=1, rank=0)

### Configuration

In [2]:
from trl import PPOConfig

config = PPOConfig(
            model_name="meta-llama/Llama-2-7b-chat-hf",
            reward_model="text-classification:Hello-SimpleAI/chatgpt-detector-roberta",
            learning_rate=1.41e-5,
            log_with="wandb", 
            mini_batch_size=8, # really important for memory
            batch_size=128, 
            gradient_accumulation_steps=1,
            early_stopping=False,
            target_kl=6.0,
            kl_penalty="kl",
            seed=0,
            use_score_scaling=False,
            use_score_norm=False,
            score_clip=None,
        )

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16, "max_length": 512, "truncation": True}

  from .autonotebook import tqdm as notebook_tqdm


## Load data and models

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("LlamaTokenizerFast")

  from .autonotebook import tqdm as notebook_tqdm


OSError: LlamaTokenizerFast is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [6]:
from transformers import LlamaTokenizer, AutoTokenizer

# tokenizer = LlamaTokenizer.from_pretrained(config.model_name, trust_remote_code=True, padding_side='left')
tokenizer = AutoTokenizer.from_pretrained(config.model_name,
                                          padding_side='left',
                                          trust_remote_code=True) 
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token

In [7]:
from dataset import build_dataset
# from dataset import build_dataset_for_gemma

dataset = build_dataset(tokenizer)
# dataset = build_dataset_for_gemma(tokenizer)
    
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map: 100%|█████████████████████████████████████████████████████████████████| 3857/3857 [00:02<00:00, 1587.90 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████| 3857/3857 [00:00<00:00, 6033.86 examples/s]


In [10]:
import torch
inputs = dataset[0]['input_ids'].to('cuda')
test1 = model.generate(inputs, **generation_kwargs)  
test2 = model(inputs)

In [11]:
tokenizer.batch_decode(test1, skip_special_tokens=True)

["[INST] <<SYS>>\nYou are an assistant who gives detailed and long answers\n<</SYS>>\n\nexplain Tuckman's stages of group development [/INST]  Ah, an excellent topic! Tuckman's stages of group development is a theory that explains the different stages a group goes through as they form and mature. The theory was first proposed by Bruce Tuckman in 1965 and has since been widely accepted and used in various fields. Here are the five stages of group development, along with some key characteristics of each stage:\n\n1. **Forming**: This is the initial stage of group development, where members come together to form a new group. During this stage, the group is characterized by a high level of dependency on the leader, as members rely on the leader to provide direction and guidance. The group may also experience a sense of uncertainty and insecurity, as they try to figure out their roles and responsibilities within the group.\n\nKey characteristics:\n\n* High dependency on the leader\n* Uncert

### Load pre-trained models

In [8]:
import torch
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead
from transformers import GenerationConfig
from accelerate import Accelerator

current_device = Accelerator().local_process_index

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto",) 
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto",) 

# model.generation_config = GenerationConfig.from_pretrained(config.model_name, 
#                                                            pad_token_id=tokenizer.pad_token_id,
#                                                            top_k=0.0, 
#                                                            top_p=1.0,
#                                                            do_sample=True,
#                                                            max_new_tokens=512,
#                                                            eos_token_id=-1
#                                                           )
# ref_model.generation_config = GenerationConfig.from_pretrained(config.model_name, 
#                                                                pad_token_id=tokenizer.pad_token_id,
#                                                                top_k=0.0, 
#                                                                top_p=1.0,
#                                                                do_sample=True,
#                                                                max_new_tokens=512,
#                                                                eos_token_id=-1
#                                                               )
# model.to('cuda')

config.json: 100%|█████████████████████████████████████████████████████████████████████| 614/614 [00:00<00:00, 4.45MB/s]
model.safetensors.index.json: 100%|█████████████████████████████████████████████████| 26.8k/26.8k [00:00<00:00, 161MB/s]
Downloading shards:   0%|                                                                         | 0/2 [00:00<?, ?it/s]
model-00001-of-00002.safetensors:   0%|                                                     | 0.00/9.98G [00:00<?, ?B/s][A
model-00001-of-00002.safetensors:   0%|                                             | 21.0M/9.98G [00:00<01:05, 152MB/s][A
model-00001-of-00002.safetensors:   0%|▏                                            | 41.9M/9.98G [00:00<01:01, 162MB/s][A
model-00001-of-00002.safetensors:   1%|▎                                            | 62.9M/9.98G [00:00<01:00, 165MB/s][A
model-00001-of-00002.safetensors:   1%|▍                                            | 83.9M/9.98G [00:00<00:59, 167MB/s][A
model-00001-of-00

### Initialize PPOTrainer
The `PPOTrainer` takes care of device placement and optimization later on:

In [12]:
from trl import PPOTrainer
from transformers import default_data_collator

ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /mnt/ceph/storage/data-tmp/current/hj80pahi/.netrc


ValueError: You can't train a model that has been loaded with `device_map='auto'` in any distributed mode. Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`.

### Load classifier

In [None]:
import torch
from transformers import pipeline
from trl.import_utils import is_xpu_available

device = ppo_trainer.accelerator.device
task, model_name = config.reward_model.split(":")

device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

task, model_name = config.reward_model.split(":")
classifier_pipe = pipeline(task, model=model_name, device=device)

In [None]:
# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
if classifier_pipe.tokenizer.pad_token_id is None:
    classifier_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id

if classifier_pipe.model.config.pad_token_id is None:
    classifier_pipe.model.config.pad_token_id = tokenizer.pad_token_id

The model outputs are the logits for the negative and positive class. We will use the logits for positive class as a reward signal for the language model.

In [None]:
text = "this movie was really bad!!"
test = classifier_pipe(text, **sent_kwargs)

In [None]:
test[0][0]['score']

In [None]:
text = "this movie was really good!!"
classifier_pipe(text, **sent_kwargs)

### Generation settings
For the response generation we just use sampling and make sure top-k and nucleus sampling are turned off as well as a minimal length.

In [9]:
generation_kwargs = {
    # "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": 100_000,
    "max_new_tokens": 512,
}

In [None]:
batch = next(iter(ppo_trainer.dataloader))
# query_tensors = batch["input_ids"]
query_tensors = torch.LongTensor([q.tolist() for q in batch['input_ids']]).to('cuda')

In [None]:
response_tensors = model.generate(
    query_tensors,
    return_dict_in_generate=False,
    generation_config=model.generation_config
)
ref_response_tensors = ref_model.generate(
    query_tensors,
    return_dict_in_generate=False,
    generation_config=ref_model.generation_config
)

In [None]:
from dataset import get_repsonse_from_qwen_batch

batch["response"] = get_repsonse_from_qwen_batch(tokenizer, response_tensors, query_tensors, batch['query'])
batch["ref_response"] = get_repsonse_from_qwen_batch(tokenizer, response_tensors, query_tensors, batch['query'])

In [None]:
# Get response from model
response_tensors, ref_response_tensors = ppo_trainer.generate(
    query_tensors, return_prompt=False, generate_ref_response=True, **model.generation_config.to_dict()
)
batch["response"] = tokenizer.batch_decode(response_tensors)
batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

In [None]:
# Compute sentiment score
# texts = [q + r for q, r in zip(batch["query"], batch["response"])]
# pipe_outputs = classifier_pipe(texts, **sent_kwargs)
pipe_outputs = classifier_pipe(batch['response'], **sent_kwargs)
rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
# ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
# ref_pipe_outputs = classifier_pipe(ref_texts, **sent_kwargs)
ref_pipe_outputs = classifier_pipe(batch['ref_response'], **sent_kwargs)
ref_rewards = [torch.tensor(output[0]["score"]) for output in ref_pipe_outputs]
batch["ref_rewards"] = ref_rewards

In [None]:
total = 0
for r in rewards:
    total += r
print(total/len(rewards))

In [None]:
batch["response"]

In [None]:
response_tensors_list = [rt for rt in response_tensors]

In [None]:
# Run PPO step
stats = ppo_trainer.step(batch["input_ids"], response_tensors_list, rewards) # use batch["input_ids"] instead of query_tensors for weights upate because you need a list here
ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])

## Optimize model

### Training loop

The training loop consists of the following main steps:
1. Get the query responses from the policy network (GPT-2)
2. Get sentiments for query/responses from BERT
3. Optimize policy with PPO using the (query, response, reward) triplet

**Training time**

This step takes **~2h** on a V100 GPU with the above specified settings.

In [None]:
import torch
from tqdm import tqdm
tqdm.pandas()

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from gpt2
    response_tensors, ref_response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
    )
    batch["response"] = tokenizer.batch_decode(response_tensors)
    batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

    # Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = classifier_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
    ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
    ref_pipe_outputs = classifier_pipe(ref_texts, **sent_kwargs)
    ref_rewards = [torch.tensor(output[0]["score"]) for output in ref_pipe_outputs]
    batch["ref_rewards"] = ref_rewards

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])

### Training progress
If you are tracking the training progress with Weights&Biases you should see a plot similar to the one below. Check out the interactive sample report on wandb.ai: [link](https://app.wandb.ai/huggingface/trl-showcase/runs/1jtvxb1m/).

<div style="text-align: center">
<img src='https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/gpt2_tuning_progress.png' width='800'>
<p style="text-align: center;"> <b>Figure:</b> Reward mean and distribution evolution during training. </p>
</div>

One can observe how the model starts to generate more positive outputs after a few optimisation steps.

> Note: Investigating the KL-divergence will probably show that at this point the model has not converged to the target KL-divergence, yet. To get there would require longer training or starting with a higher initial coefficient.

## Model inspection
Let's inspect some examples from the IMDB dataset. We can use `model_ref` to compare the tuned model `model` against the model before optimisation.

In [None]:
import pandas as pd

#### get a batch from the dataset
bs = 16
game_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    output = ref_model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(output)
    output = model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results

Looking at the reward mean/median of the generated sequences we observe a significant difference.

In [None]:
print("mean:")
display(df_results[["rewards (before)", "rewards (after)"]].mean())
print()
print("median:")
display(df_results[["rewards (before)", "rewards (after)"]].median())

## Save model
Finally, we save the model and push it to the Hugging Face for later usage.

In [None]:
model.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=True)
tokenizer.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=True)