<a href="https://colab.research.google.com/github/GeulHae/Analysis/blob/dev_dataAnalysis/TRL_%2B_GPT2%EA%B0%80_%ED%8F%AC%ED%95%A8%EB%90%9C_RLHF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TRL과 GPT-2를 이용한 RLHF

In [None]:
!pip install git+https://github.com/lvwerra/trl

In [None]:
import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

# PPO hyperparameters, kept in a plain dict so that other cells can look values
# up by key (for example config["model_name"]).
# Legacy keys that stay disabled, with the values they used to carry:
#   txt_in_min_len=2, txt_in_max_len=8, txt_out_min_len=4, txt_out_max_len=16,
#   lr=1.41e-5
config = dict(
    model_name="gpt2",
    steps=20000,
    batch_size=256,
    forward_batch_size=16,
    ppo_epochs=4,
    init_kl_coef=0.2,
    target=6,
    horizon=10000,
    gamma=1,
    lam=0.95,
    cliprange=0.2,
    cliprange_value=0.2,
    vf_coef=0.1,
)

# Turn the dict into the configuration object handed to the PPO trainer.
ppo_config = PPOConfig(**config)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the policy model by the checkpoint name stored in the config, then
# build the reference model from it via create_reference_model.
model = AutoModelForCausalLMWithValueHead.from_pretrained(config["model_name"])
model_ref = create_reference_model(model)

# Move both networks to the selected device.
for network in (model, model_ref):
    network.to(device)

# Tokenizer for the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])

In [None]:
# Reuse the EOS token as the padding token so the `padding=True` call below has
# a pad token available (presumably the loaded tokenizer defines none — verify).
tokenizer.pad_token = tokenizer.eos_token

# encode a query
query_txt = ["What's your job?"]
query_tensor = tokenizer(query_txt, return_tensors="pt", padding=True)['input_ids']
query_tensor = query_tensor.to(device)

# Repeat the single encoded query once per batch element. The count is read
# from config["batch_size"] instead of a literal 256 so the list stays in sync
# with the batch size given to PPOConfig.
query_list = [query_tensor[0]] * config["batch_size"]

# get model response (sampled from the reference model)
response_tensor = respond_to_batch(model_ref, query_tensor)

In [None]:
# dummy response: a fixed, hand-written answer is tokenized and used as the
# response tensor.
response_txt = ["I'm the mailman"]
response_tensor = tokenizer(response_txt, return_tensors="pt", padding=True)['input_ids']
response_tensor = response_tensor.to(device)

# Repeat the single encoded response once per batch element. The count is read
# from config["batch_size"] instead of a literal 256 so the list stays in sync
# with the batch size given to PPOConfig.
response_list = [response_tensor[0]] * config["batch_size"]

In [None]:
# create a ppo trainer
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)
# From here on, use the device selected by the trainer's accelerator.
device = ppo_trainer.accelerator.device

# define a reward for response
# (this could be any reward such as human feedback or output from another model)
# The reward is a float scalar tensor (10.0, not the integer 10), since TRL
# documents scores as float tensors. One reward is created per batch element,
# with the count read from config["batch_size"] rather than a literal 256.
reward_list = [torch.tensor(10.0)] * config["batch_size"]

# train model for one step with ppo
train_stats = ppo_trainer.step(query_list, response_list, reward_list)

In [None]:
# Encode a test prompt and move the encoded batch to the current device.
# NOTE(review): this prompt is worded differently from the query used for the
# PPO step ("What's your job?") — confirm that this is intentional.
test_input = tokenizer('What is your job?', return_tensors='pt').to(device).input_ids

# Sampling a response needs no gradients; running under no_grad avoids building
# an autograd graph through the trainable policy model during generation.
with torch.no_grad():
    response_tensor = respond_to_batch(model, test_input)

# Decode the sampled token ids back to text; as the last expression of the
# cell, the result is displayed.
tokenizer.batch_decode(response_tensor)