# RLHF Pipeline for Text Generation

Training pipeline for Reinforcement Learning with Human Feedback for text generation NLP models.

## Initializations

In [1]:
import os
import sys

In [2]:
# Run this only for colab
from google.colab import drive

drive.mount("/content/drive")
ROOT_PATH = "/content/drive/MyDrive/project-m3-chatmgl/src"
print(os.listdir(ROOT_PATH))

sys.path.append(ROOT_PATH)
os.chdir(ROOT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['Fine_tuning_BART.ipynb', 'gen_script_chatMGL.py', 'evaluation.ipynb', 'generative_model.py', 'Fine_tuning_GPT2.ipynb', 'dataset', '__pycache__', 'utils', 'wandb', 'checkpoints', 'gpt2_finetuned_large-1-epoch', 'gpt_2_large_rlhf_step_20', 'reward_model.py', 'RLHF_pipeline.ipynb']


In [3]:
%pip install transformers trl wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import wandb
#key c0ad2861497032d1da1bbcf5fc66f85e197027f2

wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mjohnbantzis[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
import torch
import torch.utils.data

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_constant_schedule_with_warmup
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import random

import numpy as np
import pandas as pd

import json
import gc

torch.backends.cudnn.deterministic = True

%load_ext autoreload
%autoreload 2

SEED = 42

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Hardware: GPU (cuda)")
else:
    print("Hardware: CPU")

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Working dir:",os.getcwd())

Hardware: GPU (cuda)
Working dir: /content/drive/MyDrive/project-m3-chatmgl/src


## Data Loading


In [6]:
from dataset.RLHF_dataset import RLHFDataset

TRAIN_DATA_PATH = "../dataset/RLHF/train.json"
TEST_DATA_PATH = "../dataset/RLHF/test.json"
VAL_DATA_PATH = "../dataset/RLHF/val.json"

MAX_SEQ_LEN = 1024
MODEL_NAME = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,padding_side = 'left')
tokenizer.pad_token = tokenizer.eos_token

train_dataset = RLHFDataset(TRAIN_DATA_PATH, tokenizer, MAX_SEQ_LEN)
test_dataset = RLHFDataset(TEST_DATA_PATH, tokenizer, MAX_SEQ_LEN)
val_dataset = RLHFDataset(VAL_DATA_PATH, tokenizer, MAX_SEQ_LEN)

print(f"Train samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Train samples: 5841
Test samples: 2808
Validation samples: 681


In [7]:
TRAIN_DATA_PATH = "../dataset/supervised_new/train.json"
TEST_DATA_PATH = "../dataset/supervised_new/test.json"
VAL_DATA_PATH = "../dataset/supervised_new/val.json"

MAX_SEQ_LEN = 1024
MODEL_NAME = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,padding_side = 'left')
tokenizer.pad_token = tokenizer.eos_token

train_dataset_rlhf = RLHFDataset(TRAIN_DATA_PATH, tokenizer, MAX_SEQ_LEN)
test__rlhf = RLHFDataset(TEST_DATA_PATH, tokenizer, MAX_SEQ_LEN)
val_dataset_rlhf = RLHFDataset(VAL_DATA_PATH, tokenizer, MAX_SEQ_LEN)

print(f"Train samples: {len(train_dataset_rlhf)}")
print(f"Test samples: {len(test__rlhf)}")
print(f"Validation samples: {len(val_dataset_rlhf)}")

Train samples: 4241
Test samples: 2609
Validation samples: 482


### Loading Reward model

In [8]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
%pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
from reward_model import *

# Load config
reward_config = ClassificationRewardModelConfig()

# This by default loads the GPT2 model. To change it, change the hardcoded path inside the class.
gpt2_reward_model = ClassificationRewardModel(reward_config,"/content/drive/MyDrive/project-m3-chatmgl/models/reward_model/distilbert/lr5e-05-warmup0.2").to(DEVICE)

In [10]:
gpt2_reward_model.get_reward('Human: In which brain region are the boundaries between brain area the clearest?In the frontal lobe, Near the hypothalamus, In the parietal lobe, Close to the early sensory areas, \n\n Assistant:  No idea man.\n\n')

tensor(-0.2321, device='cuda:0')

## Supervised Fine Tuning

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments,AutoModelForCausalLM, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


training_args = TrainingArguments(
        output_dir='./checkpoints',
        dataloader_drop_last=True,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        logging_steps=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=1e-5,
        warmup_steps=100,
        num_train_epochs = 1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=not False,
        run_name="gpt2-finetuned",
        report_to="wandb",
        ddp_find_unused_parameters=False
    )

model = AutoModelForCausalLM.from_pretrained(
        'gpt2-large'
    )

trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        packing=True,
        data_collator = data_collator
    )

Using pad_token, but it is not set yet.


In [None]:
trainer.train()



Step,Training Loss,Validation Loss
500,1.7872,1.815835
1000,1.7165,1.768158


TrainOutput(global_step=1460, training_loss=1.9919853392937412, metrics={'train_runtime': 3438.1125, 'train_samples_per_second': 1.699, 'train_steps_per_second': 0.425, 'total_flos': 2.5417727606784e+16, 'train_loss': 1.9919853392937412, 'epoch': 1.0})

In [None]:
model.save_pretrained('gpt2_finetuned_large-1-epoch')
tokenizer.save_pretrained('gpt2_finetuned_large-1-epoch')

('gpt2_finetuned_large-1-epoch/tokenizer_config.json',
 'gpt2_finetuned_large-1-epoch/special_tokens_map.json',
 'gpt2_finetuned_large-1-epoch/vocab.json',
 'gpt2_finetuned_large-1-epoch/merges.txt',
 'gpt2_finetuned_large-1-epoch/added_tokens.json',
 'gpt2_finetuned_large-1-epoch/tokenizer.json')

## PPO Model Initialization

In [11]:
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

config = PPOConfig(
    model_name="gpt2_finetuned_large-1-epoch",
    batch_size = 4,
    log_with="wandb",
    learning_rate=5e-7
)

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 4}

In [12]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name, padding_side = 'left')

tokenizer.pad_token = tokenizer.eos_token

In [13]:
len(train_dataset_rlhf)

4241

In [14]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=train_dataset_rlhf)

VBox(children=(Label(value='0.002 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.173637…

## Human Feedback Training

In [33]:
torch.cuda.empty_cache()

In [15]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "batch_size":1
}

output_min_length = 200
output_max_length = 300
output_length_sampler = LengthSampler(output_min_length, output_max_length)


train_sampler = RandomSampler(train_dataset_rlhf)
train_dataloader = DataLoader(
        train_dataset_rlhf,
        sampler=train_sampler,
        batch_size=4,
    )

i=0
for batch in tqdm(train_dataloader):
    batch['query'] = batch['question']
    query_tensors = [batch["question_input_ids"][i][0][512:] for i in range(4)]


    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query.to(DEVICE), **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    texts = ['Human:' + q + '\n\n Assistant:' + r + '\n\n' for q, r in zip(batch["question"], batch["response"])]
    rewards = [2*gpt2_reward_model.get_reward(text) for text in texts]
    #### Run PPO step
    i +=1

    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    ###Save model
    # if i%20 == 0:
    #   ppo_trainer.save_pretrained('gpt_2_large_rlhf_step_{}'.format(i))


  0%|          | 0/1061 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  9%|▉         | 95/1061 [1:17:00<13:03:05, 48.64s/it]


In [None]:
ppo_trainer.save_pretrained('gpt_2_large_rlhf_step_{}'.format(i))