In [2]:
!pip install transformers torchmetrics deepspeed nltk datasets wandb accelerate

Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


In [None]:
!git clone https://github.com/CarperAI/trlx.git
!git config --global --add safe.directory /content/trlx && cd /content/trlx && pip install -e .

In [5]:
# Mount Google Drive under /content/drive, but only when the notebook runs inside Colab.
_in_colab = 'google.colab' in str(get_ipython())
if _in_colab:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Interactive Accelerate setup: answer the prompts and pick the DeepSpeed option.
!accelerate config # choose DeepSpeed option  

In [8]:
# Run the training script through the Accelerate launcher.
# NOTE(review): /content/train.py is not created by any visible cell — confirm where it comes from.
# NOTE(review): the saved output reports that --num_processes, --num_machines, --mixed_precision
# and --dynamo_backend were not passed and fell back to defaults (1, 1, 'no', 'no').
!accelerate launch /content/train.py

[2;36m           [0m         `accelerate launch` and had defaults used      [2m             [0m
[2;36m           [0m         instead:                                       [2m             [0m
[2;36m           [0m                 `--num_processes` was set to a value   [2m             [0m
[2;36m           [0m         of `[1;36m1[0m`                                         [2m             [0m
[2;36m           [0m                 `--num_machines` was set to a value of [2m             [0m
[2;36m           [0m         `[1;36m1[0m`                                            [2m             [0m
[2;36m           [0m                 `--mixed_precision` was set to a value [2m             [0m
[2;36m           [0m         of `[32m'no'[0m`                                      [2m             [0m
[2;36m           [0m                 `--dynamo_backend` was set to a value  [2m             [0m
[2;36m           [0m         of `[32m'no'[0m`                    

In [1]:
import os

# Switch the working directory to the cloned trlx repository and echo it as a sanity check.
_repo_dir = '/content/trlx'
os.chdir(_repo_dir)
print(os.getcwd())

/content/trlx


In [2]:
import trlx

In [6]:
import os
from typing import List

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, GPTNeoForSequenceClassification

import trlx
from trlx.data.configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)
from trlx.models.modeling_ppo import PPOConfig

# Checkpoint the reward model and its tokenizer are loaded from (via `from_pretrained`).
REWARD_CHECKPOINT_PATH = "Myashka/125M_GPTneo_reward_base"
# Checkpoint of the policy being trained; also used as the tokenizer path in the config below.
SFT_MODEL_PATH = "Myashka/125M_GPTneo_sft_tuned"
# JSON dataset on the mounted Drive; it is read with `field='train'` and `field='val'`.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/vkr_data/data/1.0-data-div-ans-sep-api-usage.json"

# trlx configuration for PPO fine-tuning of the SFT checkpoint.
config = TRLConfig(
    # Policy model and tokenizer both come from the SFT checkpoint.
    model=ModelConfig(model_path=SFT_MODEL_PATH, num_layers_unfrozen=8),
    tokenizer=TokenizerConfig(tokenizer_path=SFT_MODEL_PATH, truncation_side="right"),
    # Run-level settings; `seq_length` is also reused by the reward scoring and prompt
    # preparation code elsewhere in this notebook.
    train=TrainConfig(
        seq_length=512,
        epochs=50,
        total_steps=100000,
        batch_size=4,
        checkpoint_interval=10000,
        eval_interval=200,
        pipeline="PromptPipeline",
        trainer="AcceleratePPOTrainer",
        project_name='CQA_RLHF',
    ),
    optimizer=OptimizerConfig(
        name="adamw",
        kwargs=dict(lr=5.0e-6, betas=[0.9, 0.999], eps=1.0e-8, weight_decay=0.01),
    ),
    # T_max matches train.total_steps.
    # NOTE(review): eta_min equals the optimizer lr, so the annealing has no range to decay
    # over — confirm that an effectively constant learning rate is intended.
    scheduler=SchedulerConfig(
        name="cosine_annealing",
        kwargs=dict(T_max=100000, eta_min=5.0e-6),
    ),
    method=PPOConfig(
        name="PPOConfig",
        num_rollouts=128,
        chunk_size=16,
        ppo_epochs=4,
        init_kl_coef=0.1,
        target=6,
        horizon=10000,
        gamma=1,
        lam=0.95,
        cliprange=0.2,
        cliprange_value=0.2,
        vf_coef=0.2,
        scale_reward=None,
        ref_mean=None,
        ref_std=None,
        cliprange_reward=10,
        # Each rollout generates between 64 and 128 new tokens.
        gen_kwargs=dict(max_new_tokens=128, min_new_tokens=64),
    ),
)



In [None]:
# Reward model and its tokenizer, both loaded from the same checkpoint.
rw_tokenizer = AutoTokenizer.from_pretrained(REWARD_CHECKPOINT_PATH)
rw_model = GPTNeoForSequenceClassification.from_pretrained(REWARD_CHECKPOINT_PATH)

# Reuse the end-of-sequence token for padding on the tokenizer, the model object and its config.
rw_tokenizer.pad_token = rw_tokenizer.eos_token
rw_model.pad_token_id = rw_model.config.end_token_id = rw_tokenizer.eos_token_id
rw_model.config.pad_token_id = rw_model.config.eos_token_id

# The reward model is only used for inference: eval mode, full precision
# (half precision is deliberately left disabled), on the first CUDA device.
rw_device = torch.device("cuda:0")  # set reward model device
rw_model.eval()
rw_model.to(rw_device)

In [8]:
def get_scores(samples: List[str]):
    """Score texts with the reward model.

    The texts are processed two at a time: each mini-batch is tokenized with
    ``rw_tokenizer`` (truncated and padded to ``config.train.seq_length`` tokens),
    moved to ``rw_device`` and passed through ``rw_model`` with gradients disabled.

    Args:
        samples: Texts to score.

    Returns:
        The ``logits`` of every mini-batch concatenated along dim 0, in input order.
    """
    step = 2
    collected_logits = []
    for start in range(0, len(samples), step):
        chunk = samples[start : start + step]
        encoded = rw_tokenizer(
            chunk,
            truncation=True,
            max_length=config.train.seq_length,
            padding="max_length",
            return_tensors="pt",
        )
        ids = encoded["input_ids"].to(rw_device)
        mask = encoded["attention_mask"].to(rw_device)
        with torch.no_grad():
            model_out = rw_model(input_ids=ids, attention_mask=mask)
        collected_logits.append(model_out.logits)
    return torch.cat(collected_logits, dim=0)

def get_prompt_dataset(prompts, max_length):
    """Turn raw questions into length-limited ``Question: ...\\nAnswer:`` prompts.

    Each question is first cut to ``max_length - 10`` tokens (presumably to leave room
    for the wrapper text), wrapped in the prompt template, and then cut again to
    ``max_length`` tokens. Every cut is a tokenize/decode round-trip through the
    module-level ``tokenizer`` followed by ``strip()``.

    Args:
        prompts: Indexable collection of question strings.
        max_length: Token budget for a finished prompt.

    Returns:
        List of formatted prompt strings, in the same order as ``prompts``.
    """

    def _clip(text, limit):
        # Tokenize with truncation, then decode back to plain text without special tokens.
        token_ids = tokenizer(
            text,
            truncation=True,
            max_length=limit,
            add_special_tokens=False,
        )["input_ids"]
        return tokenizer.decode(token_ids, skip_special_tokens=True).strip()

    formatted_prompts = []
    for idx in tqdm(range(len(prompts))):
        question = _clip(prompts[idx], max_length - 10)
        wrapped = 'Question: ' + question + "\nAnswer:"
        formatted_prompts.append(_clip(wrapped, max_length))
    return formatted_prompts

def reward_fn(samples: List[str], **kwargs):
    """Reward each sample relative to the reference answer for the same prompt.

    For every sample, the text before the first ``"\\nAnswer:"`` is taken as the prompt,
    the reference answer is looked up in ``question_answer_dict`` and appended, and both
    the reference text and the sample are scored with ``get_scores``.

    Args:
        samples: Full texts (prompt plus generated answer).
        **kwargs: Accepted and ignored.

    Returns:
        Score of each sample minus the score of its reference text.

    Raises:
        KeyError: If a recovered prompt is not a key of ``question_answer_dict``.
    """
    prompt_parts = []
    for text in samples:
        prompt_parts.append(text.split("\nAnswer:")[0] + "\nAnswer: ")
    reference_samples = []
    for part in prompt_parts:
        # Dictionary keys are the stripped prompts (no trailing space after "Answer:").
        reference_samples.append(part + question_answer_dict[part.strip()])
    original_scores = get_scores(reference_samples)
    scores = get_scores(samples)
    return scores - original_scores


In [9]:
# Tokenizer of the policy model; in the visible cells it is only used by get_prompt_dataset
# to truncate and re-decode prompts.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer.tokenizer_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # was misspelled "rigth", which is not a valid padding side
# Token budget for a prompt: the full sequence length minus the room reserved for generation.
max_length_input = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"]

# Each split lives under its own top-level field of the JSON file; load_dataset returns
# whatever it reads under the 'train' key.
train_dataset = load_dataset("json", data_files=DATA_PATH, field='train')['train']
val_dataset = load_dataset("json", data_files=DATA_PATH, field='val')['train']

# Store data as (question, reference answer) pairs
train_set = [(sample["Question"], sample["Answer"]) for sample in train_dataset]
val_set = [(sample["Question"], sample["Answer"]) for sample in val_dataset]

# Split the pairs into parallel tuples of questions and answers
train_questions, train_answers = zip(*train_set)
val_questions, val_answers = zip(*val_set)

# Build the formatted prompts and map each one to its reference answer;
# reward_fn uses this mapping to score generations against the reference.
question_answer_dict = {}
train_prompts = get_prompt_dataset(train_questions, max_length_input)
question_answer_dict.update(zip(train_prompts, train_answers))
val_prompts = get_prompt_dataset(val_questions, max_length_input)
question_answer_dict.update(zip(val_prompts, val_answers))

Downloading (…)okenizer_config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-6e28c4cf89c8b793/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-6e28c4cf89c8b793/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-3fb0a7c45486133f/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-3fb0a7c45486133f/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 6948/6948 [00:17<00:00, 394.44it/s]
100%|██████████| 1000/1000 [00:02<00:00, 339.94it/s]


In [None]:
# Start PPO training with the configuration, prompts and reward function defined in this notebook.
trainer = trlx.train(
    config=config,
    prompts=train_prompts,
    eval_prompts=val_prompts[:1000],  # at most the first 1000 validation prompts, for evaluation speed in training
    reward_fn=reward_fn,
)

[RANK 0] Initializing model: Myashka/125M_GPTneo_sft_tuned


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667078989999785, max=1.0)…

[RANK 0] Collecting rollouts
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  all_scores = torch.tensor(
[RANK 0] Starting training
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/250]:   0%|          | 0/250 [00:00<?, ?it/s]