In [1]:
 ! pip install -U bitsandbytes accelerate transformers datasets trl peft evaluate rouge_score

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.11.1-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DistilBertTokenizer,
    TrainingArguments,
)
import evaluate
from datasets import load_dataset, Dataset
from trl import (
    SFTTrainer,
    PPOTrainer,
    RewardTrainer,
    PPOConfig,
    RewardConfig,
    AutoModelForCausalLMWithValueHead,
)
from peft import LoraConfig, get_peft_model
from bitsandbytes.optim import AdamW8bit
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset as torchDataset
import numpy as np

# Hyperparameter

In [40]:
dataset = load_dataset("openai/summarize_from_feedback", "comparisons")
base_reward_model_checkpoint = "albert/albert-base-v2"
reward_model_repo_name="albert_reward_model"
reward_model_checkpoint=f"JaishreeramCoder/{reward_model_repo_name}"
output_dir="/content/sample_data"
base_sft_model_checkpoint = "openai-community/gpt2"
sft_model_repo_name = "sft_gpt2_summary"
sft_model_checkpoint=f"JaishreeramCoder/{sft_model_repo_name}"
rlhf_model_repo_name="ppo_gpt2_summary"
rlhf_model_checkpoint=f"JaishreeramCoder/{rlhf_model_repo_name}"
num_train_epochs_reward_model = 5
num_train_epochs_sft = 5
num_train_epochs_ppo_outer=5
ppo_training_batch_size=8
eval_batch_size = 8

In [4]:
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params

# Reward Model

In [5]:
reward_tokenizer = AutoTokenizer.from_pretrained(base_reward_model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



In [6]:
def reward_dataset(data: int):
    (
        input_ids_chosen,
        attention_mask_chosen,
        input_ids_rejected,
        attention_mask_rejected,
    ) = ([], [], [], [])

    for i in range(len(data["summaries"])):
        if data["choice"] == 0:
            chosen = reward_tokenizer(
                data["summaries"][i][0]["text"],
                padding="max_length",
                truncation=True,
                max_length=512,
            )
            rejected = reward_tokenizer(
                data["summaries"][i][1]["text"],
                padding="max_length",
                truncation=True,
                max_length=512,
            )
        else:
            chosen = reward_tokenizer(
                data["summaries"][i][1]["text"],
                padding="max_length",
                truncation=True,
                max_length=512,
            )
            rejected = reward_tokenizer(
                data["summaries"][i][0]["text"],
                padding="max_length",
                truncation=True,
                max_length=512,
            )

        cur_input_ids_chosen, cur_attention_mask_chosen = (
            chosen.input_ids,
            chosen.attention_mask,
        )
        cur_input_ids_rejected, cur_attention_mask_rejected = (
            rejected.input_ids,
            rejected.attention_mask,
        )

        input_ids_chosen.append(cur_input_ids_chosen)
        attention_mask_chosen.append(cur_attention_mask_chosen)
        input_ids_rejected.append(cur_input_ids_rejected)
        attention_mask_rejected.append(cur_attention_mask_rejected)
        output = {
            "input_ids_chosen": input_ids_chosen,
            "attention_mask_chosen": attention_mask_chosen,
            "input_ids_rejected": input_ids_rejected,
            "attention_mask_rejected": attention_mask_rejected,
        }
    return Dataset.from_dict(output)


reward_dataset_train = reward_dataset(dataset["train"][0:1000])
reward_dataset_eval = reward_dataset(dataset["validation"][0:1000])

In [7]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
reward_model = AutoModelForSequenceClassification.from_pretrained(
    base_reward_model_checkpoint, num_labels=7, quantization_config=quantization_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
lora_config = LoraConfig(
    r=32,  # Rank for LoRA
    target_modules="all-linear",
)
reward_model = get_peft_model(reward_model, lora_config)

In [9]:
print(count_parameters(reward_model))

(12233959, 544992)


In [10]:
reward_model_param_to_update = []
for param in reward_model.parameters():
    if param.requires_grad == True:
        reward_model_param_to_update.append(param)

optimizers = AdamW8bit(reward_model_param_to_update, lr=2e-5)

reward_training_args = RewardConfig(
    output_dir=output_dir,
    max_length=512,
    num_train_epochs=num_train_epochs_reward_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
)
reward_trainer = RewardTrainer(
    args=reward_training_args,
    train_dataset=reward_dataset_train,
    model=reward_model,
    tokenizer=reward_tokenizer,
    optimizers=(optimizers, None),
)
reward_trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,0.7008




TrainOutput(global_step=625, training_loss=0.700575390625, metrics={'train_runtime': 1747.8611, 'train_samples_per_second': 2.861, 'train_steps_per_second': 0.358, 'total_flos': 0.0, 'train_loss': 0.700575390625, 'epoch': 5.0})

In [11]:
def evaluate_reward_model(reward_model, reward_dataset_eval):
    with torch.no_grad():
        reward_model.eval()
        reward_value = np.array([])
        total_eval_samples = len(reward_dataset_eval["input_ids_chosen"])
        no_of_correct_prediction = 0
        batch_size = 8
        for i in tqdm(range(0, total_eval_samples, eval_batch_size)):
            input_ids_chosen = torch.tensor(
                reward_dataset_eval["input_ids_chosen"][i : i + eval_batch_size]
            )
            attention_mask_chosen = torch.tensor(
                reward_dataset_eval["attention_mask_chosen"][i : i + eval_batch_size]
            )
            chosen_reward_logits = reward_model(
                input_ids_chosen, attention_mask_chosen
            ).logits
            chosen_reward_score = chosen_reward_logits.argmax(dim=-1)

            input_ids_rejected = torch.tensor(
                reward_dataset_eval["input_ids_rejected"][i : i + eval_batch_size]
            )
            attention_mask_rejected = torch.tensor(
                reward_dataset_eval["attention_mask_rejected"][i : i + eval_batch_size]
            )
            rejected_reward_score = reward_model(
                input_ids_rejected, attention_mask_rejected
            ).logits
            rejected_reward_score = rejected_reward_score.argmax(dim=-1)

            no_of_correct_prediction += sum(
                1
                for j in range(batch_size)
                if chosen_reward_score[j] > rejected_reward_score[j]
            )

        accuracy = (no_of_correct_prediction / total_eval_samples) * 100
        print(f"\nReward Model Accuracy: {accuracy:3f}")


evaluate_reward_model(reward_model, reward_dataset_eval)

100%|██████████| 125/125 [02:34<00:00,  1.23s/it]


Reward Model Accuracy: 30.300000





# Supervised finetuning

In [12]:
sft_tokenizer = AutoTokenizer.from_pretrained(base_sft_model_checkpoint)
sft_tokenizer.pad_token = sft_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [13]:
def get_sft_dataset(data):
    input_ids, attention_mask, label_ids = ([], [], [])
    for i in range(len(data["choice"])):
        input = f"Summarize the following text:\n\n{data['info'][i]['post']}"
        cur = sft_tokenizer(
            input,
            padding="max_length",
            truncation=True,
            max_length=512,
            padding_side="left",
        )
        cur_input_ids = cur.input_ids
        cur_attention_mask = cur.attention_mask
        completion = (
            data["summaries"][i][1]["text"]
            if data["choice"][i] == 1
            else data["summaries"][i][0]["text"]
        )
        cur_label_ids = sft_tokenizer(
            completion,
            padding="max_length",
            truncation=True,
            max_length=512,
            padding_side="left",
        ).input_ids
        input_ids.append(cur_input_ids)
        attention_mask.append(cur_attention_mask)
        label_ids.append(cur_label_ids)

    output = {
        "input_ids": input_ids,
        "attention_masks": attention_mask,
        "labels": label_ids,
    }
    output = Dataset.from_dict(output)
    return output

In [14]:
sft_train_dataset = get_sft_dataset(dataset["train"][1000:2000])
sft_eval_dataset = get_sft_dataset(dataset["validation"][1000:2000])

In [15]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
sft_model = AutoModelForCausalLM.from_pretrained(
    base_sft_model_checkpoint,
    quantization_config=quantization_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [16]:
lora_config = LoraConfig(
    r=32,  # Rank for LoRA
    target_modules="all-linear",
)
sft_model = get_peft_model(sft_model, lora_config)

In [17]:
print(count_parameters(sft_model))

(129158400, 4718592)


In [18]:
sft_training_args = TrainingArguments(
    output_dir="/content/sample_data",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=num_train_epochs_sft,
)

param_to_update = []
for param in sft_model.parameters():
    if param.requires_grad == True:
        param_to_update.append(param)

optimizers = AdamW8bit(param_to_update, lr=2e-5)

model_trainer = SFTTrainer(
    model=sft_model,
    tokenizer=sft_tokenizer,
    train_dataset=sft_train_dataset,
    args=sft_training_args,
    optimizers=(optimizers, None),
)



In [19]:
model_trainer.train()

Step,Training Loss
500,4.7875


TrainOutput(global_step=625, training_loss=4.968848828125, metrics={'train_runtime': 591.5546, 'train_samples_per_second': 8.452, 'train_steps_per_second': 1.057, 'total_flos': 1378937733120000.0, 'train_loss': 4.968848828125, 'epoch': 5.0})

In [20]:
rouge_metric = evaluate.load("rouge")
def compute_metrics(decoded_preds, decoded_actual_labels):
    result = rouge_metric.compute(
        predictions=decoded_preds, references=decoded_actual_labels
    )
    print(f"SFT Model ROUGE values: {result}")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [21]:
generation_kwargs = {
    "min_length": -1,  # don't ignore the EOS token
    "top_k": 0.0,  # no top-k sampling
    "top_p": 1.0,  # no nucleus sampling
    "do_sample": True,  # yes, we want to sample
    "eos_token_id": sft_tokenizer.eos_token_id,
    "bos_token_id": sft_tokenizer.bos_token_id,
    "pad_token_id": sft_tokenizer.eos_token_id,  # most decoder models don't have a padding token - use EOS token instead
    "max_new_tokens": 32,  # specify how many tokens you want to generate at most
}

In [22]:
def evaluate_sft_model(sft_model, sft_eval_dataset):
    with torch.no_grad():
        sft_model.eval()
        decoded_preds = []
        decoded_actual_labels = []
        for i in tqdm(range(0, len(sft_eval_dataset["input_ids"]), eval_batch_size)):
            cur_data = torch.tensor(
                sft_eval_dataset["input_ids"][i : i + eval_batch_size]
            )
            cur_preds = sft_model.generate(cur_data, **generation_kwargs)
            cur_preds = cur_preds[:, cur_data.shape[1] :]
            for j in range(eval_batch_size):
                generated_text = sft_tokenizer.decode(
                    cur_preds[j], skip_special_tokens=True
                )
                decoded_preds.append(generated_text)
            cur_actual_label_ids = torch.tensor(
                sft_eval_dataset["labels"][i : i + eval_batch_size]
            )
            for j in range(eval_batch_size):
                decoded_actual_labels.append(
                    sft_tokenizer.decode(
                        cur_actual_label_ids[j], skip_special_tokens=True
                    )
                )
        sft_model_eval_result = compute_metrics(
            decoded_preds=decoded_preds, decoded_actual_labels=decoded_actual_labels
        )


evaluate_sft_model(sft_model, sft_eval_dataset)

  0%|          | 0/125 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 125/125 [06:16<00:00,  3.01s/it]


SFT Model ROUGE values: {'rouge1': 0.024197355848128734, 'rouge2': 0.00045207208597614686, 'rougeL': 0.021501769974862035, 'rougeLsum': 0.021579978361812826}


In [23]:
!nvidia-smi

Sun Oct  6 17:44:21 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0              28W /  70W |   1657MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# RLHF based finetuning

In [24]:
from huggingface_hub import login
login(token='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')


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [25]:
sft_model=sft_model.merge_and_unload()
sft_model.push_to_hub(sft_model_repo_name)
sft_tokenizer.push_to_hub(sft_model_repo_name)



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/164M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JaishreeramCoder/sft_gpt2_summary/commit/063f67282853d32b068c17bd82117feb2089498f', commit_message='Upload tokenizer', commit_description='', oid='063f67282853d32b068c17bd82117feb2089498f', pr_url=None, pr_revision=None, pr_num=None)

In [44]:
# reward_model=reward_model.merge_and_unload()
reward_model.push_to_hub(reward_model_repo_name)
reward_tokenizer.push_to_hub(reward_model_repo_name)

model.safetensors:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JaishreeramCoder/albert_reward_model/commit/fea744789f6428d365ad81b473f5ea9d9829c27c', commit_message='Upload tokenizer', commit_description='', oid='fea744789f6428d365ad81b473f5ea9d9829c27c', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
# reward_model=AutoModelForSequenceClassification.from_pretrained(reward_model_checkpoint)
# reward_tokenizer=AutoTokenizer.from_pretrained.from_pretrained(reward_model_checkpoint)
#reward_model.eval()
#look for pipeline rather than this

In [27]:
lora_config = LoraConfig(
    r=32,  # Rank for LoRA
    # target_modules="",
)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [28]:
rlhf_tokenizer = AutoTokenizer.from_pretrained(sft_model_checkpoint)
rlhf_tokenizer.pad_token = rlhf_tokenizer.eos_token

rlhf_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    sft_model_checkpoint, quantization_config=quantization_config,peft_config=lora_config
)
rlhf_model.train()

tokenizer_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/164M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]



AutoModelForCausalLMWithValueHead(
  (pretrained_model): PeftModel(
    (base_model): LoraModel(
      (model): GPT2LMHeadModel(
        (transformer): GPT2Model(
          (wte): Embedding(50257, 768)
          (wpe): Embedding(1024, 768)
          (drop): Dropout(p=0.1, inplace=False)
          (h): ModuleList(
            (0-11): 12 x GPT2Block(
              (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (attn): GPT2SdpaAttention(
                (c_attn): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=768, out_features=2304, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=32, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=32, out_features=2304, bias=False)
                  

In [29]:
def reward_fn(response):
    """
    Takes single text as input and returns the reward score
    """
    with torch.no_grad():
        reward_model.eval()
        input_text = reward_tokenizer.decode(
            response, skip_special_tokens=True
        )  # skips eos, bos, pad token
        input = reward_tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        logits = reward_model(**input).logits
        predicted_score = torch.tensor(logits.argmax(dim=-1),dtype=torch.float32)
        return predicted_score

In [30]:
def get_rlhf_dataset(data):
    input_ids, attention_mask = ([], [])
    for i in range(len(data["choice"])):
        input = f"Summarize the following text:\n\n{data['info'][i]['post']}"
        cur = rlhf_tokenizer(
            input,
            padding="max_length",
            truncation=True,
            max_length=512,
            padding_side="left",
        )
        cur_input_ids = cur.input_ids
        cur_attention_mask = cur.attention_mask
        input_ids.append(cur_input_ids)
        attention_mask.append(cur_attention_mask)
    output = {"input_ids": input_ids, "attention_mask": attention_mask}
    output = Dataset.from_dict(output)
    return output

In [31]:
rlhf_train_dataset = get_rlhf_dataset(dataset["train"][2000:3000])
rlhf_eval_dataset = get_rlhf_dataset(dataset["validation"][2000:3000])

In [32]:
generation_kwargs = {
    "min_length": -1,  # don't ignore the EOS token
    "top_k": 0.0,  # no top-k sampling
    "top_p": 1.0,  # no nucleus sampling
    "do_sample": True,  # yes, we want to sample
    "eos_token_id": rlhf_tokenizer.eos_token_id,
    "bos_token_id": rlhf_tokenizer.bos_token_id,
    "pad_token_id": rlhf_tokenizer.eos_token_id,  # most decoder models don't have a padding token - use EOS token instead
    "max_new_tokens": 32,  # specify how many tokens you want to generate at most
}

In [33]:
def evaluate(model, data, reward_model):
    with torch.no_grad():
        model.eval()
        reward_model.eval()
        reward_value = []
        data_size = len(data["input_ids"])
        for i in tqdm(range(0, data_size, eval_batch_size)):
            cur_data = torch.tensor(data["input_ids"][i : i + eval_batch_size])
            response = model.generate(cur_data, **generation_kwargs)
            response = response[: cur_data.shape[1] :]
            for j in range(eval_batch_size):
                reward_value.append(reward_fn(response[j]))
        avg_reward = np.mean(np.array(reward_value))
        return avg_reward


sft_avg_reward = evaluate(sft_model, rlhf_eval_dataset, reward_model)
print(f"Average Reward for supervised finetuned model: {sft_avg_reward}")

  predicted_score = torch.tensor(logits.argmax(dim=-1),dtype=torch.float32)
100%|██████████| 125/125 [06:58<00:00,  3.35s/it]

Average Reward for supervised finetuned model: 5.927999973297119





In [34]:
print(count_parameters(rlhf_model))

(125620225, 1180417)


In [35]:
ppo_config = PPOConfig(
    gradient_accumulation_steps=ppo_training_batch_size,
    batch_size=ppo_training_batch_size,
    mini_batch_size=1,
    model_name="GPT2",
    is_peft_model=True,
)
ppo_trainer = PPOTrainer(
    model=rlhf_model,
    config=ppo_config,
    dataset=rlhf_train_dataset,
    tokenizer=rlhf_tokenizer,
)



In [36]:
for epoch in range(num_train_epochs_ppo_outer):
    for i in tqdm(range(0,len(rlhf_eval_dataset),ppo_training_batch_size)):
        reward_value=[]
        data=torch.tensor(rlhf_eval_dataset["input_ids"][i:i+ppo_training_batch_size])
        data=[data[j] for j in range(ppo_training_batch_size)]
        responses=ppo_trainer.generate(data,**generation_kwargs,return_prompt=False)
        for j in range(ppo_training_batch_size):
            cur_reward=reward_fn(responses[j])
            reward_value.append(cur_reward)
        ppo_trainer.step(queries=data,responses=responses,scores=reward_value)

  0%|          | 0/125 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  predicted_score = torch.tensor(logits.argmax(dim=-1),dtype=torch.float32)
100%|██████████| 125/125 [18:23<00:00,  8.83s/it]
100%|██████████| 125/125 [18:20<00:00,  8.80s/it]
100%|██████████| 125/125 [18:12<00:00,  8.74s/it]
100%|██████████| 125/125 [18:11<00:00,  8.73s/it]
100%|██████████| 125/125 [19:51<00:00,  9.53s/it]


In [37]:
rlhf_avg_reward = evaluate(ppo_trainer.model, rlhf_eval_dataset, reward_model)
print(f"Average Reward for Supervised finetuned model: {sft_avg_reward:3f}")
print(f"Average Reward for RLHF finetuned model: {rlhf_avg_reward:3f}")

  predicted_score = torch.tensor(logits.argmax(dim=-1),dtype=torch.float32)
100%|██████████| 125/125 [08:56<00:00,  4.29s/it]

Average Reward for Supervised finetuned model: 5.928000
Average Reward for RLHF finetuned model: 5.928000





In [43]:
#rlhf_model.merge_and_unload()
rlhf_model.push_to_hub(rlhf_model_repo_name)
rlhf_tokenizer.push_to_hub(rlhf_model_repo_name)

adapter_model.safetensors:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JaishreeramCoder/ppo_gpt2_summary/commit/521f7de37fdddb5766d77ed2f0388bf0d7123752', commit_message='Upload tokenizer', commit_description='', oid='521f7de37fdddb5766d77ed2f0388bf0d7123752', pr_url=None, pr_revision=None, pr_num=None)