# 02: Reward Model

Alright, now that you know what preference data is, let's talk about the Reward Model, which is trained on this preference data!


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments
from trl import RewardTrainer, RewardConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from trl.trainer.utils import RewardDataCollatorWithPadding
import torch
from torch.utils.data import DataLoader

Before you dive deep, let's observe what a Reward Model is.

We will use "OpenAssistant/reward-model-deberta-v3-base", a model that has already been trained on preference data.

In [None]:
# Load the pretrained reward model together with its matching tokenizer.

reward_name = "OpenAssistant/reward-model-deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(reward_name)
# eval() puts the module in evaluation mode and returns the module itself,
# so it can be chained directly onto the load call.
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_name).eval()

In [None]:
# Check the outputs of the reward model
question = "I just came out of from jail, any suggestion of my future?"
helpful = "It's great to hear that you have been released from jail."
bad = "Go to jail, lol!"

# Pure inference: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    # The tokenizer encodes (question, answer) as one paired sequence.
    inputs = tokenizer(question, helpful, return_tensors='pt')
    good_score = reward_model(**inputs).logits[0].cpu()

    inputs = tokenizer(question, bad, return_tensors='pt')
    bad_score = reward_model(**inputs).logits[0].cpu()

# Display both scores so the two answers can actually be compared.
print("helpful answer score:", good_score)
print("bad answer score:", bad_score)

What do you see?
Questions:
1. What is the format of the input and the output?
2. How do you tell which response is preferred here?
3. Play around with the input. What do you find?

Once you have the answers, let's dive into how to train the model!

In [None]:
# Load the train split of the preference dataset
# (its prompt / chosen / rejected columns are used later in this notebook).
train_dataset = load_dataset(
    path="HumanLLMs/Human-Like-DPO-Dataset",
    split="train",
)

In [None]:
# Load your base model
reward_name = "google-bert/bert-base-uncased"
# num_labels=1 gives the classification head a single output, i.e. one scalar
# reward per (prompt, response) pair. Without it the freshly initialised head
# falls back to the default of two logits, which is not a reward score.
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(reward_name)

In [None]:
# Only the output directory and the per-device batch size are overridden;
# every other RewardConfig option keeps its default.
training_args = RewardConfig(per_device_train_batch_size=2, output_dir="reward")

# The trainer receives the untokenized preference dataset plus the tokenizer
# (passed as processing_class).
trainer = RewardTrainer(
    train_dataset=train_dataset,
    processing_class=tokenizer,
    args=training_args,
    model=reward_model,
)

trainer.train()


## We have trained our Reward Model, so we can train using PPO!

You can try training with PPO using the following command (valid for the TRL version current as of 14/01/2025):

For instance:

```sh
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \
    examples/scripts/ppo/ppo_tldr.py \
    --output_dir models/minimal/ppo_tldr \
    --learning_rate 3e-6 \
    --per_device_train_batch_size 16 \
    --gradient_accumulation_steps 4 \
    --total_episodes 1000000 \
    --model_name_or_path EleutherAI/pythia-1b-deduped \
    --sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr \
    --reward_model_path CHANGE_TO_YOUR_MODEL_PATH \
    --local_rollout_forward_batch_size 16 \
    --missing_eos_penalty 1.0 \
    --stop_token eos
```

For more information:

https://huggingface.co/docs/trl/main/en/ppo_trainer

We won't dive into PPO here, as it involves Reinforcement Learning (RL) methods; please refer to the link above instead.

However, if you are familiar with RL, PPO is basically the RL algorithm that RLHF implements.

Overall, using Hugging Face is not difficult, is it? But what is behind this `RewardTrainer`?

Let's dive deep!

## Part 1: Preprocess the input

In [None]:
def preprocess_function(examples, tokenizer):
    """
    Tokenize each prompt paired with its chosen and its rejected response.

    The result uses the column layout TRL expects for reward training:
    - input_ids_chosen
    - attention_mask_chosen
    - input_ids_rejected
    - attention_mask_rejected

    Args:
        examples: mapping with 'prompt', 'chosen' and 'rejected' entries.
        tokenizer: callable invoked as tokenizer(prompts, responses, truncation=True);
            its result must expose 'input_ids' and 'attention_mask'.

    Returns:
        dict with the four keys listed above.
    """
    sides = ("chosen", "rejected")
    fields = ("input_ids", "attention_mask")
    prompts = examples['prompt']

    # One tokenizer call per side, chosen first, each paired with the same prompts.
    encoded = {
        side: tokenizer(prompts, examples[side], truncation=True)
        for side in sides
    }

    # Flatten to "<field>_<side>" keys, e.g. "input_ids_chosen".
    return {
        f"{field}_{side}": encoded[side][field]
        for side in sides
        for field in fields
    }


# Load dataset
dataset = load_dataset("HumanLLMs/Human-Like-DPO-Dataset", split="train")

# Tokenize in batches. The original columns are removed so that only the
# four tokenized columns produced by preprocess_function remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=dataset.column_names,
)

# Sanity check: pull one shuffled batch of two pairs through the reward
# collator to confirm the tokenized dataset can be batched.
data_loader = DataLoader(
    tokenized_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=RewardDataCollatorWithPadding(tokenizer),
)

print(next(iter(data_loader)))

In [None]:
# Initialize tokenizer and model from the same checkpoint.
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=1: the head emits a single value per input, used as the reward score.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

## Part 2: Change the RewardTrainer

Remember the objective function:


$$L_{RM}(\phi) = -\frac{1}{|D|} \sum_{(x,y^+,y^-) \in D} \log(\sigma(r_\phi(x,y^+) - r_\phi(x,y^-)))$$

Where:
- $\phi$ represents the reward model parameters
- $D$ is the dataset of preference pairs
- $x$ is the input prompt
- $y^+$ is the preferred response
- $y^-$ is the non-preferred response
- $r_\phi(x,y)$ is the reward score assigned by the model
- $\sigma$ is the sigmoid function

In [None]:

class RewardTrainer(Trainer):
    """Trainer that optimises the pairwise preference (reward-model) loss.

    For every (chosen, rejected) pair the loss is
    ``-log(sigmoid(r_chosen - r_rejected))``, averaged over the batch.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the pairwise preference loss for one batch.

        Args:
            model: model whose forward pass returns an object with ``.logits``.
            inputs: batch containing ``input_ids_chosen``,
                ``attention_mask_chosen``, ``input_ids_rejected`` and
                ``attention_mask_rejected``.
            return_outputs: when True, also return the reward tensors. The
                base ``Trainer`` passes this flag during evaluation/prediction
                and then expects a ``(loss, outputs)`` tuple.
            **kwargs: extra arguments forwarded by ``Trainer`` (ignored).

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is
            True, where ``outputs`` holds ``rewards_chosen`` and
            ``rewards_rejected``.
        """
        # Score the preferred and the non-preferred response separately.
        chosen_rewards = model(
            input_ids=inputs["input_ids_chosen"],
            attention_mask=inputs["attention_mask_chosen"],
        ).logits

        rejected_rewards = model(
            input_ids=inputs["input_ids_rejected"],
            attention_mask=inputs["attention_mask_rejected"],
        ).logits

        # Standard preference loss without margin.
        # logsigmoid is the numerically stable form of log(sigmoid(x)): the
        # naive version underflows to log(0) = -inf for large negative
        # differences, which would turn the loss into inf.
        loss = -torch.nn.functional.logsigmoid(chosen_rewards - rejected_rewards).mean()

        if return_outputs:
            return loss, {
                "rewards_chosen": chosen_rewards,
                "rewards_rejected": rejected_rewards,
            }
        return loss

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./reward_model_output",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    # Important: the *_chosen / *_rejected columns are not arguments of the
    # model's forward(), so the Trainer would otherwise drop them before
    # they reach compute_loss.
    remove_unused_columns=False,
)

# Build the trainer around the pre-tokenized pairs, batching them with TRL's collator.
trainer = RewardTrainer(
    args=training_args,
    model=model,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

## Check our trained model

In [1]:
question = "Hello can I get your help?"
helpful = "Sure, what can I do for you 😊?"
bad = "As a research assistant, I don't want to help you!"

model.eval()

# Pure inference: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    # Move the inputs to whatever device the model lives on instead of
    # assuming 'cuda', so this cell also works on CPU-only machines.
    inputs = tokenizer(question, helpful, return_tensors='pt').to(model.device)
    good_score = model(**inputs).logits[0].cpu()

    inputs = tokenizer(question, bad, return_tensors='pt').to(model.device)
    bad_score = model(**inputs).logits[0].cpu()

# Display both scores so the two answers can actually be compared.
print("helpful answer score:", good_score)
print("bad answer score:", bad_score)
