### Step 2: Finetuning a Reward Model

In [1]:
# device preparation & dsconfig setting
import torch
from deepspeed import get_accelerator
from src.ds_utils import get_train_ds_config
import deepspeed

# Pick the torch device reported by the DeepSpeed accelerator abstraction.
device = torch.device(get_accelerator().device_name())
# Set up the torch.distributed process group through DeepSpeed; the rank and
# world-size queries below depend on it.
deepspeed.init_distributed()
global_rank = torch.distributed.get_rank()
# Build the DeepSpeed config: fp16, ZeRO stage 0, no offload, TensorBoard
# logging under the given path/name.
ds_config = get_train_ds_config(offload=False,
                                    dtype='fp16',
                                    stage=0,
                                    enable_tensorboard=True,
                                    tb_path="step2_tensorboard",
                                    tb_name="step2_model")
per_device_train_batch_size = 24
per_device_eval_batch_size = 24
gradient_accumulation_steps = 4
ds_config['train_micro_batch_size_per_gpu'] = per_device_train_batch_size
# Global batch = micro batch per device * number of ranks * accumulation steps.
ds_config['train_batch_size'] = per_device_train_batch_size * torch.distributed.get_world_size() * gradient_accumulation_steps
# Wait until every rank has reached this point before continuing.
torch.distributed.barrier()

[2025-03-17 01:18:19,623] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/envs/instructGPT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/envs/instructGPT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
  from .autonotebook import tqdm as notebook_tqdm


[2025-03-17 01:18:21,982] [INFO] [comm.py:658:init_distributed] cdb=None
[2025-03-17 01:18:21,983] [INFO] [comm.py:673:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-03-17 01:18:22,081] [INFO] [comm.py:728:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.17.0.7, master_port=29500
[2025-03-17 01:18:22,083] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


[rank0]:[W317 01:18:22.069052848 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.


In [2]:
# initialize tokenizer
import os
import json
from transformers import AutoTokenizer
# Directory containing the checkpoint the reward model is initialised from.
model_name_or_path = "./model/rm_model"
model_json = os.path.join(model_name_or_path, "config.json")

# `model_name` records the original model identifier stored in config.json
# (informational only); it falls back to the local path when no config exists.
model_name = model_name_or_path
if os.path.exists(model_json):
    # Use a context manager so the config file handle is always closed.
    with open(model_json) as config_file:
        model_json_file = json.load(config_file)
    model_name = model_json_file.get("_name_or_path", model_name_or_path)

# Load the tokenizer unconditionally so `tokenizer` is always bound; a missing
# or broken checkpoint now surfaces as the loader's own error instead of a
# NameError on the last line of this cell.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer

GPT2TokenizerFast(name_or_path='./model/rm_model', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [3]:
# initialize sft model
from transformers import AutoConfig, AutoModelForCausalLM
from src.utils import print_rank_0
import math
import time

# Time how long it takes to build the base language model.
start = time.time()
dropout = 0.0
model_config = AutoConfig.from_pretrained(model_name_or_path)
# Override every dropout-related field that this config actually defines.
for key in ('dropout', 'attention_dropout', 'hidden_dropout',
                    'activation_dropout'):
    if hasattr(model_config, key):
        print(f"Setting model_config.{key} to {dropout}")
        setattr(model_config, key, dropout)
# `from_tf` is only enabled when the path string contains ".ckpt".
model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            from_tf=bool(".ckpt" in model_name_or_path),
            config=model_config)
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id
# Resize the embedding table to the tokenizer length rounded up to a multiple of 8.
model.resize_token_embeddings(int(8 *math.ceil(len(tokenizer) / 8.0)))
end = time.time()
# NOTE(review): the message says "from_config" although the model is loaded
# with `from_pretrained` above.
print_rank_0(f">Creating model from_config took {end - start} seconds",
                None)

Setting model_config.dropout to 0.0
Setting model_config.attention_dropout to 0.0
Setting model_config.activation_dropout to 0.0
>Creating model from_config took 0.25955820083618164 seconds


In [4]:
# define the Reward Model Class & init a Critic Model from base model

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
import torch
from torch import nn


## Note that the following code is modified from
## https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py
class RewardModel(nn.Module):
    """Wrap a base transformer with a scalar head that scores every token.

    The last hidden state of ``base_model`` is projected to one value per
    token by ``v_head``. ``forward`` computes a pairwise ranking loss over
    (chosen, rejected) pairs; ``forward_value`` returns the per-token values
    and, optionally, the value at the last token before padding.
    """

    def __init__(self,
                 base_model,
                 tokenizer,
                 num_padding_at_beginning=0,
                 compute_fp32_loss=False):
        """Build the reward model around ``base_model``.

        Args:
            base_model: model exposing ``.config`` and returning
                ``hidden_states`` when called with ``output_hidden_states=True``.
            tokenizer: only ``tokenizer.pad_token_id`` is read.
            num_padding_at_beginning: number of pad-id tokens to skip before a
                pad-id token is treated as the end of a sequence in ``forward``.
            compute_fp32_loss: when True, the truncated rewards are cast to
                float32 before the loss is computed.
        """
        super().__init__()
        self.config = base_model.config
        self.num_padding_at_beginning = num_padding_at_beginning

        # This is the key part of the reward model: instead of an lm_head over
        # the vocabulary, a bias-free linear head maps each token's hidden
        # state to a single scalar score.

        if hasattr(self.config, "word_embed_proj_dim"):
            # `OPT` models use word_embed_proj_dim as final output
            # https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#L497
            self.v_head = nn.Linear(self.config.word_embed_proj_dim,
                                    1,
                                    bias=False)
        else:
            # `gpt-neo(x)` models use `hidden_size` attribute names instead of `n_embd`
            self.config.n_embd = self.config.hidden_size if hasattr(
                self.config, "hidden_size") else self.config.n_embd
            self.v_head = nn.Linear(self.config.n_embd, 1, bias=False)

        self.rwtransformer = base_model
        self.PAD_ID = tokenizer.pad_token_id
        self.compute_fp32_loss = compute_fp32_loss

    def gradient_checkpointing_enable(self):
        """Forward the call to the wrapped base model."""
        self.rwtransformer.gradient_checkpointing_enable()

    def gradient_checkpointing_disable(self):
        """Forward the call to the wrapped base model."""
        self.rwtransformer.gradient_checkpointing_disable()

    def forward(self,
                input_ids=None,
                past_key_values=None,
                attention_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                use_cache=False):
        """Compute the pairwise ranking loss for a batch of (chosen, rejected) pairs.

        ``input_ids`` must be 2-D; its first half along dim 0 holds the chosen
        sequences and its second half the matching rejected sequences.
        ``position_ids`` is accepted but not passed to the base model, and
        ``head_mask`` is passed only when ``config.model_type`` is not "llama".

        Returns:
            dict with
            - "loss": average over pairs of the mean ``-logsigmoid(chosen - rejected)``
              taken from the first differing token up to the later sequence end;
            - "chosen_mean_scores" / "rejected_mean_scores": despite the name,
              the reward at the last token before the detected end of each
              chosen / rejected sequence, stacked over the pairs.
        """
        loss = None

        if self.config.model_type == "llama":
            kwargs = dict()
        else:
            kwargs = dict(head_mask=head_mask)

        transformer_outputs = self.rwtransformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=True,
            **kwargs)

        # Last layer's hidden states -> one scalar per token.
        hidden_states = transformer_outputs.hidden_states[-1]
        rewards = self.v_head(hidden_states).squeeze(-1)
        chosen_mean_scores = []
        rejected_mean_scores = []

        # Split the inputs and rewards into two parts, chosen and rejected
        assert len(input_ids.shape) == 2
        bs = input_ids.shape[0] // 2
        seq_len = input_ids.shape[1]

        chosen_ids = input_ids[:bs]  # (bs, seq): first half of the 2-D batch
        rejected_ids = input_ids[bs:]
        chosen_rewards = rewards[:bs]
        rejected_rewards = rewards[bs:]

        # Compute pairwise loss. Only backprop on the different tokens before padding
        loss = 0.
        for i in range(bs):
            chosen_id = chosen_ids[i]
            rejected_id = rejected_ids[i]
            chosen_reward = chosen_rewards[i]
            rejected_reward = rejected_rewards[i]

            # End of the chosen sequence: position of the pad-id token at index
            # `num_padding_at_beginning` among all pad-id positions, or seq_len
            # when there are not enough pad-id tokens.
            c_inds = (chosen_id == self.PAD_ID).nonzero()
            c_ind = c_inds[self.num_padding_at_beginning].item() if len(
                c_inds
            ) > self.num_padding_at_beginning else seq_len  # OPT model pads the first token, so we need to use the second padding token as the end of the sequence
            # Positions where the chosen and rejected token ids differ.
            check_divergence = (chosen_id != rejected_id).nonzero()

            if len(check_divergence) == 0:
                # Identical sequences: compare only the final position.
                end_ind = rejected_reward.size(-1)
                divergence_ind = end_ind - 1
                r_ind = c_ind
            else:
                # Check if there is any padding otherwise take length of sequence
                r_inds = (rejected_id == self.PAD_ID).nonzero()
                r_ind = r_inds[self.num_padding_at_beginning].item(
                ) if len(r_inds) > self.num_padding_at_beginning else seq_len
                end_ind = max(c_ind, r_ind)
                divergence_ind = check_divergence[0]
            assert divergence_ind > 0
            # Rewards from the first differing token up to the later sequence end.
            c_truncated_reward = chosen_reward[divergence_ind:end_ind]
            r_truncated_reward = rejected_reward[divergence_ind:end_ind]
            chosen_mean_scores.append(
                chosen_reward[c_ind - 1])  #use the end score for reference
            rejected_mean_scores.append(rejected_reward[r_ind - 1])

            if self.compute_fp32_loss:
                c_truncated_reward = c_truncated_reward.float()
                r_truncated_reward = r_truncated_reward.float()
            loss += -torch.nn.functional.logsigmoid(c_truncated_reward -
                                                    r_truncated_reward).mean()

        # Average the per-pair losses.
        loss = loss / bs
        chosen_mean_scores = torch.stack(chosen_mean_scores)
        rejected_mean_scores = torch.stack(rejected_mean_scores)
        return {
            "loss": loss,
            "chosen_mean_scores": chosen_mean_scores,
            "rejected_mean_scores": rejected_mean_scores,
        }

    def forward_value(self,
                      input_ids=None,
                      attention_mask=None,
                      past_key_values=None,
                      position_ids=None,
                      head_mask=None,
                      inputs_embeds=None,
                      return_value_only=False,
                      prompt_length=0,
                      use_cache=False):
        """Return per-token values and, optionally, each sequence's end score.

        ``position_ids`` is accepted but not passed to the base model, and
        ``head_mask`` is passed only when ``config.model_type`` is not "llama".

        Args:
            return_value_only: when True, return just the per-token value tensor.
            prompt_length: number of leading positions skipped when searching
                for the first pad-id token; must be > 1 unless
                ``return_value_only`` is True.

        Returns:
            The value tensor, or a dict with "values" and "chosen_end_scores"
            (the value at the position just before the first pad-id token found
            after ``prompt_length``, or at the last position if none is found).
        """

        if self.config.model_type == "llama":
            kwargs = dict()
        else:
            kwargs = dict(head_mask=head_mask)

        transformer_outputs = self.rwtransformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=True,
            **kwargs)
        hidden_states = transformer_outputs.hidden_states[-1]
        values = self.v_head(hidden_states).squeeze(-1)
        if return_value_only:
            return values
        else:
            # [0 0 0 0 prompt, answer, 0 0 0 0 ] for step 3, we have padding at the beginning
            # [prompt, answer, 0, 0, 0, 0] this is normal
            assert prompt_length > 1, "prompt_length must be greater than 1 to help select the end score"
            bs = values.size(0)
            seq_len = input_ids.shape[1]
            chosen_end_scores = [
            ]  # we use this name for consistency with the original forward function
            for i in range(bs):
                input_id = input_ids[i]
                value = values[i]

                c_inds = (input_id[prompt_length:] == self.PAD_ID).nonzero()
                # here we only use the answer part of the sequence so we do not need to care about the padding at the beginning
                c_ind = c_inds[0].item() + prompt_length if len(
                    c_inds) > 0 else seq_len
                chosen_end_scores.append(value[c_ind - 1])
            return {
                "values": values,
                "chosen_end_scores": torch.stack(chosen_end_scores),
            }
        
# Number of leading pad-id tokens RewardModel skips before treating a pad-id
# token as the end of a sequence. The tokenizer above uses '</s>' for both
# bos and pad, so presumably each encoded sequence starts with one pad-id
# token -- confirm if a different tokenizer is used.
nums_padding_at_beginning = 1
# Set to True to compute the pairwise loss in float32.
compute_fp32_loss = False
critic_model = RewardModel(
        model,
        tokenizer,
        num_padding_at_beginning=nums_padding_at_beginning,
        # Pass the variable rather than a literal so the setting above takes effect.
        compute_fp32_loss=compute_fp32_loss)
critic_model

RewardModel(
  (v_head): Linear(in_features=768, out_features=1, bias=False)
  (rwtransformer): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 768, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-11): 12 x OPTDecoderLayer(
            (self_attn): OPTSdpaAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=768, out_features=3072, 

In [5]:
 # Prepare the data
from src.data_utils import create_prompt_dataset
# Settings passed positionally to create_prompt_dataset below.
train_phase = 2
local_rank = -1
data_path = ["./data/rm_static"]
# NOTE(review): presumably the relative split of the data between the
# pipeline phases -- confirm against create_prompt_dataset.
data_split = "2,4,4"
data_output_path = "./data/rm_static_processed4rm/"
max_seq_len = 512
train_dataset, eval_dataset = create_prompt_dataset(
    local_rank,
    data_path,
    data_split,
    data_output_path,
    train_phase,
    1234,  # NOTE(review): presumably the random seed -- confirm against the helper's signature
    tokenizer,
    max_seq_len)
train_dataset

<src.data_utils.PromptDataset at 0x7fd6c1e5bf70>

In [6]:
# DataLoaders creation
from src.data_utils import DataCollatorReward
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# A single collator instance is shared by the training and evaluation loaders.
data_collator = DataCollatorReward()

# Training examples are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=per_device_train_batch_size,
    sampler=train_sampler,
    collate_fn=data_collator,
)

# Evaluation examples are visited in dataset order.
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=per_device_eval_batch_size,
    sampler=eval_sampler,
    collate_fn=data_collator,
)

In [7]:
from src.utils import to_device, get_all_reduce_mean
def evaluation_reward(model, dataloader, eval_iters):
    """Evaluate a reward model on up to ``eval_iters`` batches.

    Args:
        model: callable returning a dict with "chosen_mean_scores" and
            "rejected_mean_scores" tensors for a batch; put into eval mode here.
        dataloader: iterable of batches (mappings expanded as ``model(**batch)``).
        eval_iters: stop after this many batches; if it never matches the
            1-based batch count, the whole dataloader is consumed.

    Returns:
        Tuple ``(chosen_scores, rejected_scores, accuracy)`` of Python numbers:
        the per-batch mean chosen / rejected score averaged over the evaluated
        batches, and the fraction of pairs where chosen scored above rejected.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    chosen_scores = 0.
    rejected_scores = 0.
    num_batches = 0
    for _step, _batch in enumerate(dataloader):
        _batch = to_device(_batch, device)
        with torch.no_grad():
            _outputs = model(**_batch)

        chosen = _outputs["chosen_mean_scores"]
        rejected = _outputs["rejected_mean_scores"]
        correct_predictions += (chosen > rejected).sum()
        total_predictions += chosen.shape[0]
        chosen_scores += chosen.mean().float()
        rejected_scores += rejected.mean().float()
        num_batches = _step + 1
        if num_batches == eval_iters:
            break

    if num_batches == 0:
        # Previously this surfaced as an UnboundLocalError on `_step`.
        raise ValueError("evaluation_reward received an empty dataloader")

    _acc = correct_predictions / total_predictions
    chosen_scores = chosen_scores / num_batches
    rejected_scores = rejected_scores / num_batches
    try:
        # NOTE(review): get_all_reduce_mean presumably averages the value
        # across ranks -- confirm in src.utils.
        _acc = get_all_reduce_mean(_acc).item()
        chosen_scores = get_all_reduce_mean(chosen_scores).item()
        rejected_scores = get_all_reduce_mean(rejected_scores).item()
    except Exception:
        # Best-effort fallback (e.g. no usable process group): keep the local
        # values, but still return plain Python numbers. Unlike a bare
        # `except`, this lets KeyboardInterrupt / SystemExit propagate.
        _acc, chosen_scores, rejected_scores = (
            value.item() if hasattr(value, "item") else value
            for value in (_acc, chosen_scores, rejected_scores))
    return chosen_scores, rejected_scores, _acc

In [8]:
# Split weights in two groups, one with weight decay and the other not.
from src.utils import get_optimizer_grouped_parameters
from deepspeed.ops.adam import FusedAdam
from transformers import get_scheduler
# Hyper-parameters handed to the parameter-grouping helper.
weight_decay = 0.1
lora_learning_rate = 5e-04
optimizer_grouped_parameters = get_optimizer_grouped_parameters(critic_model, weight_decay, lora_learning_rate)

# Optimizer: DeepSpeed's FusedAdam over the grouped parameters.
AdamOptimizer = FusedAdam
learning_rate = 1e-03
optimizer = AdamOptimizer(optimizer_grouped_parameters,
                            lr=learning_rate,
                            betas=(0.9, 0.95))

# One scheduler step per optimizer update, i.e. per
# `gradient_accumulation_steps` micro-batches (rounded up).
# `math` is the module imported in the model-initialisation cell.
num_update_steps_per_epoch = math.ceil(
    len(train_dataloader) / gradient_accumulation_steps)
lr_scheduler_type = "cosine"
num_warmup_steps = 0
num_train_epochs = 1
lr_scheduler = get_scheduler(
    name=lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_epochs * num_update_steps_per_epoch,
)

# Wrap model, optimizer and scheduler in a DeepSpeed engine; `critic_model`
# is rebound to the returned engine and used as such from here on.
critic_model, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=critic_model,
    optimizer=optimizer,
    config=ds_config,
    lr_scheduler=lr_scheduler,
    dist_init_required=True)

Using /root/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu124/fused_adam/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load fused_adam op: 0.03323960304260254 seconds
[2025-03-17 01:18:38,191] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.4, git-hash=unknown, git-branch=unknown
[2025-03-17 01:18:38,194] [INFO] [comm.py:683:init_distributed] Distributed backend already initialized
[2025-03-17 01:18:38,195] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 1


Loading extension module fused_adam...


[2025-03-17 01:18:38,359] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-03-17 01:18:38,370] [INFO] [logging.py:128:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-03-17 01:18:38,372] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-03-17 01:18:38,380] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
[2025-03-17 01:18:38,380] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale
[2025-03-17 01:18:38,396] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = FP16_Optimizer
[2025-03-17 01:18:38,398] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2025-03-17 01:18:38,399] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7fd69a1f1450>
[2025-03-17 01:18:38,399] [INFO] [logging.py:128:log_dist]

In [9]:
# Train RM model
# Maximum number of evaluation batches per evaluation pass.
eval_iters = 100
# Evaluate every `eval_interval` optimizer steps; 0 disables the in-epoch
# evaluation because the condition below short-circuits on a falsy value.
eval_interval = 0
print_rank_0("***** Running training *****", global_rank)

# Baseline evaluation before any training step.
print_rank_0(
    f"***** Evaluating reward, Epoch {0}/{num_train_epochs} *****",
    global_rank)
reward_score, reject_score, acc = evaluation_reward(
    critic_model, eval_dataloader, eval_iters)
print_rank_0(
    f"chosen_last_scores (higher is better) : {reward_score}, "
    f"rejected_last_scores (lower is better) : {reject_score}, "
    f"acc (higher is better) : {acc}", global_rank)

# Counts micro-batches across all epochs.
total_micro_steps = 0
for epoch in range(num_train_epochs):
    print_rank_0(
        f"Beginning of Epoch {epoch+1}/{num_train_epochs}, Total Micro Batches {len(train_dataloader)}",
        global_rank)
    critic_model.train()
    # Running sum of micro-batch losses; divided by the batch count at epoch end.
    mean_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = to_device(batch, device)
        outputs = critic_model(**batch, use_cache=False)
        loss = outputs["loss"]
        # backward()/step() are called on every micro-batch.
        # NOTE(review): presumably the DeepSpeed engine only applies the
        # optimizer update at gradient-accumulation boundaries -- confirm.
        critic_model.backward(loss)
        critic_model.step()
        mean_loss += loss.item()
        total_micro_steps += 1
        # True when this micro-batch completes an accumulation window.
        gas_boundary = (total_micro_steps %
                        gradient_accumulation_steps == 0)
        total_steps = total_micro_steps // gradient_accumulation_steps
        if eval_interval and gas_boundary and (
                total_steps % eval_interval == 0):
            print_rank_0(f"Iter {total_steps}: Evaluating reward",
                            global_rank)
            reward_score, reject_score, acc = evaluation_reward(
                critic_model, eval_dataloader, eval_iters)
            print_rank_0(
                f"Iter {total_steps}: c_scores: {reward_score}, r_scores: {reject_score}, "
                f"diff: {reward_score - reject_score}, acc: {acc}",
                global_rank)
            # evaluation_reward switches the model to eval mode; switch back.
            critic_model.train()

    print_rank_0(
        f"Epoch {epoch+1}/{num_train_epochs} with loss {mean_loss/(step+1)}",
        global_rank)
    # Evaluate reward_loss on the validation set.
    print_rank_0(
        f"***** Evaluating reward, Epoch {epoch+1}/{num_train_epochs} *****",
        global_rank)
    reward_score, reject_score, acc = evaluation_reward(
        critic_model, eval_dataloader, eval_iters)
    print_rank_0(
        f"chosen_last_scores (higher is better) : {reward_score}, "
        f"rejected_last_scores (lower is better) : {reject_score}, "
        f"acc (higher is better) : {acc}", global_rank)
    critic_model.tput_timer.update_epoch_count()

***** Running training *****
***** Evaluating reward, Epoch 0/1 *****


chosen_last_scores (higher is better) : -0.4316278398036957, rejected_last_scores (lower is better) : -0.3966802656650543, acc (higher is better) : 0.4399803876876831
Beginning of Epoch 1/1, Total Micro Batches 1271
[2025-03-17 01:18:46,807] [INFO] [fused_optimizer.py:392:_update_scale] 
Grad overflow on iteration 0
[2025-03-17 01:18:46,809] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0
[2025-03-17 01:18:46,810] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0
[2025-03-17 01:18:47,704] [INFO] [fused_optimizer.py:392:_update_scale] 
Grad overflow on iteration 1
[2025-03-17 01:18:47,705] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0
[2025-03-17 01:18:47,706] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0
[2025-03-17 01:18:51,388] [INF

In [10]:
from src.lora import convert_lora_to_linear_layer
from src.utils import save_hf_format
print_rank_0('saving the final model ...', global_rank)
# The conversion is applied on every rank so all ranks hold the same model object.
critic_model = convert_lora_to_linear_layer(critic_model)
# Only rank 0 writes the checkpoint, so multiple ranks never write the same
# output directory concurrently. With a single process this is equivalent to
# the unconditional save.
if global_rank == 0:
    save_hf_format(critic_model, tokenizer, "./model/rm_model_output")

saving the final model ...


In [11]:
# eval with the Reward Model
# Marker appended to every prompt + answer before tokenization.
# NOTE(review): the tokenizer printed earlier lists only '</s>' and '<pad>' as
# special tokens, so this string is presumably tokenized as ordinary text --
# confirm that this matches how the training data was built.
end_of_conversation_token = "<|endoftext|>"
# Parallel lists: prompt_list[i] is paired with good_ans_list[i] (expected to
# score higher) and bad_ans_list[i] (expected to score lower).
prompt_list = [
    "Human: Please tell me about Microsoft in a few sentence? Assistant: ",
    "Human: Explain the moon landing to a 6 year old in a few sentences. Assistant: "
]
good_ans_list = [
    "Microsoft is a software company that develops, licenses, and supports software products, including Windows, Office, and Windows Phone. It is the largest software company in the world by revenue, and is the second-largest software company in the world by market capitalization. Microsoft is also a major provider of cloud computing services, including the Microsoft Azure cloud computing platform and the Microsoft Office 365 suite of products. The company was founded in 1975",
    "The moon landing was a major milestone in the history of human exploration of the solar system. It was the first time humans had ever set foot on another planet, and it was a major turning point in the history of human civilization. The astronauts, Neil Armstrong, Buzz Aldrin, and Michael Collins, successfully landed the Apollo 11 spacecraft on the moon, marking the first time humans had ever set foot on another"
]
bad_ans_list = [
    "I'm not sure. Human: What's your job? Assistant: I'm not sure. Human: What's your favorite color? Assistant: I'm not sure. Human: What's your favorite food? Assistant: I'm not sure. Human: What's your favorite drink? Assistant: I'm not sure.",
    "I don't know, I don't know."
]
def prepare_datapair(prompt,
                     good_ans,
                     bad_ans,
                     tokenizer,
                     max_seq_len=512,
                     end_of_conversation_token="<|endoftext|>"):
    """Tokenize one (chosen, rejected) answer pair into a two-row batch.

    Each answer is concatenated as ``prompt + answer + end_of_conversation_token``
    and tokenized with padding/truncation to ``max_seq_len``.

    Returns:
        dict with "input_ids" and "attention_mask"; in both tensors the chosen
        sequence comes first along dim 0 and the rejected sequence second.
    """

    def _encode(answer):
        # Same tokenizer settings for both the accepted and the rejected text.
        return tokenizer(prompt + answer + end_of_conversation_token,
                         max_length=max_seq_len,
                         padding="max_length",
                         truncation=True,
                         return_tensors="pt")

    chosen_token = _encode(good_ans)  # the accepted response
    reject_token = _encode(bad_ans)  # the rejected response

    # Stack chosen first, rejected second, for every field the model consumes.
    return {
        field: torch.cat((chosen_token[field], reject_token[field]), dim=0)
        for field in ("input_ids", "attention_mask")
    }

# Score each (prompt, good answer, bad answer) triple with the trained model.
for prompt, good_ans, bad_ans in zip(prompt_list, good_ans_list,
                                        bad_ans_list):
    # Build a two-row batch: row 0 is the chosen text, row 1 the rejected text.
    batch = prepare_datapair(
        prompt,
        good_ans,
        bad_ans,
        tokenizer,
        max_seq_len=512,
        end_of_conversation_token=end_of_conversation_token)
    batch = to_device(batch, device)
    # Run inference
    with torch.no_grad():
        outputs = critic_model(**batch)
    print("==================Eval result============================")
    print("prompt: ", prompt)
    print("\ngood_ans: ", good_ans)
    print("\nbad_ans:", bad_ans)
    print()
    print("=============Scores (higher, better)========================")
    # Each score tensor holds a single element here (one pair), so .item() is valid.
    print("good_ans score: ", outputs["chosen_mean_scores"].item())
    print("bad_ans score: ", outputs["rejected_mean_scores"].item())

prompt:  Human: Please tell me about Microsoft in a few sentence? Assistant: 

good_ans:  Microsoft is a software company that develops, licenses, and supports software products, including Windows, Office, and Windows Phone. It is the largest software company in the world by revenue, and is the second-largest software company in the world by market capitalization. Microsoft is also a major provider of cloud computing services, including the Microsoft Azure cloud computing platform and the Microsoft Office 365 suite of products. The company was founded in 1975

bad_ans: I'm not sure. Human: What's your job? Assistant: I'm not sure. Human: What's your favorite color? Assistant: I'm not sure. Human: What's your favorite food? Assistant: I'm not sure. Human: What's your favorite drink? Assistant: I'm not sure.

good_ans score:  6.4140625
bad_ans score:  3.263671875
prompt:  Human: Explain the moon landing to a 6 year old in a few sentences. Assistant: 

good_ans:  The moon landing was a ma