In [1]:
import os

import regex as re
import random
import numpy as np
import torch


seed = 42


"""Set seed for reproducibility."""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [2]:
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import gensim
import torch
import math
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, AutoConfig, AutoModelWithLMHead, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

### Load Data

In [3]:
import json

def load_data(path):
    with open(path, 'rb') as file:
        data = json.load(file)
    return data

In [4]:
train_data, val_data = load_data('./data/train_data.json'), load_data('./data/val_data.json')

In [5]:
print(len(train_data), len(val_data))
train_data[0]

2882 321


{'sol_id': 1693719,
 'question': 'Rods (peak response at $\\lambda=507 \\mathrm{~nm}$ ) and cones (peak response at $555 \\mathrm{~nm}$ ) are the photosensitive cells in human eye. Although\n\nrods are more sensitive, they cannot register colors (unlike cones).\n\nGiven that the sensitivity of cone cells is $1 / 220$ of the rod cells, find the threshold values for cone cells.',
 'choices': None,
 'y_plus_answer': 'The peak response values of rods and cones vary depending on the wavelength of light. \n\nRods, which are responsible for vision in low light conditions, have a peak response at around 498 nanometers (blue-green light).\n\nCones, which are responsible for color vision and visual acuity in bright light conditions, have three subtypes that respond to different wavelengths of light. The peak response values for these cones are:\n\n- S-cones (short-wavelength cones) have a peak response at around 420 nanometers (blue-violet light).\n- M-cones (medium-wavelength cones) have a peak

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### Functions for saving and loading models.

In [7]:
from transformers import WEIGHTS_NAME, CONFIG_NAME, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
import os

def save_model(name, model, tokenizer, dir_ = 'models/'):

    output_dir = "{}/{}/".format(dir_, name)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model_to_save = model.module if hasattr(model, 'module') else model
  
    output_model_file = output_dir +  WEIGHTS_NAME
    output_config_file = output_dir + CONFIG_NAME

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_pretrained(output_dir)

def load_model(name, dir_ = 'models/'):
    output_dir = "{}/{}/".format(dir_, name)
    model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
    tokenizer = AutoTokenizer.from_pretrained(output_dir)
    return model, tokenizer

### Initialize the main model and tokenizer using T5-small

In [31]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_name = "t5-small" 

sft_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
sft_tokenizer = AutoTokenizer.from_pretrained(model_name)

### Upload the Reward Model

In [17]:
from reward_model import (RewardModelConfig, RewardModel)
from transformers import (AutoConfig, AutoModel, AutoTokenizer)

path = "./models/reward_model/models_hf_full/checkpoint-4785/"

AutoConfig.register('RewardModel', RewardModelConfig)
AutoModel.register(RewardModelConfig, RewardModel)
reward_tokenizer = AutoTokenizer.from_pretrained(path)

config = AutoConfig.from_pretrained(path)
reward_model = AutoModel.from_pretrained(path, config=config).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Create a Dataset object based on our data

In [77]:
class QuestionAnswerDataset(Dataset):
    """
    Creating a custom dataset for reading the dataset and
    loading it into the dataloader to pass it to the
    neural network for finetuning the model

    """

    def __init__(
        self, data, tokenizer, max_seq_len, answer_name = "y_plus_answer"):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            question_max_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.tokenized_samples = []
        self.max_seq_len = max_seq_len
        self.answer_name = answer_name

        for sample in data:
            tokenized_sample = self._tokenize(sample)
            if len(tokenized_sample['input_ids']) <= self.max_seq_len:
                self.tokenized_samples.append(tokenized_sample)


    def __len__(self):
        """returns the length of dataframe"""

        return len(self.tokenized_samples)

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""

        return {
            "input_ids": self.tokenized_samples[index]['input_ids'].squeeze(),
            "attention_mask": self.tokenized_samples[index]['attention_mask'].squeeze(),
            "labels": self.tokenized_samples[index]['labels'].squeeze()
        }

    def _tokenize(self, sample):
        question = "Answer to the question: " + sample["question"] + '\n'
        if "choices" in sample and sample["choices"] is not None:
            question += "Answer options: " + " ".join(f"{i}) {choice}" for i, choice in enumerate(sample["choices"]))

        model_inputs = self.tokenizer(question, padding='max_length', truncation=True,
                                      max_length=self.max_seq_len, return_tensors="pt")

        labels = self.tokenizer(text_target = sample[self.answer_name], padding='max_length',
                                truncation=True, max_length=self.max_seq_len, return_tensors="pt")["input_ids"]

        labels[labels == self.tokenizer.pad_token_id] = -100

        model_inputs["labels"] = labels

        return model_inputs

In [78]:
MAX_SEQ_LENGTH= 512

train_dataset = QuestionAnswerDataset(train_data, sft_tokenizer, MAX_SEQ_LENGTH)
val_dataset = QuestionAnswerDataset(val_data, sft_tokenizer, MAX_SEQ_LENGTH)

In [36]:
train_dataset[0]

{'input_ids': tensor([11801,    12,     8,   822,    10,  8222,     7,    41, 14661,  1773,
            44,  1514,     2,    40,   265,   115,    26,     9,  2423,  1752,
           940,     3,     2,  3357,   107,    52,    51,     2,    29,    51,
             2,  3229,     3,    61,    11, 14075,     7,    41, 14661,  1773,
            44,  6422,  3769,     3,     2,  3357,   107,    52,    51,     2,
            29,    51,     2,  3229,     3,    61,    33,     8,  1202, 22118,
          2640,    16,   936,  1580,     5,  1875,  6102,     7,    33,    72,
          6280,     6,    79,  1178,  3691,  2602,    41,   202,  2376, 14075,
             7,   137,  9246,    24,     8,     3, 13398,    13, 14075,  2640,
            19,  1970,     3,    87,   204,  1755,  3229,    13,     8,  6102,
          2640,     6,   253,     8, 12709,  2620,    21, 14075,  2640,     5,
             1,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

### Supervised fine-tuning (SFT)

We fine-tune T5-small on our labeler demonstrations using supervised learning. We trained for 10 epochs with batch size 4. 

In [19]:
dir_ = 'models/SFT'

In [22]:
batch_size = 4

args = Seq2SeqTrainingArguments(
    dir_ + model_name,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
)

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer=sft_tokenizer, model=sft_model)

trainer = Seq2SeqTrainer(
    sft_model,
    args,
    train_dataset=train_dataset, 
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=sft_tokenizer)

In [24]:
trainer.train()
save_model(model_name, trainer.model, trainer.tokenizer, dir_)

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.782319
2,No log,2.661401
3,3.218300,2.60144
4,3.218300,2.559311
5,2.816100,2.533958
6,2.816100,2.512206
7,2.734100,2.500043
8,2.734100,2.490313
9,2.688600,2.48602
10,2.688600,2.484408


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

Let's check the performance of the SFT-model on a sample of questions.

In [34]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

sft__model, sft_tokenizer = load_model('SFT/t5-small')

In [72]:
def generate_answer(model, tokenizer, batch, do_sampling=False):
    if not do_sampling:
        beam_output = rlhf_trainer.model.generate(
                        input_ids=batch['input_ids'].to(rlhf_trainer.model.device),
                        attention_mask=batch['attention_mask'].to(rlhf_trainer.model.device),
                        num_beams = 5,
                        no_repeat_ngram_size = 2, 
                        early_stopping = True,
                        max_length=128)
    else:
        beam_output = rlhf_trainer.model.generate(
                        input_ids=batch['input_ids'].to(rlhf_trainer.model.device),
                        attention_mask=batch['attention_mask'].to(rlhf_trainer.model.device),
                        do_sample = True, 
                        no_repeat_ngram_size = 2, 
                        #top_k = 50, 
                        top_p = 0.85,
                        max_length=128)

    for i, output in enumerate(beam_output):
        print("Question: {}\n".format(i) + 100 * '=')
        print(rlhf_trainer.tokenizer.decode(batch['input_ids'][i], skip_special_tokens=True))
        print(100*'-')
        print("Output: {}\n".format(i))
        print(rlhf_trainer.tokenizer.decode(beam_output[i], skip_special_tokens=True))
        print(100 * '=')

In [43]:
val_loader = DataLoader(val_dataset, batch_size=5, shuffle=True)
it = iter(val_loader)
batch = next(it)

BeamSearch Strategy

In [64]:
generate_answer(sft_model, sft_tokenizer, batch)

Question: 0
Answer to the question: Would you describe this learning procedure as reinforcement, supervised or unsupervised learning? Answer options: 0) It is supervised, since we explicitely provide the correct weights to initialize the network. 1) It is unsupervised since the network learns implicit associations present in the input without any additional teaching signal. 2) It is reinforcement learning since only weight updates only occur when a pattern is retrieved correctly.
----------------------------------------------------------------------------------------------------
Output: 0

The question asks about this learning procedure as reinforcement, supervised or unsupervised learning. Answer: 0: "It's not the same thing as a physical learning technique, since we explicitly explicitly provide the correct weights to initialize the network. This is because we explicitely provides the right to Initialize": False. The answer is false. Therefore, the answer to the question is true.
Que

In [73]:
generate_answer(sft_model, sft_tokenizer, batch, True)

Question: 0
Answer to the question: Would you describe this learning procedure as reinforcement, supervised or unsupervised learning? Answer options: 0) It is supervised, since we explicitely provide the correct weights to initialize the network. 1) It is unsupervised since the network learns implicit associations present in the input without any additional teaching signal. 2) It is reinforcement learning since only weight updates only occur when a pattern is retrieved correctly.
----------------------------------------------------------------------------------------------------
Output: 0

The correct answer is False. This choice is false, but it is only supervised because the weights to initialize the network are correct. 3) It is coached, since the model of the input is not correctly set by the appropriate weight to start it.
Question: 1
Answer to the question: Suppose we use the Simplex method to solve the following linear program: beginalign* textbfmaximize hspace0.8cm & hspace0.4c

### Reinforcement learning (RL). 

We fine-tuned the SFT model on our environment using REINFORCE: Basics. We optimized 
$L = L_{MLE} + αL_{RL}$, where $L_{MLE}$ is MLE for groundth true answer to the question and $L_{RL} = -R(Y_{generated})L_{MLE}(Y_{generated})$

In [13]:
class RLHFTrainer(Seq2SeqTrainer):
    
    def add_reward_model(self, reward_model, reward_tokenizer, rl_alpha):
        self.reward_model = reward_model
        self.reward_tokenizer = reward_tokenizer
        self.rl_alpha = rl_alpha
        
    def compute_loss(self, model, inputs, return_outputs=False):
        # Compute MLE loss. 
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits') # (batch_size, max_length, vocab_len)
        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.permute(0, 2, 1), labels) 
        
        # Compute RL loss. 
        gen_outputs = model.module.generate(
            input_ids = inputs['input_ids'].to(self.model.device),
            attention_mask = inputs['attention_mask'].to(self.model.device), 
            do_sample=False,
            num_beams=4,
            max_length=MAX_SEQ_LENGTH,
            output_scores=True,
            return_dict_in_generate=True)

        batch_size, seq_length = gen_outputs.sequences[:, 1:].shape # skip BOS token
        pads = -100*torch.ones((batch_size, MAX_SEQ_LENGTH - seq_length), dtype=torch.long).to(self.model.device)
        gen_answers_as_labels = torch.cat((gen_outputs.sequences[:, 1:], pads), dim=1)

        gen_input_info = {
                'input_ids': inputs['input_ids'].to(self.model.device),
                'attention_mask': inputs['attention_mask'].to(self.model.device), 
                'labels': gen_answers_as_labels.to(self.model.device),
                'return_dict': True}
        
        gen_answer_outputs = model(**gen_input_info)
        gen_answer_logits = gen_answer_outputs.get('logits')
        rl_criterion = nn.CrossEntropyLoss(reduce=False)
        rl_mle = torch.mean(rl_criterion(gen_answer_logits.permute(0, 2, 1), gen_answers_as_labels), dim=-1)
        
        #Compute rewards
        questions = self.tokenizer.batch_decode(inputs['input_ids'], skip_special_tokens=True)
        answers = self.tokenizer.batch_decode(gen_outputs.sequences, skip_special_tokens=True)
        question_answer_pairs = list(map(lambda i: "[CLS] " + questions[i] + " [SEP] " + answers[i], np.arange(batch_size)))
        reward_inputs = self.reward_tokenizer.batch_encode_plus(question_answer_pairs, padding='max_length', 
                      truncation=True, max_length=MAX_SEQ_LENGTH, return_tensors="pt")['input_ids']
        with torch.no_grad():
            rewards = 5 + self.reward_model(reward_inputs.to(self.reward_model.device)).reshape(-1).to(self.model.device)   
        
        # Mix RL loss with MLE
        loss = loss + self.rl_alpha*torch.mean(rl_mle*rewards)

        return (loss, outputs) if return_outputs else loss

In [14]:
torch.cuda.empty_cache()

In [18]:
dir_ = 'models/RLHF'

batch_size = 2

args = Seq2SeqTrainingArguments(
    dir_,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
)

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer=sft_tokenizer, model=sft_model.to(device))

rlhf_trainer = RLHFTrainer(
    sft_model.to(device),
    args,
    train_dataset=train_dataset, 
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=sft_tokenizer)

In [20]:
rlhf_trainer.add_reward_model(reward_model, reward_tokenizer, 0.1)

In [21]:
rlhf_trainer.train()
save_model('t5_small', rlhf_trainer.model, rlhf_trainer.tokenizer, dir_)

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.471221
2,3.140700,2.420626
3,2.726700,2.391729
4,2.663500,2.379277
5,2.635100,2.374271


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

In [47]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

rlhf_model, rlhf_tokenizer = load_model('RLHF/t5_small')

In [75]:
generate_answer(rlhf_model, rlhf_tokenizer, batch)

Question: 0
Answer to the question: Would you describe this learning procedure as reinforcement, supervised or unsupervised learning? Answer options: 0) It is supervised, since we explicitely provide the correct weights to initialize the network. 1) It is unsupervised since the network learns implicit associations present in the input without any additional teaching signal. 2) It is reinforcement learning since only weight updates only occur when a pattern is retrieved correctly.
----------------------------------------------------------------------------------------------------
Output: 0

The question asks about this learning procedure as reinforcement, supervised or unsupervised learning. The answer answer is 0: It is the reinforcement learning, since we explicitly provide the correct weights to initialize the network. Answer 2: "Imployment learning": False. This means that a network learns implicit associations present in the input without any additional teaching signal. Therefore, 

In [76]:
generate_answer(rlhf_model, rlhf_tokenizer, batch, True)

Question: 0
Answer to the question: Would you describe this learning procedure as reinforcement, supervised or unsupervised learning? Answer options: 0) It is supervised, since we explicitely provide the correct weights to initialize the network. 1) It is unsupervised since the network learns implicit associations present in the input without any additional teaching signal. 2) It is reinforcement learning since only weight updates only occur when a pattern is retrieved correctly.
----------------------------------------------------------------------------------------------------
Output: 0

Answer #2: It is not supervised, since we explicitly provide the correct weights to initialize the network. The question asks about reinforcement, sag, and unsupervised learning. Answer #3: "I am a reinforcement learning system": True. It contains weight-messages, not words, in which translates the information about it. In this case, the number of weight changes can be measured in ten seconds. I am t

### Evaluation using a Reward Model

Function for generating answers to the questions asked

In [15]:
def generate_answers(model, tokenizer, questions):
    MAX_SEQ_LENGTH= 512

    generated_answers = []

    for sample in questions:
        question = "Answer to the question: " + sample["question"] + '\n'
        if "choices" in sample and sample["choices"] is not None:
            question += "Answer options: " + " ".join("{}) {}".format(i, choice) for i, choice in enumerate(sample["choices"]))

        model_inputs = tokenizer(question, padding='max_length', truncation=True,
                                      max_length=MAX_SEQ_LENGTH, return_tensors="pt")

        beam_output = model.generate(
                        input_ids=model_inputs['input_ids'].to(model.device),
                        attention_mask=model_inputs['attention_mask'].to(model.device),
                        num_beams = 5,
                        no_repeat_ngram_size = 2, 
                        early_stopping = True,
                        max_length=256)

        sample['generated_answer'] = tokenizer.batch_decode(beam_output, skip_special_tokens=True)[0]
        generated_answers.append(sample)
        
    return generated_answers

Function for preparing input data for the reward model

In [9]:
def format_reward_input(data):
    reward_input = []
    for sample in data:
        question = sample["question"]
        if "choices" in sample and sample["choices"] is not None:
            question += " ".join(f"{i}) {choice}" for i, choice in enumerate(sample["choices"]))
        input_ids = reward_tokenizer.encode("[CLS] " + question + " [SEP] " + sample['generated_answer'], 
                        padding='max_length', truncation=True, max_length=512, return_tensors="pt")    
        reward_input.append(input_ids)
        
    return torch.cat(reward_input)

Function for calculating the average reward for generated responses

In [13]:
def calculate_avg_reward(reward_input, model_name='SFT'):
    batch_size = 4
    rewards = []
    for i in range(0, len(reward_input), batch_size):
        batch = reward_input[i:i+batch_size].to(reward_model.device)
        rewards += list(reward_model(batch).detach().cpu().numpy())
    print('Model: ', model_name)
    print('Number of samples: ', len(rewards))
    print('Mean Reward: ', np.mean(rewards))

In [11]:
questions = load_data('prompts.json')

### SFT model

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

sft_model, sft_tokenizer = load_model('SFT/t5-small')

In [18]:
generated_answers = generate_answers(sft_model, sft_tokenizer, questions)
reward_input = format_reward_input(generated_answers)
calculate_avg_reward(reward_input, model_name='SFT')

Model:  SFT
Number of samples:  100
Mean Reward:  1.4191724


### Final RLHF model

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

rlhf_model, rlhf_tokenizer = load_model('RLHF/t5_small')

In [20]:
generated_answers = generate_answers(rlhf_model, rlhf_tokenizer, questions)
reward_input = format_reward_input(generated_answers)
calculate_avg_reward(reward_input, model_name='RLHF')

Model:  RLHF
Number of samples:  100
Mean Reward:  1.8264363
