In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import nltk
from rouge_score import rouge_scorer
import json

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### I: Loading the evaluation dataset
We will now load the provided evaluation dataset and add a helper function for preparing model inputs.

In [3]:
# Location of the evaluation prompts file (JSON).
TEST_DATASET_PATH = "data/prompts.json"
with open(TEST_DATASET_PATH, "r") as f:
    test_dataset = json.loads(f.read())

# Keep one record as a running example; the bare last expression displays it.
sample_question = test_dataset[5]
sample_question

{'guid': '202f9e3f-be4d-4b51-8789-f4eb0666690f',
 'question': 'When using linear regression, how do you help prevent numerical instabilities? (One or multiple answers)',
 'answer': ['add a regularization term', 'remove degenerate features'],
 'choices': ['reduce learning rate',
  'add a regularization term',
  'remove degenerate features',
  'add more features']}

In [4]:
def prepare_model_input(datapoint):
    """Build the text prompt that is fed to a model for one question.

    Args:
        datapoint: Mapping with a "question" string and, for multiple choice
            questions, a "choices" list of option strings.

    Returns:
        The instruction line followed by "Question: ..." and, when choices
        are present, an "Options:" section with one option per line.
    """
    # A missing, None or empty "choices" entry all denote an open question.
    # (None is accepted here to stay consistent with generate_positive_sample.)
    choices = datapoint["choices"] if "choices" in datapoint else None

    if not choices:
        prepend = "Please answer the following question and justify your answer.\n"
        question_body = "Question: " + datapoint["question"]
    else:
        # if a multiple choice question, add the choices to the model input
        prepend = "Please answer the following multiple choice question by selecting one of the options and justify your answer.\n"
        question_body = "Question: " + datapoint["question"] + "\n" + "Options:\n" + "\n".join(choices)

    return prepend + question_body

# Preview the prompt built for the sample multiple choice question.
prepare_model_input(sample_question)

'Please answer the following multiple choice question by selecting one of the options and justify your answer.\nQuestion: When using linear regression, how do you help prevent numerical instabilities? (One or multiple answers)\nOptions:\nreduce learning rate\nadd a regularization term\nremove degenerate features\nadd more features'

## III: Syntactic similarity scores
As mentioned in the project plan (*Milestone 1*), we will use the following syntactic similarity measures to perform a quantitative evaluation of our model's performance:
- BLEU score
- ROUGE score

Below, we define appropriate functions.

In [5]:
# adapted from https://stackoverflow.com/questions/32395880/calculate-bleu-score-in-python

def bleu(hypothesis, reference, smoothing_function=None):
    """Compute the sentence-level BLEU score of a hypothesis against one reference.

    Both texts are split on whitespace and lower-cased before scoring.

    Args:
        hypothesis: Generated text to evaluate.
        reference: Ground-truth text to compare against.
        smoothing_function: Optional NLTK smoothing callable (e.g. a method of
            ``nltk.translate.bleu_score.SmoothingFunction()``). The default
            ``None`` keeps NLTK's unsmoothed behaviour, which collapses to
            (almost) zero and emits a warning whenever some n-gram order has
            no overlap.

    Returns:
        The BLEU score as a float.
    """
    # Import the submodule explicitly instead of relying on `import nltk`
    # having loaded `nltk.translate.bleu_score` as a side effect.
    from nltk.translate.bleu_score import sentence_bleu

    hypothesis_tokens = [token.lower() for token in hypothesis.split()]
    reference_tokens = [token.lower() for token in reference.split()]
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing_function,
    )

# Sanity check on two short sentences: they share no 4-gram, so unsmoothed BLEU is effectively zero.
bleu("Hey Tom, how are you doing?", "Hey Tom, how is it going?")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


5.775353993361614e-78

In [6]:
# Unigram-overlap ROUGE (ROUGE-1) with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def rouge(hypothesis, reference):
    """Return the ROUGE-1 F-measure of a hypothesis against a reference.

    Args:
        hypothesis: Generated text to evaluate.
        reference: Ground-truth text to compare against.

    Returns:
        The ROUGE-1 F-measure as a float.
    """
    # RougeScorer.score takes (target, prediction). The F-measure is symmetric
    # in its two arguments, so the value is unaffected by the order, but
    # precision/recall would be swapped if they were ever read from the result.
    return scorer.score(reference, hypothesis)['rouge1'].fmeasure

In [52]:
# Sanity check: score a sentence against a loosely related greeting.
rouge("Hey Tom, how are you doing?", "Hey Tom, what's up?")

0.3636363636363636

In [53]:
# Sanity check: score the same sentence against a closer paraphrase.
rouge("Hey Tom, how are you doing?", "Hey Tom, how is it going?")

0.5

### IV: Reward model scoring
We will also use the reward model to score the assistant model outputs.

In [7]:
# Directory holding both the reward model weights and its tokenizer files.
REWARD_MODEL_PATH = "models/reward_model"

# Single-logit sequence classifier used as the reward model, in inference mode.
reward_model = GPT2ForSequenceClassification.from_pretrained(REWARD_MODEL_PATH, num_labels=1)
reward_model = reward_model.to(device)
# Use the end-of-sequence token id as the padding token id.
reward_model.config.pad_token_id = reward_model.config.eos_token_id
reward_model.eval()

reward_tokenizer = GPT2Tokenizer.from_pretrained(REWARD_MODEL_PATH)

In [8]:
# calculate reward function score for a given text input 
def calculate_reward(text):
    """Score a single text with the reward model.

    Args:
        text: The text to score; it is truncated/padded to 1024 tokens.

    Returns:
        The reward model's single output logit as a Python float.
    """
    inputs = reward_tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    # The model was moved to `device`; the input tensors must live there too,
    # otherwise the forward pass fails whenever CUDA is available.
    inputs = inputs.to(device)
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        reward = reward_model(**inputs).logits
    return reward.item()

# Smoke test: score a short sentence with the reward model.
calculate_reward("Hey Tom, how are you doing?")

-2.1174707412719727

### Generate ground truth examples
Similarly to what we proposed in M2, we generate ground-truth answers from the provided answers using ChatGPT.

In [9]:
import gpt_wrapper
from tqdm import tqdm
from gpt_wrapper import APIException
# Read the API key through a context manager so the file handle is closed right away.
with open("./secrets/api_key.json", "r") as api_key_file:
    gpt_wrapper.api_key = json.load(api_key_file)
from gpt_wrapper.chat import Chat

# what should we do as system input?
def generate_positive_sample(datapoint, print_chatgpt_input=False):
    """Ask ChatGPT to restate and justify the known correct answer of a question.

    The request consists of a question body (question, optional choices, the
    correct answer(s), optional explanation) and an instruction sentence that
    is assembled to match whichever of those parts are present.

    Args:
        datapoint: Mapping with "guid", "question" and "answer" entries, and
            optionally "choices" (list of strings) and "explanation" (string).
            "answer" may be a single value or a list; list items may
            themselves be lists.
        print_chatgpt_input: When True, print the assembled question body and
            instruction before sending them (debugging aid).

    Returns:
        The stripped text of the reply, or "" when the request raised an
        APIException (the error is printed together with the guid).
    """
    # annotation with ChatGPT
    chat = gpt_wrapper.chat.Chat.create(name="Annotator")

    # initialize the instruction and the question
    instruction = "Given the question"
    question_body = "Question: " + datapoint["question"]

    # if a multiple choice question, add the choices to the model input
    if "choices" in datapoint and datapoint["choices"] is not None and datapoint["choices"] != []:
        question_body += "\nChoices:\n" + "\n".join(datapoint["choices"])
        instruction += ", the choices, "
    else:
        # Without this separator the instruction would read "Given the questionand ...".
        instruction += " "

    # add the answer to the model input
    if isinstance(datapoint["answer"], list):
        # this is the case for multiple choice question with multiple correct options - the answer can be an array of correct options
        answers = datapoint["answer"]

        # some solutions even have a nested list as the answer, see "sol_id": 2296267
        # Join the *items* of a nested list; joining str(answer) would insert a
        # space between every single character of the list's repr.
        answers = [
            " ".join(str(part) for part in answer) if isinstance(answer, list) else str(answer)
            for answer in answers
        ]

        question_body += "\nAnswers:\n" + "\n".join(answers)
        instruction += "and the correct answers, repeat the correct answers and produce a justification why they are correct."
    else:
        # this is the case for a question with a single answer
        question_body += "\nAnswer:\n" + str(datapoint["answer"])
        instruction += "and the correct answer, repeat the answer and produce a justification why the given answer is correct."

    # if explanation provided, append it to the model input
    if "explanation" in datapoint and datapoint["explanation"] is not None and datapoint["explanation"] != "":
        question_body += "\nExplanation:\n" + datapoint["explanation"]
        instruction += " Base your justification on the provided explanation."

    # for debugging purposes
    if print_chatgpt_input:
        print("Question body:\n", question_body, "\n\n", "Instruction:\n", instruction)

    try:
        output = chat.ask(content=question_body, instruction=instruction, model_args={"max_tokens": 1024}).content.strip()
    except APIException as exception:
        # Best effort: report the failure and return an empty answer so a
        # batch run over many questions can continue.
        exception_msg = exception.args[0]
        print("Error for id: ", datapoint["guid"], ": ", exception_msg)
        output = ""

    return output

In [11]:
# generate ground truth for each question, keyed by the question's guid
# (questions whose request failed are stored with an empty string)
ground_truths = {}
for datapoint in tqdm(test_dataset):
    ground_truths[datapoint["guid"]] = generate_positive_sample(datapoint)

# Persist the generated answers; the file name is kept exactly as it was.
with open("data/promts_ground_truth.json", "w") as f:
    json.dump(ground_truths, f)

 10%|█         | 10/100 [00:26<03:02,  2.03s/it]

Error for id:  92251192-94ec-4d07-a94c-34775db00d70 :  Server Error


 58%|█████▊    | 58/100 [03:01<01:20,  1.93s/it]

OpenAI rate limit hit.
Retrying in 10 seconds...


100%|██████████| 100/100 [05:37<00:00,  3.37s/it]


### Compare results against ground truth
We use `gen_script_mhy.py` to generate answers to the test questions using different models (gpt-2-medium without fine-tuning, gpt-2-medium fine-tuned on the original data, gpt-2-medium fine-tuned on the augmented data, and the model trained with PPO). Then, we evaluate the results:
- we compute BLEU and ROUGE similarity scores against the ground truth and compare them across the models
- we calculate reward model scores
- we perform a qualitative evaluation

In [43]:
def load_generated_answers(model_name):
    """Load the answers a model generated for the test questions.

    Reads the JSON Lines file ``answers_mhy_{model_name}.jsonl`` from the
    current working directory. Each non-blank line must be a JSON object with
    a "guid" and an "answer" entry.

    Args:
        model_name: Suffix identifying the model in the answers file name.

    Returns:
        Dict mapping each guid to its answer. If a guid occurs more than
        once, the last occurrence wins.
    """
    answers_path = f"answers_mhy_{model_name}.jsonl"
    answers = {}
    with open(answers_path, mode='r') as f:
        # Stream the file line by line instead of materialising it first.
        for line in f:
            line_ = line.strip()
            # Tolerate blank lines (e.g. a trailing empty line), which
            # json.loads would otherwise reject.
            if not line_:
                continue
            data = json.loads(line_)
            answers[data["guid"]] = data["answer"]
    return answers

# Answers of the four models under comparison, each a dict keyed by question guid.
gpt2_pretrained_answers, sft_answers, sft_aug_answers, ppo_answers = (
    load_generated_answers(run_name)
    for run_name in (
        "pretrained_gpt2_medium",
        "sft_model_gpt2_medium_with_original_data",
        "sft_model_gpt2_medium_with_augmentation_data",
        "ppo_model",
    )
)

In [44]:
import pandas as pd

# calculate rouge and bleu scores
def bleu_scores(answers):
    """Return the BLEU score of each answer against its ground truth.

    Iterates over the module-level `ground_truths` dict, so `answers` must
    contain every guid present there.
    """
    per_question = []
    for guid, truth in ground_truths.items():
        per_question.append(bleu(answers[guid], truth))
    return pd.Series(per_question)

def rouge_scores(answers):
    """Return the ROUGE score of each answer against its ground truth.

    Iterates over the module-level `ground_truths` dict, so `answers` must
    contain every guid present there.
    """
    per_question = []
    for guid, truth in ground_truths.items():
        per_question.append(rouge(answers[guid], truth))
    return pd.Series(per_question)


# Per-question BLEU for every model (same evaluation order as before).
ppo_bleu, sft_bleu, sft_aug_bleu, gpt2_pretrained_bleu = (
    bleu_scores(model_answers)
    for model_answers in (ppo_answers, sft_answers, sft_aug_answers, gpt2_pretrained_answers)
)

# Per-question ROUGE for every model.
ppo_rouge, sft_rouge, sft_aug_rouge, gpt2_pretrained_rouge = (
    rouge_scores(model_answers)
    for model_answers in (ppo_answers, sft_answers, sft_aug_answers, gpt2_pretrained_answers)
)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [45]:
# Mean BLEU per model, in the order: pretrained, SFT, SFT + augmentation, PPO.
tuple(scores.mean() for scores in (gpt2_pretrained_bleu, sft_bleu, sft_aug_bleu, ppo_bleu))

(0.001284177264116556,
 0.004595554529084039,
 0.012106199821505893,
 0.007869554808111903)

In [46]:
# Mean ROUGE per model, in the order: pretrained, SFT, SFT + augmentation, PPO.
tuple(scores.mean() for scores in (gpt2_pretrained_rouge, sft_rouge, sft_aug_rouge, ppo_rouge))

(0.13335171461658504,
 0.17377090792391786,
 0.2595082035655366,
 0.29363926572496685)

In [39]:
def calculate_reward_scores(answers):
    """Score every answer text in `answers` with the reward model.

    Uses `progress_apply`, so `tqdm.pandas()` must have been called before
    this function is invoked.
    """
    answer_texts = pd.Series(answers.values())
    return answer_texts.progress_apply(calculate_reward)

# Enable `progress_apply` on pandas objects before scoring.
tqdm.pandas()

# Reward model scores for the answers of each model.
(
    gpt2_pretrained_reward_scores,
    sft_reward_scores,
    sft_aug_reward_scores,
    ppo_reward_scores,
) = map(
    calculate_reward_scores,
    (gpt2_pretrained_answers, sft_answers, sft_aug_answers, ppo_answers),
)

100%|██████████| 97/97 [01:48<00:00,  1.12s/it]


In [48]:
# Mean reward per model, in the order: pretrained, SFT, SFT + augmentation, PPO.
tuple(
    scores.mean()
    for scores in (
        gpt2_pretrained_reward_scores,
        sft_reward_scores,
        sft_aug_reward_scores,
        ppo_reward_scores,
    )
)

(-0.6691875358678631,
 -0.7146618721872261,
 -0.08533569379258402,
 0.15368630621851104)