In [44]:
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import nltk
from rouge_score import rouge_scorer
import json

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### I: Loading the evaluation dataset
We will now load the provided evaluation dataset and add a helper function for preparing model inputs.

In [64]:
# Location of the evaluation prompts (a JSON array of question records).
TEST_DATASET_PATH = "data/prompts.json"

# Read the file as UTF-8 explicitly so that loading does not depend on the
# platform's default locale encoding (which is not UTF-8 everywhere).
with open(TEST_DATASET_PATH, "r", encoding="utf-8") as f:
    test_dataset = json.load(f)

# Keep one record around as a running example for the cells below.
sample_question = test_dataset[5]
sample_question

{'guid': '202f9e3f-be4d-4b51-8789-f4eb0666690f',
 'question': 'When using linear regression, how do you help prevent numerical instabilities? (One or multiple answers)',
 'answer': ['add a regularization term', 'remove degenerate features'],
 'choices': ['reduce learning rate',
  'add a regularization term',
  'remove degenerate features',
  'add more features']}

In [47]:
def prepare_model_input(datapoint):
    """Build the text prompt for one evaluation record.

    Args:
        datapoint: Mapping with a "question" string and, for multiple choice
            questions, a "choices" sequence listing the options.

    Returns:
        An instruction line followed by "Question: <question>". For multiple
        choice questions an "Options:" section with one choice per line is
        appended.

    Raises:
        KeyError: If ``datapoint`` has no "question" entry.
    """
    # A missing, empty or null "choices" entry all mean "open-ended question";
    # treating None like [] avoids a TypeError when joining the options.
    choices = datapoint.get("choices")

    if not choices:
        prepend = "Please answer the following question and justify your answer.\n"
        question_body = "Question: " + datapoint["question"]
    else:
        # NOTE(review): the instruction asks for "one of the options" even though
        # some questions accept several answers; wording is left as-is because
        # changing the prompt changes what the model is asked.
        prepend = "Please answer the following multiple choice question by selecting one of the options and justify your answer.\n"
        # str() keeps non-string options (e.g. numbers) from breaking the join.
        options = "\n".join(str(choice) for choice in choices)
        question_body = "Question: " + datapoint["question"] + "\n" + "Options:\n" + options

    return prepend + question_body

# Preview the prompt built for the sample multiple choice question loaded earlier.
prepare_model_input(sample_question)

'Please answer the following multiple choice question by selecting one of the options and justify your answer.\nQuestion: When using linear regression, how do you help prevent numerical instabilities? (One or multiple answers)\nOptions:\nreduce learning rate\nadd a regularization term\nremove degenerate features\nadd more features'

### III: Syntactic similarity scores
As mentioned in the project plan (*Milestone 1*), we will use the following syntactic similarity measures to perform a qualitative evaluation of our model's performance:
- BLEU score
- ROUGE score

Below, we define appropriate functions.

In [14]:
# adapted from https://stackoverflow.com/questions/32395880/calculate-bleu-score-in-python

def bleu(hypothesis, reference, smoothing_function=None):
    """Sentence-level BLEU of ``hypothesis`` against a single ``reference``.

    Both strings are split on whitespace and lower-cased, so punctuation stays
    attached to the neighbouring word.

    Args:
        hypothesis: The generated text to score.
        reference: The gold text to compare against.
        smoothing_function: Optional smoothing callable forwarded to NLTK's
            ``sentence_bleu`` (for example
            ``nltk.translate.bleu_score.SmoothingFunction().method1``). The
            default ``None`` keeps NLTK's unsmoothed behaviour, under which a
            short pair with no matching 4-gram scores practically zero.

    Returns:
        The BLEU score as a float.
    """
    hypothesis_tokens = [token.lower() for token in hypothesis.split()]
    reference_tokens = [token.lower() for token in reference.split()]
    # sentence_bleu expects a list of tokenized references, hence the wrapping list.
    return nltk.translate.bleu_score.sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing_function,
    )

# Sanity check on a short pair. The two sentences share no 4-gram, so the
# unsmoothed BLEU computed here comes out at practically zero (see the output).
bleu("Hey Tom, how are you doing?", "Hey Tom, how is it going?")

5.775353993361614e-78

In [51]:
# ROUGE-1 scorer with stemming enabled; built once and reused by rouge().
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def rouge(hypothesis, reference):
    """Return the ROUGE-1 F-measure between ``hypothesis`` and ``reference``.

    Args:
        hypothesis: The generated text to score.
        reference: The gold text to compare against.

    Returns:
        The ``fmeasure`` field of the 'rouge1' score, as a float.
    """
    # RougeScorer.score takes (target, prediction), i.e. the reference first.
    # The F-measure is symmetric in precision and recall, so the returned value
    # is the same either way, but this order keeps precision/recall correctly
    # labelled should they ever be read from the score.
    return scorer.score(reference, hypothesis)['rouge1'].fmeasure

In [52]:
# Sanity check: two sentences that share only the opening greeting.
rouge("Hey Tom, how are you doing?", "Hey Tom, what's up?")

0.3636363636363636

In [53]:
# Sanity check: a closer pair, which also shares the word "how".
rouge("Hey Tom, how are you doing?", "Hey Tom, how is it going?")

0.5

### IV: Reward model scoring
We will also use the reward model to score the assistant model outputs.

In [65]:
# Load the pre-trained reward model
REWARD_MODEL_PATH = "models/reward_model"

# Single-logit classification head: calculate_reward reads that one logit as the reward.
reward_model = GPT2ForSequenceClassification.from_pretrained(REWARD_MODEL_PATH, num_labels=1).to(device)
# Use EOS as the padding id on the model side so padded inputs are recognised.
reward_model.config.pad_token_id = reward_model.config.eos_token_id
# Inference only: switch off training-time behaviour such as dropout.
reward_model.eval()

reward_tokenizer = GPT2Tokenizer.from_pretrained(REWARD_MODEL_PATH)
# GPT-2 tokenizers come without a padding token unless one was saved with the
# checkpoint. Padding is requested when scoring, so fall back to EOS (matching
# the model config above) instead of failing at call time.
if reward_tokenizer.pad_token is None:
    reward_tokenizer.pad_token = reward_tokenizer.eos_token

In [66]:
# calculate reward function score for a given text input 
def calculate_reward(text):
    """Score ``text`` with the reward model.

    Args:
        text: The string to score. It is tokenized with truncation and padding
            to a fixed length of 1024 tokens.

    Returns:
        The reward model's single output logit as a Python float.
    """
    inputs = reward_tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    # Move the encoded tensors to the device the model lives on; without this
    # the forward pass fails whenever the model was placed on a GPU.
    inputs = {name: tensor.to(reward_model.device) for name, tensor in inputs.items()}
    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        reward = reward_model(**inputs).logits
    return reward.item()

# Smoke test: score a short sentence with the reward model.
calculate_reward("Hey Tom, how are you doing?")

-2.1174707412719727