### I: Loading the models
First we will load the pretrained models:
- the generative assistant model (trained in *Milestone 3*)
- the reward model (trained in *Milestone 2*)

In [4]:
import sys
!{sys.executable} -m pip install rouge-score

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting rouge-score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=286ddd64b1d143dd11bffe5b451b7d7724d5a88c9ab3c5b7460945829373ba7c
  Stored in directory: /Users/mstyczen/Library/Caches/pip/wheels/b0/3f/ac/cc3bc304f50c77ef38d79d8e4e2684313de39af543cb4eb3da
Successfully built rouge-score
Installing collected packages: absl-py, rouge-score
[33m  DEPRECATION: Configuring install

In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_cosine_schedule_with_warmup, GPT2ForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import nltk
from rouge_score import rouge_scorer
import json
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# load the pre-trained assistant model
ASSISTANT_MODEL_PATH = "./models/sft_model/sft_model_gpt2_with_original_data"
assistant_tokenizer = GPT2Tokenizer.from_pretrained(ASSISTANT_MODEL_PATH)
assistant_model = GPT2LMHeadModel.from_pretrained(ASSISTANT_MODEL_PATH).to(device)
assistant_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
# Load the pre-trained reward model
REWARD_MODEL_PATH = "models/reward_model"
reward_model = GPT2ForSequenceClassification.from_pretrained(REWARD_MODEL_PATH, num_labels=1).to(device)
reward_model.eval()
reward_model.config.pad_token_id = reward_model.config.eos_token_id

In [21]:
MAX_LEN = 100
NUM_BEAMS =  5
NUM_RETURNED_SEQUENCES = 1

def assistant_query(question):
    # Define the input question
    input_ids = assistant_tokenizer.encode(question, return_tensors='pt').to(device)

    # Generate the attention mask
    attention_mask = torch.ones_like(input_ids).to(device)

    # Generate the answer
    output = assistant_model.generate(input_ids=input_ids, attention_mask = attention_mask, max_length=MAX_LEN, num_beams=NUM_BEAMS, no_repeat_ngram_size=2, num_return_sequences=NUM_RETURNED_SEQUENCES, early_stopping=True)

    # Decode and print the response
    return assistant_tokenizer.decode(output[0], skip_special_tokens=True)

assistant_query("What is 1+1?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'What is 1+1?'

### II: Loading the evaluation dataset
We will now load the provided evaluation dataset and add a helper function for preparing model inputs.

In [8]:
TEST_DATASET_PATH = "data/prompts.json"
with open(TEST_DATASET_PATH, "r") as f:
    test_dataset = json.load(f)
sample_question = test_dataset[5]
sample_question

{'guid': '202f9e3f-be4d-4b51-8789-f4eb0666690f',
 'question': 'When using linear regression, how do you help prevent numerical instabilities? (One or multiple answers)',
 'answer': ['add a regularization term', 'remove degenerate features'],
 'choices': ['reduce learning rate',
  'add a regularization term',
  'remove degenerate features',
  'add more features']}

In [9]:
def prepare_model_input(datapoint):
    # initialize the instruction and the question
    
    # if a multiple choice question, add the choices to the model input
    if "choices" not in datapoint or datapoint["choices"] == []:
        prepend = "Please answer the following question and justify your answer.\n"
        question_body = "Question: " + datapoint["question"]
    else:
        prepend = "Please answer the following multiple choice question by selecting one of the options and justify your answer.\n"
        question_body = "Question: " + datapoint["question"] + "\n" + "Options:\n" + "\n".join(datapoint["choices"])
    
    return prepend + question_body

prepare_model_input(sample_question)

'Please answer the following multiple choice question by selecting one of the options and justify your answer.\nQuestion: When using linear regression, how do you help prevent numerical instabilities? (One or multiple answers)\nOptions:\nreduce learning rate\nadd a regularization term\nremove degenerate features\nadd more features'

## III: Syntactic similarity scores
As mentioned in the project plan (*Milestone 1*), we will use the following syntactic similairity measures to perform qualitative evaluation our model's performance:
- BLEU score
- ROUGE score

Below, we define appropriate functions.

In [14]:
# adapted from https://stackoverflow.com/questions/32395880/calculate-bleu-score-in-python

def bleu(hypothesis, reference):
    hypothesis_tokens = [token.lower() for token in hypothesis.split()]
    reference_tokens = [token.lower() for token in reference.split()]
    return nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens)

bleu("Hey Tom, how are you doing?", "Hey Tom, how is it going?")

5.775353993361614e-78

In [37]:
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def rouge(hypothesis, reference):
    return scorer.score(hypothesis, reference)['rouge1']

In [38]:
rouge("Hey Tom, how are you doing?", "Hey Tom, what's up?")

Score(precision=0.4, recall=0.3333333333333333, fmeasure=0.3636363636363636)

In [39]:
rouge("Hey Tom, how are you doing?", "Hey Tom, how is it going?")

Score(precision=0.5, recall=0.5, fmeasure=0.5)

### IV: Reward model scoring
We will also use the reward model to score the assistant model outputs.

In [None]:
# TODO:
def calculate_reward(output):
    pass