# LLM-as-judge (GPT or Claude) evaluation of responses generated by two models

In [1]:

from typing import Dict, List, Optional, Iterator, Callable, Union, Tuple
from tqdm import trange
import numpy as np
import random
import os
import json
from openai import OpenAI
import anthropic
import sys 
sys.path.append("../..")
from tqdm import tqdm
import time

# Judge backend used by GPT_completion: 'gpt' (OpenAI) or 'claude' (Anthropic).
EVALUATOR = 'claude' #'gpt'# 

In [2]:
def gpt_evaluation_prompt_generator(prompt, response_A, response_B):
    """Build the pairwise-comparison instruction that is sent to the judge model.

    Args:
        prompt: The chat history shown to the judge under "History:".
        response_A: Candidate reply labelled "Response A".
        response_B: Candidate reply labelled "Response B".

    Returns:
        A single string asking for a one-sentence comparison followed by a
        final line of the form `More helpful: <"A" or "B">`.
    """
    sections = (
        'Given the history of multi-round chat, which response is more helpful?\n\n',
        f'History:\n{prompt}\n\n',
        f'Response A: {response_A}\n\n',
        f'Response B: {response_B}\n\n',
        'FIRST provide a one-sentence comparison of the two responses and explain which you feel is more helpful. ',
        'SECOND, on a new line, state only "A" or "B" to indicate which response is more helpful. ',
        'Your response should use the format:\n\n',
        'Comparison: <one-sentence comparison and explanation>\n',
        'More helpful: <"A" or "B">',
    )
    return ''.join(sections)

def GPT_completion(msg_to_gpt, temp=0.7, max_retries=5):
    """Send one prompt to the configured judge model and return its reply text.

    The backend is chosen by the module-level EVALUATOR:
    'gpt' calls the OpenAI chat-completions API with model 'gpt-3.5-turbo';
    'claude' calls the Anthropic messages API with model 'claude-3-haiku-20240307'.
    Both use the system prompt 'You are a helpful assistant.' and top_p = 1.

    Args:
        msg_to_gpt: Text sent as the single user message.
        temp: Sampling temperature forwarded to the API.
        max_retries: Retry budget handed to the SDK client. Both SDKs default
            to 2; a higher value lets a long evaluation run ride out transient
            failures such as an HTTP 500 from the API.

    Returns:
        The text of the first choice / first content block of the reply.

    Raises:
        ValueError: If EVALUATOR is neither 'gpt' nor 'claude'.
    """
    if EVALUATOR == 'gpt':
        sys_msg = [{"role": 'system', "content": 'You are a helpful assistant.'}]
        GPT_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
            max_retries=max_retries,
        )
        response = GPT_client.chat.completions.create(
                            model = 'gpt-3.5-turbo',
                            messages = sys_msg + [{'role': 'user', 'content': msg_to_gpt}],
                            temperature=temp,
                            top_p = 1,
                            logprobs = False,
                            )
        return response.choices[0].message.content
    if EVALUATOR == 'claude':
        # Anthropic takes the system prompt as a separate argument, not as a message.
        Claude_client = anthropic.Anthropic(
            api_key=os.environ.get("ANTHROPIC_API_KEY"),
            max_retries=max_retries,
        )
        response = Claude_client.messages.create(
                            model = 'claude-3-haiku-20240307',
                            system = 'You are a helpful assistant.',
                            messages = [{'role': 'user', 'content': msg_to_gpt}],
                            temperature=temp,
                            max_tokens=2048,
                            top_p = 1,
                            )
        return response.content[0].text
    # Previously an unrecognised value fell through and returned None, which only
    # surfaced later as an AttributeError in the caller.
    raise ValueError(f"Unknown EVALUATOR {EVALUATOR!r}; expected 'gpt' or 'claude'")

In [3]:
# Sanity check: render the judge prompt for a toy example.
print(gpt_evaluation_prompt_generator('hello world', 'test A', 'test B'))

Given the history of multi-round chat, which response is more helpful?

History:
hello world

Response A: test A

Response B: test B

FIRST provide a one-sentence comparison of the two responses and explain which you feel is more helpful. SECOND, on a new line, state only "A" or "B" to indicate which response is more helpful. Your response should use the format:

Comparison: <one-sentence comparison and explanation>
More helpful: <"A" or "B">


## Extract responses from the JSONL files

In [4]:
# Root folder that holds one sub-directory per experiment plus `results_logs`.
# Set the EVAL_DATA_PATH environment variable to run on another machine; the
# fallback is the original hard-coded local path.
PATH = os.environ.get(
    'EVAL_DATA_PATH',
    'E://P5_5_SFT_dynamics//finetuning_dynamics//data//helpful-base//methods_compare',
)

In [5]:
def extrac_dict(exp_name, base_path=None):
    """Load prompts and generated responses for one experiment.

    Reads `<base_path>/<exp_name>/prob_test_gen_response.jsonl`, one JSON object
    per line with the keys 'prompt' and 'response'.

    Args:
        exp_name: Name of the experiment sub-directory.
        base_path: Folder containing the experiment directories. Defaults to the
            module-level PATH.

    Returns:
        A dict {"prompt": [...], "response": [...]} with one entry per record,
        in file order. Each prompt has leading/trailing newlines stripped. Each
        response is the text after the echoed prompt when the prompt occurs in
        it, otherwise the text after the first 'Assistant: ' marker, otherwise
        the whole response.
    """
    root = PATH if base_path is None else base_path
    response_dict = {"prompt": [], "response": []}
    jsonl_path = os.path.join(root, exp_name, 'prob_test_gen_response.jsonl')
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can fail on
    # or garble non-ASCII text in the generations.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not a record.
                continue
            record = json.loads(line)
            prompt = record['prompt'].strip('\n')
            tmp_response = record['response'].strip('\n')
            # `prompt and ...`: an empty prompt is "in" every string, but
            # str.split('') raises ValueError.
            if prompt and prompt in tmp_response:
                response = tmp_response.split(prompt)[1].strip(' ')
            else:
                pieces = tmp_response.split('Assistant: ')
                # With no marker present there is nothing to cut off; keep the
                # response as generated instead of raising IndexError.
                response = pieces[1] if len(pieces) > 1 else tmp_response
            response_dict["prompt"].append(prompt)
            response_dict["response"].append(response)
    return response_dict

In [6]:
# Experiment directories of the two models being compared.
# Other pairs used before: 'extend_dpo_qwen18' vs 'baseline_dpo_qwen18',
# and 'extend_sft_qwen18' vs 'baseline_sft_qwen18'.
MODEL_A = 'extend_dpo_qwen18_ep4'
MODEL_B = 'extend_dpo_qwen18_ep2'
# Load model A first, then model B.
resA_all, resB_all = [extrac_dict(exp_name=exp) for exp in (MODEL_A, MODEL_B)]

## Feed the response pairs to the evaluator one by one

In [7]:
# Pairs in which either response exceeds this many characters are skipped:
# they are neither sent to the evaluator nor counted in all_cnt.
MAX_RESPONSE_CHARS = 2000


def _extract_verdict(evaluation):
    """Return 'A' or 'B' from the evaluator's reply, or None if it gives neither.

    The verdict is whatever follows the last ':' in the reply (the requested
    format ends with `More helpful: <"A" or "B">`). Surrounding whitespace,
    quotes, angle brackets, asterisks and a trailing period are ignored, so
    `More helpful: "A"` or a reply ending in a newline still counts.
    """
    final_field = evaluation.rsplit(':', 1)[-1]
    verdict = final_field.strip().strip('"\'<>.*').strip()
    return verdict if verdict in ('A', 'B') else None


LEN = len(resA_all['prompt'])
# Both models must have been run on the same prompts, in the same order.
assert LEN == len(resB_all['prompt']), (
    f"{MODEL_A} has {LEN} prompts but {MODEL_B} has {len(resB_all['prompt'])}"
)
eval_log_file = f"{MODEL_A}_vs_{MODEL_B}_{EVALUATOR}.txt"
log_dir = os.path.join(PATH, 'results_logs')
os.makedirs(log_dir, exist_ok=True)  # the log folder may not exist on a first run

A_win_cnt = 0
B_win_cnt = 0
all_cnt = 0
with open(os.path.join(log_dir, eval_log_file), 'a', encoding='utf-8') as f:
    f.write(f"####### We are doing an evaluation between [{MODEL_A}] and [{MODEL_B}] #######\n")
    try:
        for i in tqdm(range(LEN)):
            assert resA_all['prompt'][i] == resB_all['prompt'][i], f"prompt mismatch at index {i}"
            prompt = resA_all['prompt'][i]
            resA = resA_all['response'][i]
            resB = resB_all['response'][i]
            if len(resA) > MAX_RESPONSE_CHARS or len(resB) > MAX_RESPONSE_CHARS:
                continue
            eval_prompt = gpt_evaluation_prompt_generator(prompt=prompt, response_A=resA, response_B=resB)
            GPT_response = GPT_completion(msg_to_gpt=eval_prompt)
            # ---------- Record and extract GPT_response
            f.write(f'  #======== Question {i}: {prompt[:40]}\n')
            f.write(f'  #======== GPT eval: \n {GPT_response}\n')
            GPT_answer = _extract_verdict(GPT_response)
            # all_cnt counts every evaluated pair, including replies with no clear verdict.
            all_cnt += 1
            if GPT_answer == 'A':
                A_win_cnt += 1
            elif GPT_answer == 'B':
                B_win_cnt += 1
    finally:
        # Write the tally even if the loop is cut short (API error, interrupt),
        # so the log always records what was counted up to that point.
        f.write(f'#======== Model A win {A_win_cnt}/{all_cnt} times, model B win {B_win_cnt}/{all_cnt} ==========\n')
        f.write('################### EVALUATION ENDS #################\n')

# Win rates are taken over decided comparisons only; with none decided the
# ratio is undefined, so report NaN instead of raising ZeroDivisionError.
decided_cnt = A_win_cnt + B_win_cnt
wra = A_win_cnt / decided_cnt if decided_cnt else float('nan')
wrb = B_win_cnt / decided_cnt if decided_cnt else float('nan')
print(f'#======== Model A win {A_win_cnt}/{all_cnt} times, model B win {B_win_cnt}/{all_cnt}, A win {wra}, B win {wrb} ==========\n')

 46%|████▌     | 231/500 [04:47<05:34,  1.24s/it]


InternalServerError: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}

In [8]:
# Show the win counts accumulated so far by the evaluation loop.
print('#======== Model A win {}/{} times, model B win {}/{} ==========\n'.format(
    A_win_cnt, all_cnt, B_win_cnt, all_cnt))




In [None]:
# Win rates over the decided comparisons only (those counted as an A or B win).
decided_cnt = A_win_cnt + B_win_cnt
# With no decided comparison the ratio is undefined; report NaN instead of
# raising ZeroDivisionError.
wra = A_win_cnt / decided_cnt if decided_cnt else float('nan')
wrb = B_win_cnt / decided_cnt if decided_cnt else float('nan')
print(f'#======== Model A win {A_win_cnt}/{all_cnt} times, model B win {B_win_cnt}/{all_cnt}, A win {wra}, B win {wrb} ==========\n')