In [222]:
import openai
import urllib.request, json 
import string, re
import random
import time
import traceback

with open('API_key.txt', 'r') as file:
    openai.api_key = file.readline().strip()

In [97]:
data_path = "data/bamboogle.json"
with open(data_path, 'r') as f:
    data = json.load(f)
len(data)

125

In [4]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(str(s)))))

def extract_answer(generated):
    if '\n' not in generated:
        last_line =  generated
    else: 
        last_line = generated.split('\n')[-1]

    if ':' not in last_line:
        after_colon = last_line
    else:
        after_colon = generated.split(':')[-1]
    after_colon = after_colon.strip()
    if not after_colon.strip():
        return ""
    return normalize_answer(after_colon)

In [227]:
prompt_prefix = "Answer the question I ask in a format AS CLOSE AS POSSIBLE to the following examples. In particular, state the final answer exactly in the format indicated. DO NOT attempt a more nuanced answer, even for difficult questions, or provide more text than the following example answers for any reason. Examples: \n\n"
system_prompt = f"You are ChatGPT, a large language model trained by OpenAI, based on the GPT-3.5 architecture.\nKnowledge cutoff: 2021-09\nCurrent date: 2023-04-21"

In [215]:
def compute_is_right_EM(prediction, real_answers):
    # might be multiple acceptable answers (this appears rare from my initial scan)
    assert type(real_answers) == list
    # should be cleaned already, do it again just in case
    clean_pred = normalize_answer(prediction)
    is_right = False
    for potential_answer in real_answers:
        potential_answer_clean = normalize_answer(potential_answer)
        if clean_pred == potential_answer_clean:
            is_right = True
    return is_right

def compute_is_right_cover_EM(prediction, real_answers):
    # might be multiple acceptable answers (this appears rare from my initial scan)
    assert type(real_answers) == list
    # should be cleaned already, do it again just in case
    clean_pred = normalize_answer(prediction)
    is_right = False
    for potential_answer in real_answers:
        potential_answer_clean = normalize_answer(potential_answer)
        if potential_answer_clean in clean_pred:
            is_right = True

    return is_right

# def compute_is_right_GPT_opinion(question, prediction, real_answers):
# #     prompt = f"""Your task is to compare a language model's response to a question with one or more acceptable ground truth answers to said question.
# # VERY IMPORTANT: Numerical or ID-code answers MUST MATCH EXACTLY (up to differences in units). For other response types you should interpet responses for semantic similarity to ground truth."""
# #     prompt += f"Here is the question that was asked: \"{question}\"\n"
# #     prompt += f"Here are the acceptable answer(s) for this question: [\"{real_answers[0]}\""
# #     for answer in real_answers[1:]:
# #         prompt += f", \"{answer}\""
# #     prompt += "]\n"
# #     prompt += f"Here is the response of the model which you to are to evaluate: \"{prediction}\"\n"
# #     prompt += """Respond \"CORRECT\" if the model's response matches one of the ground truths and \"INCORRECT\" otherwise.
# # If the model disagrees with the premise of the question or claims there's no answer, you should consider it to be INCORRECT.
# # You should include reasoning in your response, and your response must include \"CORRECT\" or \"INCORRECT\".
# # VERY IMPORTANT: You MUST IGNORE your own opinion on the question's validity or its answer. ONLY COMPARE THE MODEL'S RESPONSE WITH THE GIVEN GROUND TRUTHS."""

#     prompt = f"""Your task is to determine whether a language model has gotten the correct answer to a question. You should only consider responses to be correct
#     if they include the ground truth answer exactly, but it doesn't have to be a perfect substring match or perfect spelling as long as the response
#     is clearly referring to the same thing as the ground truth answer."""
#     # prompt += f"Here is the question that was asked: \"{question}\"\n"
#     prompt += f"Here is the ground truth answer(s) for this question: [\"{real_answers[0]}\""
#     for answer in real_answers[1:]:
#         prompt += f", \"{answer}\""
#     prompt += "]\n"
#     prompt += f"Here is the response of the model which you to are to evaluate: \"{prediction}\"\n"
#     prompt += """Respond \"CORRECT\" if the model's response matches one of the ground truths and \"INCORRECT\" otherwise."""
#     # print(prompt)
#     # print(prompt)
#     response = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         max_tokens=200,
#         stop='\n\n',
#         messages=[
#             {"role": "system", "content": system_prompt},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0
#     )
    
#     return "INCORRECT" not in response['choices'][0]["message"]["content"]

def DA_compute_is_right_GPT_opinion(question, prediction, real_answers):
    prompt = f"""Your task is to determine whether a language model has gotten the correct answer to a question. You should only consider responses to be correct if they are roughly of the form \"the final answer is: <answer>\" or just \"<answer>\" without any other text or reasoning and <answer> matches the ground truth answer.
    Note that the answer needn't have the same spelling and formatting as the ground truth to be correct long as as long as they're clearly referring to the same thing, but if the response contains ANYTHING else, such as reasoning or explanation, it should be considered incorrect because we explicitly told the model not to include explanation."""
    # prompt += f"Here is the question that was asked: \"{question}\"\n"
    prompt += f"Here is the ground truth answer(s) for this question: [\"{real_answers[0]}\""
    for answer in real_answers[1:]:
        prompt += f", \"{answer}\""
    prompt += "]\n"
    prompt += f"Here is the response of the model which you to are to evaluate: \"{prediction}\"\n"
    prompt += """Respond \"CORRECT\" if the model's response matches the ground truth according to the metric I've outlined and \"INCORRECT\" otherwise."""
    # print(prompt)
    # print(prompt)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        max_tokens=200,
        stop='\n\n',
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    
    return "INCORRECT" not in response['choices'][0]["message"]["content"]

def COT_compute_is_right_GPT_opinion(question, prediction, real_answers):
    prompt = f"""Your task is to determine whether a language model has gotten the correct answer to a question. You should only consider responses to be correct
    if they include the ground truth answer exactly, but it doesn't have to be a perfect substring match or perfect spelling as long as the response
    is clearly referring to the same thing as the ground truth answer. It is acceptable (and expected) for the response to contain other text such as reasoning."""
    # prompt += f"Here is the question that was asked: \"{question}\"\n"
    prompt += f"Here is the ground truth answer(s) for this question: [\"{real_answers[0]}\""
    for answer in real_answers[1:]:
        prompt += f", \"{answer}\""
    prompt += "]\n"
    prompt += f"Here is the response of the model which you to are to evaluate: \"{prediction}\"\n"
    prompt += """Respond \"CORRECT\" if the model's response matches the ground truth according to the metric I've outlined and \"INCORRECT\" otherwise."""
    # print(prompt)
    # print(prompt)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        max_tokens=200,
        stop='\n\n',
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    
    return "INCORRECT" not in response['choices'][0]["message"]["content"]

def SA_compute_is_right_GPT_opinion(question, prediction, real_answers):
    prompt = f"""Your task is to determine whether a language model has gotten the correct answer to a question. You should only consider responses to be correct
    if they include the ground truth answer exactly, but it doesn't have to be a perfect substring match or perfect spelling as long as the response
    is clearly referring to the same thing as the ground truth answer. It is acceptable (and expected) for the response to contain other text, likely
    a series of subquestions and their answers that assist in answering the original question."""
    # prompt += f"Here is the question that was asked: \"{question}\"\n"
    prompt += f"Here is the ground truth answer(s) for this question: [\"{real_answers[0]}\""
    for answer in real_answers[1:]:
        prompt += f", \"{answer}\""
    prompt += "]\n"
    prompt += f"Here is the response of the model which you to are to evaluate: \"{prediction}\"\n"
    prompt += """Respond \"CORRECT\" if the model's response matches the ground truth according to the metric I've outlined and \"INCORRECT\" otherwise."""
    # print(prompt)
    # print(prompt)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        max_tokens=200,
        stop='\n\n',
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    
    return "INCORRECT" not in response['choices'][0]["message"]["content"]



In [229]:
def run_LM_full_question(prompt, sample, current_model, compute_is_right_GPT_opinion, extraction=extract_answer, max_tokens=450, start='Answer:'):
    question = sample['Question']
    cur_prompt = prompt_prefix + prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start
    # cur_prompt = prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        ans = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop='\n\n',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cur_prompt}
            ]
        )
        response_text = ans['choices'][0]["message"]["content"]
    else:
        ans = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop='\n\n',
            prompt=cur_prompt,
            temperature=0
        )
        response_text = ans['choices'][0]['text']
    failure_response = {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right_EM": False,
        "is_right_CEM": False,
        "is_right_GPT_opinion": False
    }
    if response_text.strip() == '':
        return failure_response
    # print("not empty: " + response_text.strip())
    clean_ans = extraction(response_text)
    # print("clean answer: " + clean_ans)
    if clean_ans.strip() == '':
        return failure_response
    is_right_EM = compute_is_right_EM(clean_ans, [sample['Answer']])
    is_right_CEM = compute_is_right_cover_EM(response_text, [sample['Answer']])
    is_right_GPT_opinion = compute_is_right_GPT_opinion(question, response_text, [sample['Answer']])
    jsonres = {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right_EM": is_right_EM,
        "is_right_CEM": is_right_CEM,
        "is_right_GPT_opinion": is_right_GPT_opinion
    }
    return jsonres

In [238]:
def run_LM_subquestions(prompt_1, prompt_2, sample, current_model, compute_is_right_GPT_opinion, extraction=extract_answer, max_tokens=250, start='Answer:'):
    prompt1_with_q = prompt_1 + '\n' + '\n' + 'Question: ' + sample['Q1'] + '\n' + start
    prompt2_with_q = prompt_2 + '\n' + '\n' + 'Question: ' + sample['Q2'] + '\n' + start


    prompt1_with_q = prompt_prefix + prompt_1 + '\n' + '\n' + 'Question: ' + sample['Q1'] + '\n' + start
    prompt2_with_q = prompt_prefix + prompt_2 + '\n' + '\n' + 'Question: ' + sample['Q2'] + '\n' + start

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        q1_response = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop="\n\n",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt1_with_q}
            ],
            temperature=0
        )
        ans1 = q1_response['choices'][0]["message"]["content"]
        
        q2_response = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop="\n\n",
            messages=[
                {"role": "system", "content":  system_prompt},
                {"role": "user", "content": prompt2_with_q}
            ],
            temperature=0
        )
        ans2 = q2_response['choices'][0]["message"]["content"]
    else:
        q1_response = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop="\n\n",
            prompt=prompt1_with_q,
            temperature=0
        )
        ans1 = q1_response['choices'][0]['text']

        q2_response = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop="\n\n",
            prompt=prompt2_with_q,
            temperature=0
        )
        ans2 = q2_response['choices'][0]['text']

    q1_clean_ans = extraction(ans1)
    q1_is_right_EM = compute_is_right_EM(q1_clean_ans, sample['A1'])
    q1_is_right_CEM = compute_is_right_cover_EM(ans1, sample['A1'])
    q1_is_right_GPT_opinion = compute_is_right_GPT_opinion(sample["Q1"], ans1, [sample['A1']])


    q2_clean_ans = extraction(ans2)
    q2_is_right_EM = compute_is_right_EM(q2_clean_ans, sample['A2'])
    q2_is_right_CEM = compute_is_right_cover_EM(ans2, sample['A2'])
    q2_is_right_GPT_opinion = compute_is_right_GPT_opinion(sample["Q2"], ans2, [sample['A2']])


    jsonres = {
        "Q1": sample['Q1'],
        "Q1_gt": sample['A1'],
        "Q1_pred": q1_clean_ans,
        "Q1_is_right_EM": q1_is_right_EM,
        "Q1_is_right_CEM": q1_is_right_CEM,
        "Q1_is_right_GPT_opinion": q1_is_right_GPT_opinion,
        "Q2": sample['Q2'],
        "Q2_gt": sample['A2'],
        "Q2_pred": q2_clean_ans,
        "Q2_is_right_EM": q2_is_right_EM,
        "Q2_is_right_CEM": q2_is_right_CEM,
        "Q2_is_right_GPT_opinion": q2_is_right_GPT_opinion,
        "Q1_and_Q2_right_EM": q1_is_right_EM and q2_is_right_EM,
        "Q1_and_Q2_right_CEM": q1_is_right_CEM and q2_is_right_CEM,
        "Q1_and_Q2_right_GPT_opinion":  q2_is_right_GPT_opinion and q2_is_right_GPT_opinion,
        "Q1_prompt": prompt1_with_q,
        "Q2_prompt": prompt2_with_q,
        'Q1_returned': ans1,
        'Q2_returned': ans2
    }

    return jsonres


In [231]:
def update_results_dict(results, subquestions_result, direct_answer_result, chain_of_thought_result, self_ask_result):
    metrics = ["EM", "CEM", "GPT_opinion"]

    for metric in metrics:
        if subquestions_result[f"Q1_and_Q2_right_{metric}"]:
            results["summary"][f"both_subquestions_correct_{metric}"] += 1

            if not direct_answer_result[f"is_right_{metric}"]:
                results["summary"][f"full_question_direct_answer_wrong_when_subquestions_correct_{metric}"] += 1

            if not chain_of_thought_result[f"is_right_{metric}"]:
                results["summary"][f"full_question_chain_of_thought_wrong_when_subquestions_correct_{metric}"] += 1

            if not self_ask_result[f"is_right_{metric}"]:
                results["summary"][f"full_question_self_ask_wrong_when_subquestions_correct_{metric}"] += 1

        if direct_answer_result[f"is_right_{metric}"]:
            results["summary"][f"full_question_direct_answer_correct_{metric}"] += 1
            if not subquestions_result[f"Q1_and_Q2_right_{metric}"]:
                results["summary"][f"subquestions_wrong_when_full_question_direct_answer_correct_{metric}"] += 1

        if chain_of_thought_result[f"is_right_{metric}"]:
            results["summary"][f"full_question_chain_of_thought_correct_{metric}"] += 1
            if not subquestions_result[f"Q1_and_Q2_right_{metric}"]:
                results["summary"][f"subquestions_wrong_when_full_question_chain_of_thought_correct_{metric}"] += 1

        if self_ask_result[f"is_right_{metric}"]:
            results["summary"][f"full_question_self_ask_correct_{metric}"] += 1
            if not subquestions_result[f"Q1_and_Q2_right_{metric}"]:
                results["summary"][f"subquestions_wrong_when_full_question_self_ask_correct_{metric}"] += 1

    results["per_question_results"].append({
        "subquestions_result": subquestions_result,
        "direct_answer_result": direct_answer_result,
        "chain_of_thought_result": chain_of_thought_result,
        "self_ask_result": self_ask_result
    })


In [232]:

# Load the combined dictionary from the JSON file
with open("data/bamboogle_prompts.json", "r") as json_file:
    all_prompts = json.load(json_file)

# Extract the individual dictionaries
chain_of_thought_prompt = all_prompts["chain_of_thought_prompt"]
self_ask_prompt = all_prompts["self_ask_prompt"]
direct_answer_prompt = all_prompts["direct_answer_prompt"]
subquestions_prompt = all_prompts["subquestions_prompt"]
# subquestion_1_prompt_dict = all_prompts["subquestion_1_prompt_dict"]
# subquestion_2_prompt_dict = all_prompts["subquestion_2_prompt_dict"]


In [239]:
i = 0
results = {
    "per_question_results": [],
    "summary": {
        "full_question_direct_answer_correct_EM": 0,
        "full_question_direct_answer_correct_CEM": 0,
        "full_question_direct_answer_correct_GPT_opinion": 0,
        "full_question_direct_answer_correct_human_opinion": 0,
        "full_question_chain_of_thought_correct_EM": 0,
        "full_question_chain_of_thought_correct_CEM": 0,
        "full_question_chain_of_thought_correct_GPT_opinion": 0,
        "full_question_chain_of_thought_correct_human_opinion": 0,
        "full_question_self_ask_correct_EM": 0,
        "full_question_self_ask_correct_CEM": 0,
        "full_question_self_ask_correct_GPT_opinion": 0,
        "full_question_self_ask_correct_human_opinion": 0,
        "both_subquestions_correct_EM": 0,
        "both_subquestions_correct_CEM": 0,
        "both_subquestions_correct_GPT_opinion": 0,
        "both_subquestions_correct_human_opinion": 0,
        "subquestions_wrong_when_full_question_direct_answer_correct_EM": 0,
        "subquestions_wrong_when_full_question_direct_answer_correct_CEM": 0,
        "subquestions_wrong_when_full_question_direct_answer_correct_GPT_opinion": 0,
        "subquestions_wrong_when_full_question_direct_answer_correct_human_opinion": 0,
        "subquestions_wrong_when_full_question_chain_of_thought_correct_EM": 0,
        "subquestions_wrong_when_full_question_chain_of_thought_correct_CEM": 0,
        "subquestions_wrong_when_full_question_chain_of_thought_correct_GPT_opinion": 0,
        "subquestions_wrong_when_full_question_chain_of_thought_correct_human_opinion": 0,
        "subquestions_wrong_when_full_question_self_ask_correct_EM": 0,
        "subquestions_wrong_when_full_question_self_ask_correct_CEM": 0,
        "subquestions_wrong_when_full_question_self_ask_correct_GPT_opinion": 0,
        "subquestions_wrong_when_full_question_self_ask_correct_human_opinion": 0,
        "full_question_direct_answer_wrong_when_subquestions_correct_EM": 0,
        "full_question_direct_answer_wrong_when_subquestions_correct_CEM": 0,
        "full_question_direct_answer_wrong_when_subquestions_correct_GPT_opinion": 0,
        "full_question_direct_answer_wrong_when_subquestions_correct_human_opinion": 0,
        "full_question_chain_of_thought_wrong_when_subquestions_correct_EM": 0,
        "full_question_chain_of_thought_wrong_when_subquestions_correct_CEM": 0,
        "full_question_chain_of_thought_wrong_when_subquestions_correct_GPT_opinion": 0,
        "full_question_chain_of_thought_wrong_when_subquestions_correct_human_opinion": 0,
        "full_question_self_ask_wrong_when_subquestions_correct_EM": 0,
        "full_question_self_ask_wrong_when_subquestions_correct_CEM": 0,
        "full_question_self_ask_wrong_when_subquestions_correct_GPT_opinion": 0,
        "full_question_self_ask_wrong_when_subquestions_correct_human_opinion": 0,
    }
}

In [243]:
model = "gpt-3.5-turbo"
while i < len(data):
    dp = data[i]
    # print(dp)
    try:
        # category = dp['category']

        # Direct answer
        direct_answer_prompt = direct_answer_prompt
        direct_answer_result = run_LM_full_question(direct_answer_prompt, dp, model, DA_compute_is_right_GPT_opinion)
        # if (not direct_answer_result["is_right_CEM"] and "INCORRECT" not in direct_answer_result["is_right_GPT_opinion"]) \
        #     or (direct_answer_result["is_right_CEM"] and "INCORRECT" in direct_answer_result["is_right_GPT_opinion"]):
        #     for key in direct_answer_result.keys():
        #         print(f"{key}: {direct_answer_result[key]}")

        # for key in direct_answer_result.keys():
        #     print(f"{key}: {direct_answer_result[key]}")


        # Chain of thought
        chain_of_thought_prompt = chain_of_thought_prompt
        chain_of_thought_result = run_LM_full_question(chain_of_thought_prompt, dp, model, COT_compute_is_right_GPT_opinion)
        # if (not chain_of_thought_result["is_right_CEM"] and "INCORRECT" not in chain_of_thought_result["is_right_GPT_opinion"]) or \
        #     (chain_of_thought_result["is_right_CEM"] and "INCORRECT" in chain_of_thought_result["is_right_GPT_opinion"]):
        #     for key in chain_of_thought_result.keys():
        #         print(f"{key}: {chain_of_thought_result[key]}")
        # for key in chain_of_thought_result.keys():
        #     print(f"{key}: {chain_of_thought_result[key]}")
        # print("\n\n\n\n")

        # Self-ask
        self_ask_prompt = self_ask_prompt
        self_ask_result = run_LM_full_question(self_ask_prompt, dp, model, SA_compute_is_right_GPT_opinion, start='Are follow up questions needed here: Yes.\n')
        # print(self_ask_result["Prediction"])
        # print(self_ask_result["Answer"])

        # Subquestions
        subquestion_1_prompt = subquestions_prompt
        subquestion_2_prompt = subquestions_prompt
        subquestions_result = run_LM_subquestions(subquestion_1_prompt, subquestion_2_prompt, dp, model, DA_compute_is_right_GPT_opinion, extraction=extract_answer)
        update_results_dict(results, subquestions_result, direct_answer_result, chain_of_thought_result, self_ask_result)
        print(i)
        i += 1
    except Exception as e:
        print("error: ", e)
        traceback.print_exc()

with open(f'results/Bamboogle_results_{model}_prefixed.json', 'w') as outfile:
    json.dump(results, outfile, ensure_ascii=False, indent=4)


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
error:  The server had an error while processing your request. Sorry about that!


Traceback (most recent call last):
  File "/tmp/ipykernel_419/793832120.py", line 33, in <module>
    self_ask_result = run_LM_full_question(self_ask_prompt, dp, model, SA_compute_is_right_GPT_opinion, start='Are follow up questions needed here: Yes.\n')
  File "/tmp/ipykernel_419/3102365094.py", line 44, in run_LM_full_question
    is_right_GPT_opinion = compute_is_right_GPT_opinion(question, response_text, [sample['Answer']])
  File "/tmp/ipykernel_419/1928888727.py", line 130, in SA_compute_is_right_GPT_opinion
    response = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.

34
35
36
37
38
39
error:  Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 24 Apr 2023 03:32:47 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7bcb47eb2c096826-SEA'}


Traceback (most recent call last):
  File "/tmp/ipykernel_419/793832120.py", line 40, in <module>
    subquestions_result = run_LM_subquestions(subquestion_1_prompt, subquestion_2_prompt, dp, model, DA_compute_is_right_GPT_opinion, extraction=extract_answer)
  File "/tmp/ipykernel_419/1857054115.py", line 55, in run_LM_subquestions
    q1_is_right_GPT_opinion = compute_is_right_GPT_opinion(sample["Q1"], ans1, [sample['A1']])
  File "/tmp/ipykernel_419/1928888727.py", line 77, in DA_compute_is_right_GPT_opinion
    response = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py",

error:  Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 24 Apr 2023 03:32:52 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7bcb48030f786826-SEA'}


Traceback (most recent call last):
  File "/tmp/ipykernel_419/793832120.py", line 10, in <module>
    direct_answer_result = run_LM_full_question(direct_answer_prompt, dp, model, DA_compute_is_right_GPT_opinion)
  File "/tmp/ipykernel_419/3102365094.py", line 44, in run_LM_full_question
    is_right_GPT_opinion = compute_is_right_GPT_opinion(question, response_text, [sample['Answer']])
  File "/tmp/ipykernel_419/1928888727.py", line 77, in DA_compute_is_right_GPT_opinion
    response = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stre

40
41
42
43
44
45
46
47
48
49
50
51
52
error:  The server had an error while processing your request. Sorry about that!


Traceback (most recent call last):
  File "/tmp/ipykernel_419/793832120.py", line 33, in <module>
    self_ask_result = run_LM_full_question(self_ask_prompt, dp, model, SA_compute_is_right_GPT_opinion, start='Are follow up questions needed here: Yes.\n')
  File "/tmp/ipykernel_419/3102365094.py", line 44, in run_LM_full_question
    is_right_GPT_opinion = compute_is_right_GPT_opinion(question, response_text, [sample['Answer']])
  File "/tmp/ipykernel_419/1928888727.py", line 130, in SA_compute_is_right_GPT_opinion
    response = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.

53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
error:  The server had an error while processing your request. Sorry about that!


Traceback (most recent call last):
  File "/tmp/ipykernel_419/793832120.py", line 33, in <module>
    self_ask_result = run_LM_full_question(self_ask_prompt, dp, model, SA_compute_is_right_GPT_opinion, start='Are follow up questions needed here: Yes.\n')
  File "/tmp/ipykernel_419/3102365094.py", line 44, in run_LM_full_question
    is_right_GPT_opinion = compute_is_right_GPT_opinion(question, response_text, [sample['Answer']])
  File "/tmp/ipykernel_419/1928888727.py", line 130, in SA_compute_is_right_GPT_opinion
    response = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.

69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
error:  Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)


Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/usr/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib/python3.9/ssl.py", line 1242, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/lib/python3.9/ssl.py", line 1100, in read
    return self._sslobj.read(len, buffer)
socket.timeout: The read operation timed 

96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124


In [242]:
def save():
    with open(f'results/Bamboogle_results_{model}_prefixed.json', 'w') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)
save()

In [151]:
with open(f'results/Bamboogle_results_{model}.json', 'r') as infile:
    # json.dump(results, outfile, ensure_ascii=False, indent=4)
    results = json.load(infile)
print(results.keys())

dict_keys(['per_question_results', 'summary'])


In [158]:
i = 0

In [162]:
print(i)

8


In [None]:
while i < len(results['per_question_results']):
    dp = results['per_question_results'][i]
    # pretty_json = json.dumps(dp["subquestions_result"], indent=4)
    # dp["subquestions_result"]["Q2_is_right_CEM"] = compute_is_right_cover_EM(dp["subquestions_result"]["Q2_returned"], dp["subquestions_result"]["Q2_gt"])
    # dp["subquestions_result"]["Q1_and_Q2_right_CEM"] = dp["subquestions_result"]["Q2_is_right_CEM"] and dp["subquestions_result"]["Q1_is_right_CEM"]
    # if dp["subquestions_result"]["Q1_and_Q2_right_CEM"]:
    #     count += 1
    # Print the JSON string
    # print(pretty_json)
    print("Q: " + dp["direct_answer_result"]["question"])
    print("DA: " + dp["direct_answer_result"]["returned"])
    print("COT: " + dp["chain_of_thought_result"]["returned"])
    print("SA: " + dp["self_ask_result"]["returned"])
    print("GT: " + dp["direct_answer_result"]["answer"])
    print("\n\n")
    time.sleep(1)

    human_opinion = input(f"{dp['direct_answer_result']['answer']} DA correct?")
    dp["direct_answer_result"]["is_right_human_opinion"] = "t" in human_opinion.lower()
    save()

    human_opinion = input(f"{dp['direct_answer_result']['answer']} COT correct?")
    dp["chain_of_thought_result"]["is_right_human_opinion"] = "t" in human_opinion.lower()
    save()

    human_opinion = input(f"{dp['direct_answer_result']['answer']} SA correct?")
    dp["self_ask_result"]["is_right_human_opinion"] = "t" in human_opinion.lower()
    save()

    print("\n\n")
    i += 1


# results['summary']['both_subquestions_correct_CEM'] = count

Q: Can people who have celiac eat camel meat?
DA: Yes, people with celiac disease can eat camel meat as it does not contain gluten, which is the protein that they need to avoid. However, it's important to make sure the meat is not contaminated during preparation or cooking processes. As with any dietary restrictions or concerns, it's always best to consult a doctor or a registered dietitian.
COT: There is not enough information to provide a definitive answer. However, if camel meat has not been processed with gluten-containing ingredients or contaminated with gluten during preparation, it is possible that people with celiac disease can safely eat camel meat. Nonetheless, it is advisable for individuals with celiac disease to consult with a healthcare professional or a registered dietitian before consuming any food if they are unsure about its gluten content.
SA: Follow up: Can camel meat contain gluten?
Intermediate answer: It is possible for camel meat to be contaminated with gluten i

KeyboardInterrupt: Interrupted by user

In [250]:
import json

# Specify the paths to the JSON files for the two versions
json_file_path1 = f'results/Bamboogle_results_{model}.json'
json_file_path2 = f'results/Bamboogle_results_{model}_prefixed.json'

# Read in the data from the JSON files
with open(json_file_path1, "r") as json_file:
    data1 = json.load(json_file)

with open(json_file_path2, "r") as json_file:
    data2 = json.load(json_file)

# Extract the per-question results from each version
per_question_results1 = data1["per_question_results"]
per_question_results2 = data2["per_question_results"]

# Compare the per-question results and print out differences in correct answers
for i in range(len(per_question_results1)):
    subquestions_result1 = per_question_results1[i]["subquestions_result"]
    direct_answer_result1 = per_question_results1[i]["direct_answer_result"]
    chain_of_thought_result1 = per_question_results1[i]["chain_of_thought_result"]
    self_ask_result1 = per_question_results1[i]["self_ask_result"]

    subquestions_result2 = per_question_results2[i]["subquestions_result"]
    direct_answer_result2 = per_question_results2[i]["direct_answer_result"]
    chain_of_thought_result2 = per_question_results2[i]["chain_of_thought_result"]
    self_ask_result2 = per_question_results2[i]["self_ask_result"]

    # if (
    #     (direct_answer_result1["is_right_EM"] and not direct_answer_result2["is_right_EM"]) or
    #     (direct_answer_result2["is_right_EM"] and not direct_answer_result1["is_right_EM"])
    # ):
    #     print(f"Question {i}: One version got direct answer correct, the other did not")

    # if (
    #     (chain_of_thought_result1["is_right_EM"] and not chain_of_thought_result2["is_right_EM"]) or
    #     (chain_of_thought_result2["is_right_EM"] and not chain_of_thought_result1["is_right_EM"])
    # ):
    #     print(f"Question {i}: One version got chain of thought correct, the other did not")
    pretty_json_1 = json.dumps(per_question_results1[i], indent=4)
    pretty_json_2 = json.dumps(per_question_results2[i], indent=4)
    if (
        (self_ask_result1["is_right_GPT_opinion"] and not self_ask_result2["is_right_GPT_opinion"]) 
        # or (self_ask_result2["is_right_EM"] and not self_ask_result1["is_right_EM"])
    ):
        print(f"Question {i}: One version got self-ask correct, the other did not")
        print(pretty_json_1)
        print(pretty_json_2)
        print("\n\n\n")



Question 6: One version got self-ask correct, the other did not
{
    "Q1": "Who set the precedent of a two term limit?",
    "Q1_gt": [
        "george washington"
    ],
    "Q1_pred": "precedent of twoterm limit for president of united states was set by george washington who voluntarily stepped down after serving two terms in office this precedent was later enshrined in 22nd amendment to us constitution",
    "Q1_is_right_EM": false,
    "Q1_is_right_CEM": true,
    "Q1_is_right_GPT_opinion": false,
    "Q2": "When did George Washington leave office?",
    "Q2_gt": [
        "march 4, 1797"
    ],
    "Q2_pred": "george washington left office on march 4 1797",
    "Q2_is_right_EM": false,
    "Q2_is_right_CEM": true,
    "Q2_is_right_GPT_opinion": false,
    "Q1_and_Q2_right_EM": false,
    "Q1_and_Q2_right_CEM": true,
    "Q1_and_Q2_right_GPT_opinion": false,
    "Q1_prompt": "Question: How old was Theodor Haecker when he died?\nSo the final answer is: 65 years\nQuestion: Why did G