In [34]:
!pip install openai



In [27]:
import json
import random
import re
import string
import time
import urllib.request

import openai

# The OpenAI key is kept out of the notebook: it is taken from the first line
# of a local text file, with surrounding whitespace/newline removed.
with open('ofir_API_key.txt', 'r') as file:
    first_line = file.readline()
openai.api_key = first_line.strip()

In [28]:
# Load the evaluation samples; the JSON file stores them under its "data" key.
data_path = "data/compositional_celebrities_subset.json"
with open(data_path, 'r') as f:
    dataset_json = json.load(f)
data = dataset_json["data"]
# Display the number of loaded samples as the cell output.
len(data)

1000

In [29]:
def normalize_answer(s):
    """Canonicalise an answer so that two answers can be compared as strings.

    The input is converted with ``str()`` and then, in this order:
    lower-cased, stripped of every character in ``string.punctuation``,
    has the standalone words "a", "an" and "the" replaced by a space, and
    has all whitespace runs collapsed to single spaces (leading/trailing
    whitespace removed).
    """
    text = str(s).lower()

    # Punctuation is dropped before the article pass, so e.g. "the," is
    # reduced to "the" first and then removed as an article.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)

    text = re.sub(r'\b(a|an|the)\b', ' ', text)

    return ' '.join(text.split())

def extract_answer(generated):
    """Pull the final answer out of raw model output and normalise it.

    Only the last line of ``generated`` is considered. If that line contains
    a colon, the text after its last colon is used; otherwise the whole line
    is. The candidate is stripped and, when non-empty, passed through
    ``normalize_answer``. Returns "" when nothing is left after stripping.
    """
    # rpartition yields the whole string as its third element when the
    # separator is absent, which covers the "no newline" / "no colon" cases.
    final_line = generated.rpartition('\n')[2]
    candidate = final_line.rpartition(':')[2].strip()
    return normalize_answer(candidate) if candidate else ""

In [4]:
def compute_is_right_EM(prediction, real_answers):
    """Exact-match score: does the prediction equal any accepted answer?

    ``real_answers`` must be a plain ``list`` of acceptable answers (an
    AssertionError is raised otherwise). Both sides are passed through
    ``normalize_answer`` before comparison, even if the prediction was
    already cleaned upstream. Returns a bool.
    """
    assert type(real_answers) is list
    normalized_prediction = normalize_answer(prediction)
    normalized_references = [normalize_answer(reference) for reference in real_answers]
    return normalized_prediction in normalized_references

def compute_is_right_cover_EM(prediction, real_answers):
    """Cover-EM score: does the prediction contain any accepted answer?

    ``real_answers`` must be a plain ``list`` of acceptable answers (an
    AssertionError is raised otherwise). Both sides are passed through
    ``normalize_answer``; the result is True when at least one normalised
    answer occurs as a substring of the normalised prediction.
    """
    assert type(real_answers) is list
    normalized_prediction = normalize_answer(prediction)
    normalized_references = [normalize_answer(reference) for reference in real_answers]
    return any(reference in normalized_prediction for reference in normalized_references)

In [30]:
# Optional instruction header for prompts. It is currently only referenced by
# a commented-out line in run_LM_full_question.
prompt_prefix = (
    "Answer the question I ask in a format as close as possible to the following examples. "
    "State the final answer portion AS CONCISELY AS POSSIBLE.\n\n"
)

# System message sent with every chat-endpoint request.
system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n"
    "Knowledge cutoff: 2021-09\n"
    "Current date: 2023-04-18"
)

In [36]:
def run_LM_full_question(prompt, sample, current_model, extraction=extract_answer, max_tokens=450, start='Answer:'):
    """Ask the model the full question of ``sample`` and score its answer.

    Args:
        prompt: Few-shot prompt text placed before the question.
        sample: Dataset record; ``sample['Question']`` and ``sample['Answer']``
            are read. ``sample['Answer']`` must be a list (the scoring helpers
            assert this).
        current_model: OpenAI model name. "gpt-3.5-turbo" and "gpt-4" are sent
            to the chat endpoint (with ``system_prompt``); every other name is
            sent to the completion endpoint.
        extraction: Callable turning the raw model text into a cleaned answer.
        max_tokens: Generation limit passed to the API.
        start: Text appended after the question to seed the model's answer.

    Returns:
        A dict with the keys ``question``, ``prompt``, ``answer``,
        ``returned`` (raw model text), ``cleaned_pred``, ``is_right_EM`` and
        ``is_right_CEM``. When the raw response or the extracted answer is
        blank, ``cleaned_pred`` is "" and both flags are False; in that case
        the capitalised keys ``Question``, ``Answer``, ``Prediction`` and
        ``Cleaned_pred`` are included as well, because the failure record
        used to carry only those.
    """
    question = sample['Question']
    # Variant with the instruction header (kept for reference):
    # cur_prompt = prompt_prefix + prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start
    cur_prompt = prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        ans = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop='\n\n',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cur_prompt}
            ],
            # Same setting as the completion branch below and as
            # run_LM_subquestions; without it the chat models are sampled at
            # the API's default temperature and runs are not comparable.
            temperature=0
        )
        response_text = ans['choices'][0]["message"]["content"]
    else:
        ans = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop='\n\n',
            prompt=cur_prompt,
            temperature=0
        )
        response_text = ans['choices'][0]['text']

    # Start from the "nothing usable" record; it is filled in below when an
    # answer could be extracted.
    result = {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "cleaned_pred": "",
        "is_right_EM": False,
        "is_right_CEM": False
    }

    # A blank response is never handed to the extraction callable.
    clean_ans = extraction(response_text) if response_text.strip() != '' else ""
    if clean_ans.strip() == '':
        # Keep the historical failure keys so existing consumers of failure
        # records continue to work.
        result.update({
            "Question": sample['Question'],
            "Answer": sample['Answer'],
            "Prediction": response_text,
            "Cleaned_pred": ""
        })
        return result

    result["cleaned_pred"] = clean_ans
    result["is_right_EM"] = compute_is_right_EM(clean_ans, sample['Answer'])
    result["is_right_CEM"] = compute_is_right_cover_EM(clean_ans, sample['Answer'])
    return result

In [35]:
def run_LM_subquestions(prompt_1, prompt_2, sample, current_model, extraction=extract_answer, max_tokens=250, start='Answer:'):
    """Ask the model the two sub-questions of ``sample`` separately and score both.

    Args:
        prompt_1: Few-shot prompt text used for the first sub-question.
        prompt_2: Few-shot prompt text used for the second sub-question.
        sample: Dataset record; ``Q1``/``Q2`` (questions) and ``A1``/``A2``
            (accepted answers, which must be lists) are read.
        current_model: OpenAI model name. "gpt-3.5-turbo" and "gpt-4" are sent
            to the chat endpoint (with ``system_prompt``); every other name is
            sent to the completion endpoint.
        extraction: Callable turning raw model text into a cleaned answer.
        max_tokens: Generation limit passed to the API.
        start: Text appended after each question to seed the model's answer.

    Returns:
        A dict holding, per sub-question, the question, accepted answers,
        cleaned prediction and EM / cover-EM flags, the combined
        ``Q1_and_Q2_right_*`` flags, the prompts sent and the raw responses.
    """
    def _with_question(few_shot_prompt, sub_question):
        # Final prompt layout: examples, blank line, question, answer seed.
        return few_shot_prompt + '\n' + '\n' + 'Question: ' + sub_question + '\n' + start

    def _ask(full_prompt):
        # One request at temperature 0; returns the generated text only.
        if current_model in ["gpt-3.5-turbo", "gpt-4"]:
            reply = openai.ChatCompletion.create(
                model=current_model,
                max_tokens=max_tokens,
                stop="\n\n",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": full_prompt}
                ],
                temperature=0
            )
            return reply['choices'][0]["message"]["content"]

        reply = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            stop="\n\n",
            prompt=full_prompt,
            temperature=0
        )
        return reply['choices'][0]['text']

    def _score(raw_text, accepted_answers):
        # Returns (cleaned prediction, exact match, cover exact match).
        cleaned = extraction(raw_text)
        exact = compute_is_right_EM(cleaned, accepted_answers)
        cover = compute_is_right_cover_EM(cleaned, accepted_answers)
        return cleaned, exact, cover

    # Both prompts are assembled before any request is made.
    prompt1_with_q = _with_question(prompt_1, sample['Q1'])
    prompt2_with_q = _with_question(prompt_2, sample['Q2'])

    ans1 = _ask(prompt1_with_q)
    ans2 = _ask(prompt2_with_q)

    q1_clean_ans, q1_is_right_EM, q1_is_right_CEM = _score(ans1, sample['A1'])
    q2_clean_ans, q2_is_right_EM, q2_is_right_CEM = _score(ans2, sample['A2'])

    return {
        "Q1": sample['Q1'],
        "Q1_gt": sample['A1'],
        "Q1_pred": q1_clean_ans,
        "Q1_is_right_EM": q1_is_right_EM,
        "Q1_is_right_CEM": q1_is_right_CEM,
        "Q2": sample['Q2'],
        "Q2_gt": sample['A2'],
        "Q2_pred": q2_clean_ans,
        "Q2_is_right_EM": q2_is_right_EM,
        "Q2_is_right_CEM": q2_is_right_CEM,
        "Q1_and_Q2_right_EM": q1_is_right_EM and q2_is_right_EM,
        "Q1_and_Q2_right_CEM": q1_is_right_CEM and q2_is_right_CEM,
        "Q1_prompt": prompt1_with_q,
        "Q2_prompt": prompt2_with_q,
        'Q1_returned': ans1,
        'Q2_returned': ans2
    }


In [31]:

# All few-shot prompt collections live in a single JSON file.
with open("data/prompts.json", "r") as json_file:
    all_prompts = json.load(json_file)

# Bind each collection to its own name. The evaluation loop indexes every one
# of them by a sample's 'category'.
(
    chain_of_thought_prompt_dict,
    self_ask_prompt_dict,
    direct_answer_prompt_dict,
    subquestion_1_prompt_dict,
    subquestion_2_prompt_dict,
) = (
    all_prompts["chain_of_thought_prompt_dict"],
    all_prompts["self_ask_prompt_dict"],
    all_prompts["direct_answer_prompt_dict"],
    all_prompts["subquestion_1_prompt_dict"],
    all_prompts["subquestion_2_prompt_dict"],
)


# Now you can use the individual dictionaries in your script


In [39]:
# Index of the next sample to evaluate. It lives in its own cell so that the
# evaluation loop cell can be re-run without starting over.
i = 0

# Running counters, all starting at zero. EM = exact match, CEM = cover exact
# match (see compute_is_right_EM / compute_is_right_cover_EM).
_summary_counter_names = (
    "full_question_direct_answer_correct_EM",
    "full_question_direct_answer_correct_CEM",
    "full_question_chain_of_thought_correct_EM",
    "full_question_chain_of_thought_correct_CEM",
    "full_question_self_ask_correct_EM",
    "full_question_self_ask_correct_CEM",
    "both_subquestions_correct_EM",
    "both_subquestions_correct_CEM",
    "subquestions_wrong_when_full_question_direct_answer_correct_EM",
    "subquestions_wrong_when_full_question_direct_answer_correct_CEM",
    "subquestions_wrong_when_full_question_chain_of_thought_correct_EM",
    "subquestions_wrong_when_full_question_chain_of_thought_correct_CEM",
    "subquestions_wrong_when_full_question_self_ask_correct_EM",
    "subquestions_wrong_when_full_question_self_ask_correct_CEM",
    "full_question_direct_answer_wrong_when_subquestions_correct_EM",
    "full_question_direct_answer_wrong_when_subquestions_correct_CEM",
    "full_question_chain_of_thought_wrong_when_subquestions_correct_EM",
    "full_question_chain_of_thought_wrong_when_subquestions_correct_CEM",
    "full_question_self_ask_wrong_when_subquestions_correct_EM",
    "full_question_self_ask_wrong_when_subquestions_correct_CEM",
)

results = {
    # One entry per evaluated sample, appended by the evaluation loop.
    "per_question_results": [],
    "summary": dict.fromkeys(_summary_counter_names, 0),
}


In [37]:
model = "gpt-4"

# A sample is retried when any of its requests fails, but only a bounded
# number of times in a row: a deterministic error (e.g. a category with no
# prompt) would otherwise be retried forever.
MAX_CONSECUTIVE_FAILURES = 5
RETRY_BACKOFF_SECONDS = 2  # multiplied by the current failure streak

FULL_QUESTION_METHODS = ("direct_answer", "chain_of_thought", "self_ask")
METRICS = ("EM", "CEM")


def _update_summary(summary, full_question_results, subquestions_result):
    """Add one sample's outcomes to the running ``summary`` counters.

    ``full_question_results`` maps each name in FULL_QUESTION_METHODS to the
    dict returned by run_LM_full_question; ``subquestions_result`` is the dict
    returned by run_LM_subquestions.
    """
    for metric in METRICS:
        subquestions_ok = subquestions_result[f"Q1_and_Q2_right_{metric}"]
        if subquestions_ok:
            summary[f"both_subquestions_correct_{metric}"] += 1

        for method in FULL_QUESTION_METHODS:
            full_question_ok = full_question_results[method][f"is_right_{metric}"]
            if full_question_ok:
                summary[f"full_question_{method}_correct_{metric}"] += 1
                if not subquestions_ok:
                    summary[f"subquestions_wrong_when_full_question_{method}_correct_{metric}"] += 1
            elif subquestions_ok:
                summary[f"full_question_{method}_wrong_when_subquestions_correct_{metric}"] += 1


consecutive_failures = 0
try:
    # `i` and `results` are created in an earlier cell, so re-running this
    # cell continues with the first sample that has not been recorded yet.
    while i < len(data):
        dp = data[i]
        try:
            category = dp['category']

            # Direct answer
            direct_answer_prompt = direct_answer_prompt_dict[category]
            direct_answer_result = run_LM_full_question(direct_answer_prompt, dp, model)

            # Chain of thought
            chain_of_thought_prompt = chain_of_thought_prompt_dict[category]
            chain_of_thought_result = run_LM_full_question(chain_of_thought_prompt, dp, model)

            # Self-ask
            self_ask_prompt = self_ask_prompt_dict[category]
            self_ask_result = run_LM_full_question(self_ask_prompt, dp, model, start='Are follow up questions needed here: Yes.\n')

            # Subquestions
            subquestion_1_prompt = subquestion_1_prompt_dict[category]
            subquestion_2_prompt = subquestion_2_prompt_dict[category]
            subquestions_result = run_LM_subquestions(subquestion_1_prompt, subquestion_2_prompt, dp, model, extraction=extract_answer)
        except Exception as e:
            consecutive_failures += 1
            print("error: ", e)
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                # Give up loudly; progress so far is written by the `finally`
                # below and remains in `i` / `results`.
                raise
            time.sleep(RETRY_BACKOFF_SECONDS * consecutive_failures)
            continue  # retry the same sample

        consecutive_failures = 0

        # Counters are only touched once every request for the sample has
        # succeeded, so a retried sample can never be counted twice.
        _update_summary(
            results["summary"],
            {
                "direct_answer": direct_answer_result,
                "chain_of_thought": chain_of_thought_result,
                "self_ask": self_ask_result,
            },
            subquestions_result,
        )
        results["per_question_results"].append({
            "subquestions_result": subquestions_result,
            "direct_answer_result": direct_answer_result,
            "chain_of_thought_result": chain_of_thought_result,
            "self_ask_result": self_ask_result
        })
        print(i)
        i += 1
finally:
    # Written on normal completion and also when the run is interrupted or
    # gives up, so that completed samples are not lost.
    with open(f'results/CC_results_{model}_default_sysprompt.json', 'w') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)


0
1
2
3
4
5
6
7
8
9


In [176]:
print("Results Summary:\n")
# `results` keeps the counters under "summary", next to the bulky
# "per_question_results" list; iterating `results` itself would dump every
# per-question record. A dict without a "summary" key is treated as a flat
# mapping of counters.
summary_counts = results.get("summary", results)
for key, value in summary_counts.items():
    print(f"{key.replace('_', ' ').capitalize()} : {value}")

Results Summary:

Full question direct answer correct em : 368
Full question direct answer correct cem : 378
Full question chain of thought correct em : 631
Full question chain of thought correct cem : 650
Full question self ask correct em : 601
Full question self ask correct cem : 623
Both subquestions correct em : 626
Both subquestions correct cem : 638
Subquestions wrong when full question direct answer correct em : 36
Subquestions wrong when full question direct answer correct cem : 36
Subquestions wrong when full question chain of thought correct em : 67
Subquestions wrong when full question chain of thought correct cem : 69
Subquestions wrong when full question self ask correct em : 56
Subquestions wrong when full question self ask correct cem : 62
Full question direct answer wrong when subquestions correct em : 294
Full question direct answer wrong when subquestions correct cem : 296
Full question chain of thought wrong when subquestions correct em : 62
Full question chain of th

In [157]:
# Write the in-memory `results` to a JSON file named after the evaluated model.
results_path = f'results/CC_results_{model}.json'
with open(results_path, 'w') as outfile:
    json.dump(results, outfile, ensure_ascii=False, indent=4)