In [2]:
import openai
import urllib.request, json 
import string, re
import random
import time
import traceback


# Configure the OpenAI client: the key is taken from the first line of a local
# text file (surrounding whitespace removed) so it never appears in the notebook.
with open('API_key.txt', mode='r') as key_file:
    first_line = key_file.readline()
openai.api_key = first_line.strip()

In [3]:
# Load the arithmetic dataset. The evaluation loop indexes it by format string
# and reads the 'Question' / 'Answer' fields of each entry.
data_path = "data/all_formats.json"
with open(data_path, 'r') as dataset_file:
    data = json.loads(dataset_file.read())
# Echo the number of top-level entries as the cell output.
len(data)

154

In [4]:
def normalize_answer(s):
    """Return a canonical form of ``s`` for lenient answer comparison.

    The input is converted with ``str()`` and then, in this order:
    lower-cased, stripped of every ``string.punctuation`` character, stripped
    of the standalone words "a", "an" and "the" (each replaced by a space),
    and finally whitespace-collapsed so words are separated by single spaces
    with no leading or trailing whitespace.
    """
    text = str(s).lower()
    # Drop punctuation in one pass via a deletion table.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Articles are blanked out rather than deleted so neighbouring words
    # stay separated; the extra spaces are collapsed on the next line.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def extract_answer(generated):
    """Pull the answer portion out of a model response and normalise it.

    The response is lower-cased, then the first matching rule applies:

    1. If it contains ``'final answer:'``, keep the text after the last
       occurrence, up to the first newline.
    2. Otherwise, if it contains ``':'``, keep the text after the last colon,
       up to the first newline.
    3. Otherwise keep the whole response (newlines included).

    The kept text is returned after passing through ``normalize_answer``.
    """
    lowered = generated.lower()
    # Markers are tried from most to least specific; the first hit wins.
    for marker in ('final answer:', ':'):
        if marker in lowered:
            tail = lowered.split(marker)[-1]
            # Only the first line after the marker counts as the answer.
            candidate = tail.split("\n")[0]
            break
    else:
        # No marker at all: treat the entire response as the answer.
        candidate = lowered
    return normalize_answer(candidate)

In [5]:
# System message sent with every chat-endpoint request made by
# run_LM_on_expression. The knowledge cutoff and current date are fixed text.
system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n"
    "Knowledge cutoff: 2021-09\n"
    "Current date: 2023-04-18"
)

In [6]:
def run_LM_on_expression(prompt, sample, current_model, extract_answer, max_tokens=300, start=" ="):
    """Query an OpenAI model with one sample and score its response.

    Args:
        prompt: Instruction text placed before the question, separated from it
            by a single space. An empty string switches scoring to a substring
            check on the normalised response instead of integer parsing.
        sample: Mapping that provides 'Question' (text) and 'Answer'.
        current_model: Model name. "gpt-3.5-turbo" and "gpt-4" are sent to
            ``openai.ChatCompletion`` together with the global ``system_prompt``;
            any other name is sent to ``openai.Completion`` with temperature 0.
        extract_answer: Callable applied to the response text when ``prompt`` is
            non-empty; its result is converted with ``int()`` and compared to
            ``sample["Answer"]``.
        max_tokens: Token limit forwarded to the API call.
        start: Suffix appended to the question before it is sent.

    Returns:
        dict with keys "question", "prompt", "answer", "returned", "is_right"
        and "is_right_CEM" (whether ``str(sample["Answer"])`` occurs in the
        normalised response).

    Any exception raised while scoring is printed together with the prompt and
    the response, and leaves "is_right" as False.
    """
    cur_prompt = prompt + ' ' + (sample['Question'] + start)

    uses_chat_endpoint = current_model in ("gpt-3.5-turbo", "gpt-4")
    if uses_chat_endpoint:
        # NOTE(review): no temperature is passed here, so the API default
        # applies, whereas the completion branch below pins temperature=0.
        # Confirm this difference is intended.
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": cur_prompt},
        ]
        completion = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            messages=chat_messages,
        )
        response_text = completion['choices'][0]["message"]["content"]
    else:
        completion = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            prompt=cur_prompt,
            temperature=0,
        )
        response_text = completion['choices'][0]['text']

    is_right = False
    try:
        if prompt == "":
            is_right = str(sample["Answer"]) in normalize_answer(response_text)
        else:
            is_right = int(extract_answer(response_text)) == sample["Answer"]
    except Exception as err:
        # Typically a response that does not parse as an integer; dump the
        # context so the miss can be inspected by hand.
        for item in (cur_prompt, response_text, err):
            print(item)

    return {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right": is_right,
        "is_right_CEM": str(sample["Answer"]) in normalize_answer(response_text),
    }

In [7]:
def update_results_dict(results, prompt_results, prompt_types):
    """Fold one question's outcomes into a per-format results bucket.

    For every prompt type in ``prompt_types``, increments
    ``results["summary"]["<type>_correct"]`` when that prompt's "is_right" flag
    is truthy and ``results["summary"]["<type>_CEM_correct"]`` when its
    "is_right_CEM" flag is truthy. ``prompt_results`` is then appended to
    ``results["per_question_results"]``. Mutates ``results`` in place and
    returns None.
    """
    for key in prompt_types:
        outcome = prompt_results[key]
        flag_to_counter = (
            ("is_right", f"{key}_correct"),
            ("is_right_CEM", f"{key}_CEM_correct"),
        )
        for flag, counter in flag_to_counter:
            # The summary is only touched for truthy flags, so a counter that
            # is never hit does not need to exist.
            if outcome[flag]:
                results["summary"][counter] += 1

    results["per_question_results"].append(prompt_results)


In [8]:
# Expression formats under evaluation; each string is also used as a key into
# the loaded dataset and the word-problem templates.
# Alternative kept for reference (multiplication chains of 2 to 11 operands):
# formats = [' * '.join(['#'] * (i+2)) for i in range(0, 10)]
formats = ["# + # + #", "# * # * #"]

# Instruction prefixes keyed by prompt type. Only "direct_answer" is active;
# the other two variants are kept commented out.
prompts = {
    # "unprompted": "",
    # "reasoning_ok": "Answer the following math problem, formatting your final answer as \"final answer: <number>\". You may show your work. ",
    "direct_answer": "Answer the following math problem, final answer (number) only, NO WORDS. "
}

# One bucket per format: the raw per-question records plus zeroed counters.
# Counter order is every "<type>_correct" first, then every "<type>_CEM_correct".
results = {
    f"results_format_({format})": {
        "per_question_results": [],
        "summary": dict.fromkeys(
            [f"{prompt_type}_correct" for prompt_type in prompts]
            + [f"{prompt_type}_CEM_correct" for prompt_type in prompts],
            0,
        ),
    }
    for format in formats
}

print(formats)

# Bare literal shown as the cell's output. It illustrates the shape of a single
# bucket but omits the CEM counter that `results` above actually contains.
{
    "per_question_results": [],
    "summary": {
        "direct_answer_correct": 0,
        # "unprompted_correct": 0
    }
}


['# + # + #', '# * # * #']


{'per_question_results': [], 'summary': {'direct_answer_correct': 0}}

In [9]:
# Model under evaluation. run_LM_on_expression sends "gpt-3.5-turbo" and "gpt-4"
# to the chat endpoint and any other name to the plain completion endpoint.
# The name is also embedded in the results file path.
model = "gpt-3.5-turbo"

In [10]:
# Load the word-problem templates. The evaluation loop indexes them by format
# string and then by position, and fills their '#', '##' and '###' placeholders
# with the operands of each expression.
wp_path = "data/add_mult_only_3.json"
with open(wp_path, 'r') as template_file:
    wps = json.loads(template_file.read())
# Echo the number of top-level entries as the cell output.
len(wps)

2

In [11]:
# Evaluate every active prompt on every sample of every format.
#
# Each arithmetic expression is turned into a word problem by filling a template,
# sent to the model, and the outcome is folded into `results`.
#
# Changes from the earlier version of this cell:
#   * The dataset entry is no longer overwritten. A retry or a re-run of the cell
#     used to split the already-substituted word problem and fail with IndexError.
#   * Operands are split on the separator used by the format itself, rather than
#     always on ' + '.
#   * API failures are retried a bounded number of times with back-off instead of
#     forever; any other exception propagates immediately.
#   * The loop variable no longer shadows the builtin `format`.
MAX_ATTEMPTS = 5           # API tries per sample before the error is re-raised
MAX_BACKOFF_SECONDS = 30   # upper bound on the wait between two tries
TEMPLATES_PER_FORMAT = 5   # templates are cycled modulo this; assumes at least
                           # this many per format -- TODO confirm against the data

for fmt in formats:
    # "# + # + #" -> " + ", "# * # * #" -> " * ".
    # Assumes each stored expression uses the same separator as its format key.
    operand_separator = fmt.split('#')[1]
    templates = wps[fmt]
    format_results = results[f"results_format_({fmt})"]

    for i, dp in enumerate(data[fmt]):
        operands = dp['Question'].split(operand_separator)
        if len(operands) < 3:
            raise ValueError(
                f"expected 3 operands separated by {operand_separator!r} "
                f"in {dp['Question']!r} (format {fmt!r}, index {i})"
            )

        # Longest placeholder first, so '###' is not consumed by the '#' rule.
        word_problem = (
            templates[i % TEMPLATES_PER_FORMAT]
            .replace('###', operands[2])
            .replace('##', operands[1])
            .replace('#', operands[0])
        )
        # Work on a copy so `data` keeps the original expression and this cell
        # stays safe to re-run.
        sample = {**dp, 'Question': word_problem}

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                prompt_results = {
                    key: run_LM_on_expression(prompt, sample, model, extract_answer=extract_answer, start="")
                    for key, prompt in prompts.items()
                }
                break
            except openai.error.OpenAIError:
                # Rate limits, timeouts, connection problems and similar.
                traceback.print_exc()
                if attempt == MAX_ATTEMPTS:
                    raise
                time.sleep(min(2 ** attempt, MAX_BACKOFF_SECONDS))

        # Kept outside the retry loop: a failure here is a programming error
        # and must not trigger another (billed) API call.
        update_results_dict(format_results, prompt_results, prompts.keys())
        print(i)
    # Results are written to disk by save(), which is defined in a later cell.
    


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


NameError: name 'save' is not defined

In [13]:
# Destination of the JSON results; the model name is embedded in the file name
# so runs with different models do not overwrite each other.
results_file = f'results/arithmetic/word_problem_only_results_{model}.json'
# Echo the resolved path as the cell output.
results_file

'results/arithmetic/word_problem_only_results_gpt-3.5-turbo.json'

In [15]:
def save():
    """Write the global ``results`` dict to ``results_file`` as indented JSON.

    The parent directory of ``results_file`` is created if it is missing, so
    the write does not fail on a fresh checkout. The file is opened as UTF-8
    because ``ensure_ascii=False`` emits non-ASCII characters from model
    responses verbatim, and the platform default encoding may be unable to
    represent them.
    """
    import os  # local import: the notebook's import cell does not bring in os

    parent_dir = os.path.dirname(results_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(results_file, 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)
save()