In [18]:
import openai
import urllib.request, json 
import string, re
import random
import time
import traceback


with open('API_key.txt', 'r') as file:
    openai.api_key = file.readline().strip()

In [121]:
data_path = "data/all_formats.json"
with open(data_path, 'r') as f:
    data = json.load(f)
len(data)

154

In [199]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(str(s)))))

def extract_answer(generated):
    generated = generated.lower()
    if 'final answer:' in generated:
        after_colon = generated.split('final answer:')[-1]
        if "\n" in after_colon:
            after_colon = after_colon.split("\n")[0]
    elif ":" in generated:
        after_colon = generated.split(':')[-1]
        if "\n" in after_colon:
            after_colon = after_colon.split("\n")[0]
    else:
        after_colon = generated
    return normalize_answer(after_colon)

In [198]:
system_prompt = f"You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\nKnowledge cutoff: 2021-09\nCurrent date: 2023-04-18"

In [197]:
def run_LM_on_expression(prompt, sample, current_model, extract_answer, max_tokens=300, start=" ="):
    question = sample['Question'] + start
    # cur_prompt = prompt_prefix + prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start
    cur_prompt = prompt + ' ' + question

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        ans = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            # stop='\n\n',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cur_prompt}
            ]
        )
        response_text = ans['choices'][0]["message"]["content"]
    else:
        ans = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            # stop='\n\n',
            prompt=cur_prompt,
            temperature=0
        )
        response_text = ans['choices'][0]['text']
    is_right = False
    try:
        if prompt == "":
            is_right = str(sample["Answer"]) in normalize_answer(response_text)
        else:
            cleaned_answer = extract_answer(response_text)
            is_right = int(cleaned_answer) == sample["Answer"]

    except Exception as e:
        print(cur_prompt)
        print(response_text)
        print(e)
    jsonres = {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right": is_right,
        "is_right_CEM": str(sample["Answer"]) in normalize_answer(response_text)
    }
    return jsonres

In [196]:
def update_results_dict(results, prompt_results, prompt_types):

    for key in prompt_types:
        if prompt_results[key][f"is_right"]:
            results["summary"][f"{key}_correct"] += 1
        if prompt_results[key][f"is_right_CEM"]:
            results["summary"][f"{key}_CEM_correct"] += 1

    results["per_question_results"].append(prompt_results)


In [222]:
formats = [' * '.join(['#'] * (i+2)) for i in range(0, 10)]
# formats = ["# + # * # + #"]

prompts = {
    "unprompted": "", 
    # "reasoning_ok": "Answer the following math problem, formatting your final answer as \"final answer: <number>\". You may show your work. ", 
    "direct_answer": "Answer the following math problem, final answer (number) only, NO WORDS. "
}
results = {
    f"results_format_({format})": {
        "per_question_results": [],
        "summary": {
            **{f"{prompt_type}_correct": 0 for prompt_type in prompts.keys()}, **{f"{prompt_type}_CEM_correct": 0 for prompt_type in prompts.keys()}
        }
    } for format in formats
}

print(formats)

{
    "per_question_results": [],
    "summary": {
        "direct_answer_correct": 0,
        "unprompted_correct": 0
    }
}


['# * #', '# * # * #', '# * # * # * #', '# * # * # * # * #', '# * # * # * # * # * #', '# * # * # * # * # * # * #', '# * # * # * # * # * # * # * #', '# * # * # * # * # * # * # * # * #', '# * # * # * # * # * # * # * # * # * #', '# * # * # * # * # * # * # * # * # * # * #']


{'per_question_results': [],
 'summary': {'direct_answer_correct': 0, 'unprompted_correct': 0}}

In [223]:
model = "gpt-3.5-turbo"

In [224]:
for format in formats:
    i = 0

    while i < len(data[format]):
        dp = data[format][i]
        try:
            prompt_results = {}
            for key in prompts.keys():
                prompt = prompts[key]
                prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
            # Update results
            update_results_dict(results[f"results_format_({format})"], prompt_results, prompts.keys())

            # only run once
            print(i)
            i += 1
        except Exception as e:
            # print("error: ", e)
            traceback.print_exc()
    save()
    


Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
    httplib_response = self._make_request(
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/usr/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib/python3.9/ssl.py", line 1242, in recv_into
    return self.read(nbytes, buffer)
  File 

0
1
2


Traceback (most recent call last):
  File "/tmp/ipykernel_18609/1560108208.py", line 10, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/3439787225.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  F

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


Traceback (most recent call last):
  File "/tmp/ipykernel_18609/1560108208.py", line 10, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/3439787225.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  F

21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81


Traceback (most recent call last):
  File "/tmp/ipykernel_18609/1560108208.py", line 10, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/3439787225.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  F

82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
Answer the following math problem, final answer (number) only, NO WORDS.  86 * 58 * 61 =
298, 748
invalid literal for int() with base 10: '298 748'
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
Answer the following math problem, final answer (number) only, NO WORDS.  38 * 91 * 27 =
This mathematical operation yields 91,158.
invalid literal for int() with base 10: 'this mathematical operation yields 91158'
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11


Traceback (most recent call last):
  File "/tmp/ipykernel_18609/1560108208.py", line 10, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/3439787225.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  F

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
Answer the following math problem, final answer (number) only, NO WORDS.  67 * 50 * 34 * 20 =
Please find the solution below:

23,044,000
invalid literal for int() with base 10: ''
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
Answer the following math problem, final answer (number) only, NO WO

Traceback (most recent call last):
  File "/tmp/ipykernel_18609/1560108208.py", line 10, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/3439787225.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  F

70
71
72
73
74
75
76
77
78
79
80


Traceback (most recent call last):
  File "/tmp/ipykernel_18609/1560108208.py", line 10, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/3439787225.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  F

81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


In [220]:
results_file = f'results/arithmetic/multiplication_only_results_{model}.json'
results_file

'results/arithmetic/multiplication_only_results_gpt-3.5-turbo.json'

In [221]:
def save():
    with open(results_file, 'w') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)
save()