In [18]:
import openai
import urllib.request, json 
import string, re
import random
import time
import traceback


with open('API_key.txt', 'r') as file:
    openai.api_key = file.readline().strip()

In [121]:
data_path = "data/all_formats.json"
with open(data_path, 'r') as f:
    data = json.load(f)
len(data)

154

In [148]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(str(s)))))

def extract_answer(generated):
    generated = generated.lower()
    if 'final answer:' in generated:
        after_colon = generated.split('final answer:')[-1]
        if "\n" in after_colon:
            after_colon = after_colon.split("\n")[0]
    elif ":" in generated:
        after_colon = generated.split(':')[-1]
        if "\n" in after_colon:
            after_colon = after_colon.split("\n")[0]
    else:
        after_colon = generated
    return normalize_answer(after_colon)

In [149]:
system_prompt = f"You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\nKnowledge cutoff: 2021-09\nCurrent date: 2023-04-18"

In [150]:
def run_LM_on_expression(prompt, sample, current_model, extract_answer, max_tokens=300):
    question = sample['Question'] + " ="
    # cur_prompt = prompt_prefix + prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start
    cur_prompt = prompt + ' ' + question

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        ans = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            # stop='\n\n',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cur_prompt}
            ]
        )
        response_text = ans['choices'][0]["message"]["content"]
    else:
        ans = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            # stop='\n\n',
            prompt=cur_prompt,
            temperature=0
        )
        response_text = ans['choices'][0]['text']
    is_right = False
    try:
        cleaned_answer = extract_answer(response_text)
        is_right = int(cleaned_answer) == sample["Answer"]
    except Exception as e:
        print(cur_prompt)
        print(response_text)
        print(e)
    jsonres = {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right": is_right
    }
    return jsonres

In [152]:
def update_results_dict(results, prompt_results, prompt_types):

    for key in prompt_types:
        if prompt_results[key][f"is_right"]:
            results["summary"][f"{key}_correct"] += 1

    results["per_question_results"].append(prompt_results)


In [151]:
formats = [' + '.join(['#'] * (i+2)) for i in range(0, 10)]
# formats = ["# + # * # + #"]

prompts = {
    "reasoning_ok": "Answer the following math problem, formatting your final answer as \"final answer: <number>\". You may show your work. ", 
    "direct_answer": "Answer the following math problem, final answer (number) only, NO WORDS. "
}
results = {
    f"results_format_({format})": {
        "per_question_results": [],
        "summary": {
            f"{prompt_type}_correct": 0 for prompt_type in prompts.keys()
        }
    } for format in formats
}

print(results)


print(formats)

{
    "per_question_results": [],
    "summary": {
        "direct_answer_correct": 0,
        "unprompted_correct": 0
    }
}


{'results_format_(# + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + # + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + # + # + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + # + # + # + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + # + # + # + # + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + # + # + # + # + # + #)': {'per_question_results': [], 'summary': {'reasoning_ok_correct': 0, 'direct_answer_correct': 0}}, 'results_format_(# + # + # + # + # + # + # + # 

{'per_question_results': [],
 'summary': {'direct_answer_correct': 0, 'unprompted_correct': 0}}

In [132]:
model = "gpt-3.5-turbo"

In [155]:
for format in formats:
    i = 0
    while i < len(data):
        dp = data[format][i]
        try:
            prompt_results = {}
            for key in prompts.keys():
                prompt = prompts[key]
                prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
            # Update results
            update_results_dict(results[f"results_format_({format})"], prompt_results, prompt_types)

            # only run once
            print(i)
            i += 1
        except Exception as e:
            # print("error: ", e)
            traceback.print_exc()
save()


0
1


Traceback (most recent call last):
  File "/tmp/ipykernel_18609/837454402.py", line 9, in <module>
    prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
  File "/tmp/ipykernel_18609/1145792516.py", line 7, in run_LM_on_expression
    ans = openai.ChatCompletion.create(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 226, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/loganmilandin/.local/lib/python3.9/site-packages/openai/api_requestor.py", line 620, in _interpret_response
    self._interpret_response_line(
  Fil

2
Answer the following math problem, formatting your final answer as "final answer: <number>". You may show your work.  13 + 93 =
Sure, here's how to solve the problem:

13 + 93 = (10 + 3) + (90 + 3)   // break apart the numbers by place value
         = 10 + 90 + 3 + 3       // regroup and simplify
         = 100 + 6               // regroup and simplify again
         = 106                   // final answer

Therefore, the final answer is 106.
invalid literal for int() with base 10: '13 93 10 3 90 3 break apart numbers by place value 10 90 3 3 regroup and simplify 100 6 regroup and simplify again 106 final answer therefore final answer is 106'
3
4
5
6
7
8
9
10
11
12
13
14
Answer the following math problem, formatting your final answer as "final answer: <number>". You may show your work.  0 + 43 =
Final answer: 43

Explanation:
There's no need to show any work for this particular math problem, since it's a very simple addition operation between 0 and 43. The answer is simply 43.
inval

In [74]:
results_file = f'results/arithmetic/addition_only_results_{model}.json'
results_file

'results/arithmetic/addition_only_results_gpt-3.5-turbo.json'

In [154]:
def save():
    with open(results_file, 'w') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)
save()