In [2]:
import openai
import urllib.request, json 
import string, re
import random
import time
import traceback


with open('API_key.txt', 'r') as file:
    openai.api_key = file.readline().strip()

In [20]:
data_path = "data/addition_multiplication_mixed_small_numbers.json"
with open(data_path, 'r') as f:
    data = json.load(f)
len(data)
data.keys()

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])

In [13]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(str(s)))))

def extract_answer(generated):
    generated = generated.lower()
    if 'final answer:' in generated:
        after_colon = generated.split('final answer:')[-1]
        if "\n" in after_colon:
            after_colon = after_colon.split("\n")[0]
    elif ":" in generated:
        after_colon = generated.split(':')[-1]
        if "\n" in after_colon:
            after_colon = after_colon.split("\n")[0]
    else:
        after_colon = generated
    return normalize_answer(after_colon)

In [5]:
system_prompt = f"You are ChatGPT, a large language model trained by OpenAI, based on the GPT-3.5 architecture.\nKnowledge cutoff: 2021-09\nCurrent date: 2023-05-17"

In [6]:
def run_LM_on_expression(prompt, sample, current_model, extract_answer, max_tokens=600, start=" ="):
    question = sample['Question'] + start
    # cur_prompt = prompt_prefix + prompt + '\n' + '\n' + 'Question: ' + question + '\n' + start
    cur_prompt = prompt + ' ' + question

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        ans = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            # stop='\n\n',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cur_prompt}
            ]
        )
        response_text = ans['choices'][0]["message"]["content"]
    else:
        ans = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            # stop='\n\n',
            prompt=cur_prompt,
            temperature=0
        )
        response_text = ans['choices'][0]['text']
    is_right = False
    try:
        if prompt == "":
            is_right = str(sample["Answer"]) in normalize_answer(response_text)
        else:
            cleaned_answer = extract_answer(response_text)
            is_right = int(cleaned_answer) == sample["Answer"]

    except Exception as e:
        print(cur_prompt)
        print(response_text)
        print(e)
    jsonres = {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right": is_right,
        "is_right_CEM": str(sample["Answer"]) in normalize_answer(response_text)
    }
    return jsonres

In [7]:
def update_results_dict(results, prompt_results, prompt_types):

    for key in prompt_types:
        if prompt_results[key][f"is_right"]:
            results["summary"][f"{key}_correct"] += 1
        if prompt_results[key][f"is_right_CEM"]:
            results["summary"][f"{key}_CEM_correct"] += 1

    results["per_question_results"].append(prompt_results)


In [28]:
num_operators = [str(i) for i in range (1, 11)]
# formats = ["# + # * # + #"]

prompts = {
    "unprompted": "", 
    "chain_of_thought": "Answer the following math problem, showing each step and stating your final answer in the form \"final answer: <number>\" on a new line.", 
    "direct_answer": "Answer the following math problem, final answer (number) only, NO WORDS. "
}
results = {
    f"results_{num}_operators": {
        "per_question_results": [],
        "summary": {
            **{f"{prompt_type}_correct": 0 for prompt_type in prompts.keys()}, **{f"{prompt_type}_CEM_correct": 0 for prompt_type in prompts.keys()}
        }
    } for num in num_operators
}

print(num_operators)

{
    "per_question_results": [],
    "summary": {
        "direct_answer_correct": 0,
        "unprompted_correct": 0
    }
}


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']


{'per_question_results': [],
 'summary': {'direct_answer_correct': 0, 'unprompted_correct': 0}}

In [16]:
model = "gpt-3.5-turbo"

In [17]:
j

0

In [29]:
j = 0
while j < len(num_operators):
    num = num_operators[j]
    
    i = 0
    while i < len(data[num]):
        dp = data[num][i]
        try:
            prompt_results = {}
            for key in prompts.keys():
                prompt = prompts[key]
                prompt_results[key] = run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
            # Update results
            update_results_dict(results[f"results_{num}_operators"], prompt_results, prompts.keys())

            # only run once
            print(i)
            i += 1
        except Exception as e:
            # print("error: ", e)
            traceback.print_exc()
    save()
    j += 1

    


0
1
2
3
4
5
6
Answer the following math problem, showing each step and stating your final answer in the form "final answer: <number>" on a new line. 18 * 18 =
Sure, I can help you with that math problem!

To solve 18 * 18, we can use the multiplication operation, which means we need to multiply the first 18 by the second 18. To do this, we can use the following steps:

1. Multiply 8 (the ones digit of the first 18) by 8 (the ones digit of the second 18):
   - 8 * 8 = 64, so we write down the 4 and carry the 6.

2. Multiply 1 (the tens digit of the first 18) by 8:
   - 1 * 8 = 8, and then we add the carried 6 from the previous step:
   - 8 + 6 = 14, so we write down the 4 and carry the 1.

3. Multiply 8 (the ones digit of the first 18) by 1 (the tens digit of the second 18):
   - 8 * 1 = 8, and then we add the carried 1 from the previous step:
   - 8 + 1 = 9, so we write down the 9.

4. Multiply 1 (the tens digit of the first 18) by 1 (the tens digit of the second 18):
   - 1 * 1 = 1, s

KeyboardInterrupt: 

In [25]:
results_file = f'results/arithmetic/addition_multiplication_mixed_small_numbers_results_{model}.json'
results_file

'results/arithmetic/addition_multiplication_mixed_small_numbers_results_gpt-3.5-turbo.json'

In [26]:
def save():
    with open(results_file, 'w') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)
save()