In [229]:
import openai
import urllib.request, json 
import string, re
import random
import time
import traceback


# The OpenAI API key is kept out of the notebook: it is read from the first
# line of a local text file and surrounding whitespace is stripped.
with open('API_key.txt') as key_file:
    openai.api_key = key_file.readline().strip()

In [230]:
# Load the benchmark from JSON. Later cells index `data` by format string and
# read 'Question' / 'Answer' from each sample.
data_path = "data/all_formats_small_numbers.json"
with open(data_path) as data_file:
    data = json.load(data_file)
# Displayed as the cell result: number of top-level entries in the file.
len(data)

144

In [231]:
def normalize_answer(s):
    """Canonicalise an answer for lenient string comparison.

    The input is converted with ``str()``, lower-cased, stripped of every
    character in ``string.punctuation``, has the standalone words "a", "an"
    and "the" replaced by a space, and finally has all whitespace runs
    collapsed to single spaces (leading/trailing whitespace removed).
    """
    text = str(s).lower()

    # Punctuation is removed before articles so that e.g. "the," still
    # matches the article pattern on a word boundary.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)

    text = re.sub(r'\b(a|an|the)\b', ' ', text)

    return ' '.join(text.split())

def extract_answer(generated):
    """Pull the final answer out of a model reply and normalise it.

    The reply is lower-cased, then the text after the last occurrence of
    ``'final answer:'`` is used; if that marker is absent, the text after the
    last ``':'`` is used; if there is no colon at all, the whole reply is used.
    When a marker was found, only the first non-blank line after it is kept.

    Args:
        generated: Raw text returned by the model.

    Returns:
        The selected text passed through ``normalize_answer``.
    """
    generated = generated.lower()

    if 'final answer:' in generated:
        marker = 'final answer:'
    elif ':' in generated:
        marker = ':'
    else:
        return normalize_answer(generated)

    after_marker = generated.split(marker)[-1]
    # Skip whitespace/newlines directly after the marker: a reply such as
    # "final answer:\n42" previously yielded an empty string because the first
    # (empty) line was taken and the answer on the next line was dropped.
    first_line = after_marker.lstrip().split("\n")[0]
    return normalize_answer(first_line)

In [238]:
# System message sent with every chat-endpoint request in run_LM_on_expression.
system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-3.5 architecture.\n"
    "Knowledge cutoff: 2021-09\n"
    "Current date: 2023-05-17"
)

In [239]:
def run_LM_on_expression(prompt, sample, current_model, extract_answer, max_tokens=300, start=" =", temperature=None):
    """Send one sample to an OpenAI model and score the reply.

    Args:
        prompt: Instruction text placed before the question. An empty string
            switches scoring to the "contains the answer" rule (see Returns).
        sample: Mapping providing 'Question' (a string) and 'Answer'.
        current_model: Model name. "gpt-3.5-turbo" and "gpt-4" are sent to the
            chat endpoint together with ``system_prompt``; any other name is
            sent to the completion endpoint.
        extract_answer: Callable that turns the raw reply into a string which
            is then parsed with ``int()``.
        max_tokens: Upper bound on generated tokens for the request.
        start: Suffix appended to the question before it is sent.
        temperature: Sampling temperature for the request. ``None`` (default)
            keeps the historic behaviour: the chat request does not set a
            temperature at all, while the completion request uses 0. Pass an
            explicit value (e.g. 0) to apply the same setting to both
            endpoints.

    Returns:
        dict with keys "question", "prompt", "answer", "returned", "is_right"
        and "is_right_CEM". "is_right_CEM" is True when ``str(answer)`` occurs
        as a substring of the normalised reply. "is_right" equals that value
        for an empty ``prompt``; otherwise it is True when the extracted
        answer parses as an int equal to ``sample["Answer"]``.
    """
    question = sample['Question'] + start
    cur_prompt = prompt + ' ' + question

    if current_model in ["gpt-3.5-turbo", "gpt-4"]:
        # Only forward a temperature when the caller asked for one, so the
        # default call is unchanged from earlier runs.
        chat_kwargs = {} if temperature is None else {"temperature": temperature}
        ans = openai.ChatCompletion.create(
            model=current_model,
            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cur_prompt}
            ],
            **chat_kwargs
        )
        response_text = ans['choices'][0]["message"]["content"]
    else:
        ans = openai.Completion.create(
            model=current_model,
            max_tokens=max_tokens,
            prompt=cur_prompt,
            temperature=0 if temperature is None else temperature
        )
        response_text = ans['choices'][0]['text']

    # Computed once and reused for both the unprompted score and the CEM flag.
    contains_answer = str(sample["Answer"]) in normalize_answer(response_text)

    is_right = False
    try:
        if prompt == "":
            is_right = contains_answer
        else:
            cleaned_answer = extract_answer(response_text)
            # NOTE(review): assumes sample["Answer"] is stored as an int;
            # a string answer would never compare equal here - confirm.
            is_right = int(cleaned_answer) == sample["Answer"]
    except Exception as e:
        # Best effort: an unparseable reply counts as wrong, and the exchange
        # is printed so it can be inspected.
        print(cur_prompt)
        print(response_text)
        print(e)

    return {
        "question": sample['Question'],
        "prompt": cur_prompt,
        "answer": sample['Answer'],
        "returned": response_text,
        "is_right": is_right,
        "is_right_CEM": contains_answer
    }

In [240]:
def update_results_dict(results, prompt_results, prompt_types):
    """Fold one question's per-prompt outcomes into a results record.

    For every prompt type, the matching ``"<type>_correct"`` and
    ``"<type>_CEM_correct"`` counters in ``results["summary"]`` are
    incremented when the corresponding "is_right" / "is_right_CEM" flag is
    truthy. ``prompt_results`` itself is then appended to
    ``results["per_question_results"]``. ``results`` is modified in place and
    nothing is returned.
    """
    flag_to_suffix = (("is_right", "correct"), ("is_right_CEM", "CEM_correct"))

    for prompt_type in prompt_types:
        for flag, suffix in flag_to_suffix:
            if prompt_results[prompt_type][flag]:
                results["summary"][f"{prompt_type}_{suffix}"] += 1

    results["per_question_results"].append(prompt_results)


In [252]:
# Expression templates: '#' placeholders joined by ' + ', from 2 up to 11 operands.
formats = [' + '.join('#' for _ in range(operand_count)) for operand_count in range(2, 12)]
# formats = ["# + # * # + #"]

# Instruction text per prompting strategy; the empty string means the bare
# question is sent.
prompts = {
    "unprompted": "",
    "chain_of_thought": 'Answer the following math problem, showing each step and stating your final answer in the form "final answer: <number>" on a new line.',
    "direct_answer": "Answer the following math problem, final answer (number) only, NO WORDS. ",
}

# One record per format: the raw per-question outcomes plus zero-initialised
# "<prompt>_correct" counters followed by "<prompt>_CEM_correct" counters.
results = {
    f"results_format_({fmt})": {
        "per_question_results": [],
        "summary": {
            f"{prompt_type}{suffix}": 0
            for suffix in ("_correct", "_CEM_correct")
            for prompt_type in prompts
        },
    }
    for fmt in formats
}

print(formats)

# Standalone literal that is only displayed as the cell's value; it is not
# assigned to anything.
{
    "per_question_results": [],
    "summary": {
        "direct_answer_correct": 0,
        "unprompted_correct": 0
    }
}


['# + #', '# + # + #', '# + # + # + #', '# + # + # + # + #', '# + # + # + # + # + #', '# + # + # + # + # + # + #', '# + # + # + # + # + # + # + #', '# + # + # + # + # + # + # + # + #', '# + # + # + # + # + # + # + # + # + #', '# + # + # + # + # + # + # + # + # + # + #']


{'per_question_results': [],
 'summary': {'direct_answer_correct': 0, 'unprompted_correct': 0}}

In [253]:
# Model under evaluation. It is passed to run_LM_on_expression as
# `current_model` ("gpt-3.5-turbo" and "gpt-4" are routed to the chat endpoint
# there) and is embedded in the results file name.
model = "gpt-3.5-turbo"

In [254]:
# Peek at the format index used by the evaluation loop. `j` is only assigned in
# the loop cell further down, so on a fresh kernel this cell raises NameError.
j

0

In [251]:
# Evaluate every sample of every format with every prompt type, folding the
# outcomes into `results` and saving after each format.
#
# Changes from the previous version of this cell:
#   * the current format is taken from `formats` (the old code indexed
#     `format`, i.e. the builtin / the previous string) and is no longer bound
#     to the name `format`, so the builtin is not shadowed;
#   * a failing sample is retried a bounded number of times with a growing
#     pause instead of being retried forever without delay;
#   * results are only updated after all prompts for a sample succeeded, so a
#     retry can never be triggered by (or repeat) the bookkeeping step.
max_attempts = 5         # tries per sample before giving up
retry_wait_seconds = 5   # base pause; multiplied by the attempt number

for j, fmt in enumerate(formats):
    format_results = results[f"results_format_({fmt})"]

    for i, dp in enumerate(data[fmt]):
        for attempt in range(1, max_attempts + 1):
            try:
                prompt_results = {
                    key: run_LM_on_expression(prompt, dp, model, extract_answer=extract_answer)
                    for key, prompt in prompts.items()
                }
                break
            except Exception:
                traceback.print_exc()
                if attempt == max_attempts:
                    # Keep what has been collected so far, then surface the
                    # error instead of looping on it indefinitely.
                    save()
                    raise
                time.sleep(retry_wait_seconds * attempt)

        update_results_dict(format_results, prompt_results, prompts.keys())
        print(i)

    save()
    


NameError: name 'num' is not defined

In [255]:
# Destination JSON file for the accumulated results; save() opens this path for
# writing. The name embeds `model`, so re-run this cell after changing the model.
# NOTE(review): the results/arithmetic/ directory presumably has to exist
# already - confirm before a long run.
results_file = f'results/arithmetic/addition_only_small_numbers_results_{model}.json'
results_file

'results/arithmetic/multiplication_only_small_numbers_results_gpt-3.5-turbo.json'

In [245]:
def save():
    """Write the global `results` dict to `results_file` as indented JSON.

    The file is opened explicitly as UTF-8. Because ``ensure_ascii=False``
    writes non-ASCII characters unescaped, relying on the platform's default
    encoding could raise ``UnicodeEncodeError`` (or produce a file that is not
    valid UTF-8 JSON) when a model reply contains such characters.
    """
    with open(results_file, 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)
# Write the current state of `results` immediately when this cell is run.
save()