In [None]:
# TASK SETUP
# --- Filesystem layout -------------------------------------------------------
LOG_PATH = 'logs/'    # prefix for per-category sample logs and result files
DATA_PATH = 'data/'   # handed to DataParser when loading the problems

# --- Sampling ----------------------------------------------------------------
RUNS_PER_QUERY = 20   # sent as `n` in each chat completion request
TEMPERATURE = 0.7     # sent as `temperature`
MAX_TOKENS = None     # sent as `max_tokens`
MODEL = 'gpt-3.5-turbo'

# --- Credentials -------------------------------------------------------------
API_KEY = ''  # ADD API KEY

# --- Problem categories (the cells below select one by list index) -----------
CATEGORIES = [
    'algebra',
    'counting_and_probability',
    'geometry',
    'number_theory',
    'precalculus',
    'prealgebra',
    'intermediate_algebra',
]


# Few-shot chain-of-thought prompt template.
# It is rendered with COT_PROMPT.format(question=...), so literal braces in the
# LaTeX are doubled ({{ }}) and {question} is the only placeholder.
# Every example closes with a bare </Solution> tag: the raw, un-stepped
# solution text that used to trail the tag in examples 2 and 3 was removed
# because it contradicted the step-by-step format the prompt asks for.
COT_PROMPT = """Solve the following math problem. Perform the solution by going step by step, and write down each step.

<Example>
Question: Let \\[f(x) = \\left\\{{\\n\\begin{{array}}{{cl}} ax+3, &\\text{{ if }}x>2, \\\\\\nx-5 &\\text{{ if }} -2 \\le x \\le 2, \\\\\\n2x-b &\\text{{ if }} x <-2.\\n\\end{{array}}\\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper). Let's think step by step.
<Solution>
Step 1: For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$.
Step 2: This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \\Rightarrow a=-3$.
Step 3: Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$.
Step 4: Substituting, we get $-2-5=2(-2)-b$, which implies $b=3$. So $a+b=-3+3.
Final answer: \\boxed{{0}}$.
</Solution> 
</Example>

<Example>
Question: Sixteen is 64$\\%$ of what number? Let's think step by step.
<Solution>
Step 1: If the number is $x$, we can set up the equation $\\frac{{16}}{{x}}=\\frac{{64}}{{100}}$.
Step 2: We divide both sides by $4$ to get $\\frac{{1}}{{x}}=\\frac{{4}}{{100}}=\\frac{{1}}{{25}}$, so $x=\\boxed{{25}}$.
Final answer: \\boxed{{25}}$
</Solution>
</Example>

<Example>
Question: There are 3 complex numbers $a+bi$, $c+di$, and $e+fi$. If $b=1$, $e=-a-c$, and the sum of the numbers is $-i$, find $d+f$. Let's think step by step.
<Solution>
Step 1: We know that $a+bi+c+di+e+fi=-i$. Thus, the real parts add up to 0 and the imaginary parts add up to -1.
Step 2: We then have  \\begin{{align}}\\na+c+e&=0\\\\\\nb+d+f&=-1\\\\\\n\\end{{align}}
Step 3: We know that $b=1$, therefore $d+f=\\boxed{{-2}}$
Final answer: \\boxed{{-2}}$
</Solution>
</Example>

<Task>
Question: {question} Let's think step by step. Explain the reasoning for the final answer like the examples. Do not skip step numbers. Do not write down the question. While giving next steps, start by writing the current step number like given format. Do not write previous steps given in answer.
<Solution>

</Solution> 
</Task>
"""

In [None]:
# from data_parser import DataParser
# NOTE(review): DataParser is used by the sampling cells below; re-enable the
# import above if the class is not provided some other way.
import concurrent.futures
import json
import os
import re

import openai

# Shared OpenAI client used by get_cot_samples().
# An empty API_KEY placeholder is converted to None so that the client falls
# back to the OPENAI_API_KEY environment variable; an empty string would be
# taken as an explicit (invalid) key and the fallback would never happen.
client = openai.OpenAI(
    api_key=API_KEY or None,
)

In [None]:
def get_cot_samples(idx, question, answer):
    """Sample chain-of-thought completions for one question and log them as JSON.

    The question is substituted into COT_PROMPT and sent as a single user
    message; RUNS_PER_QUERY completions are requested in one API call. The
    completions, the token usage and the reference answer are written to
    ``LOG_PATH + category_name + '/{idx}.json'``.

    Args:
        idx: Index of the question; becomes the log file name.
        question: Problem statement inserted into the prompt.
        answer: Reference solution, stored unchanged under the 'correct' key.

    Note:
        Reads the module-level ``category_name`` at call time, so that global
        must be set (and the target directory must exist) before calling.
    """
    log_filename = LOG_PATH + category_name + f'/{idx}.json'

    prompt = COT_PROMPT.format(question=question)
    messages = [{"role": "user", "content": prompt}]
    result = client.chat.completions.create(
        messages=messages,
        model=MODEL,
        temperature=TEMPERATURE,
        n=RUNS_PER_QUERY,
        max_tokens=MAX_TOKENS,
    )

    log = {
        'choices': [choice.message.content for choice in result.choices],
        'usage': result.usage.dict(),
        'correct': answer
    }
    # The context manager closes the file even when serialization fails.
    with open(log_filename, 'w') as log_file:
        json.dump(log, log_file, indent=2)

def run_get_cot_samples(args):
    """Unpack an ``(idx, question, answer)`` triple and forward it to get_cot_samples.

    Exists so that a single-argument callable can be handed to an executor.
    """
    sample_idx, sample_question, sample_answer = args
    get_cot_samples(sample_idx, sample_question, sample_answer)

def extract_answer(answer_raw):
    """Return the first number found in `answer_raw`, or the stripped text.

    All commas are removed before searching, so "1,234.5" is read as
    "1234.5". A number is an optional minus sign, digits, and an optional
    decimal part.

    Args:
        answer_raw: Raw answer string.

    Returns:
        The first numeric substring (as a string), or ``answer_raw.strip()``
        when the text contains no digits.
    """
    # re.search stops at the first match instead of collecting every match.
    match = re.search(r'-?\d+\.?\d*', answer_raw.replace(",", ""))
    if match:
        return match.group(0)
    return answer_raw.strip()


def find_answer(solution):
    """Extract the content of the last LaTeX ``\\boxed{...}`` in `solution`.

    Braces nested inside the box are balanced, so ``\\boxed{\\frac{1}{2}}``
    yields ``\\frac{1}{2}``.

    Args:
        solution: Text of a solution that may contain a boxed final answer.

    Returns:
        The text between the braces of the last ``boxed{...}``, or the string
        "ERROR" when there is no "boxed" marker, the marker is not directly
        followed by "{", or the box is never closed.
    """
    marker = "boxed"
    marker_pos = solution.rfind(marker)
    if marker_pos == -1:
        # Without this guard rfind's -1 would make the scan start at index 5.
        return "ERROR"

    # Index of the first character after "boxed{".
    start = marker_pos + len(marker) + 1
    if start > len(solution) or solution[start - 1] != '{':
        return "ERROR"

    # Scan from the very first content character so that a box whose content
    # itself begins with "{" is balanced correctly.
    depth = 0
    for end in range(start, len(solution)):
        char = solution[end]
        if char == '{':
            depth += 1
        elif char == '}':
            if depth == 0:
                return solution[start:end]
            depth -= 1
    # Fail safe for a box that is never closed.
    return "ERROR"


In [None]:
# ALGEBRA
category_index = 0
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')
    

In [None]:
# COUNTING AND PROBABILITY
category_index = 1
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')



In [None]:
# GEOMETRY
category_index = 2
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')

In [None]:
# NUMBER THEORY
category_index = 3
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')

In [None]:
# PRECALCULUS
category_index = 4
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')

In [None]:
# PREALGEBRA
category_index = 5
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')

In [None]:
# INTERMEDIATE ALGEBRA
category_index = 6
category_name = CATEGORIES[category_index]
os.makedirs(LOG_PATH+category_name,exist_ok=True)

# NOTE(review): presumably loads the level-5 problems of this category; only
# questionsList and solutionList are read here.
data_parser = DataParser(DATA_PATH)
data_parser.loadResults('level_5', category_name)

args_list = [(idx, question, answer) for idx, (question, answer) in enumerate(zip(data_parser.questionsList, data_parser.solutionList))]

# Track every future explicitly: the iterator returned by executor.map() was
# never consumed, so exceptions raised in the workers never surfaced. Failures
# are collected and reported while the remaining questions still run.
failed_questions = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(run_get_cot_samples, args): args[0] for args in args_list}
    for future in concurrent.futures.as_completed(futures):
        worker_error = future.exception()
        if worker_error is not None:
            failed_questions[futures[future]] = worker_error

for failed_idx, worker_error in sorted(failed_questions.items()):
    print(f'[{category_name}] question {failed_idx} failed: {worker_error!r}')

---

In [None]:
# Create the result logs: aggregate the per-question sample logs of each category into one JSON file

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[0] ('algebra').
category_index = 0
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on;
    # most_common(1)[0] would raise IndexError in that case.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[1] ('counting_and_probability').
category_index = 1
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on;
    # most_common(1)[0] would raise IndexError in that case.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[2] ('geometry').
category_index = 2
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on;
    # most_common(1)[0] would raise IndexError in that case.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[3] ('number_theory').
category_index = 3
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on;
    # most_common(1)[0] would raise IndexError in that case.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[4] ('precalculus').
category_index = 4
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on;
    # most_common(1)[0] would raise IndexError in that case.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[5] ('prealgebra').
category_index = 5
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)

In [None]:
import os
import json
from collections import Counter

# Majority-vote evaluation for CATEGORIES[6] ('intermediate_algebra').
category_index = 6
category_name = CATEGORIES[category_index]
files = os.listdir(LOG_PATH+category_name)

# Summary counters plus one entry per question index.
result_logs = {
    'correct_count': 0,
    'wrong_count': 0,
    'question_count': 0
}

for file in files:
    # Only the per-question JSON logs are processed.
    if not file.endswith('.json'):
        continue

    index = int(file.replace('.json', ''))
    result_logs[index] = {}

    # Context manager so the log's file handle is released right away.
    with open(LOG_PATH+category_name+'/'+file) as log_file:
        log = json.load(log_file)

    correct_answer = find_answer(log['correct'])
    model_answers = [find_answer(choice) for choice in log['choices']]
    model_answers_count = Counter(model_answers)

    result_logs[index]['correct_answer'] = correct_answer
    result_logs[index]['answer_candidates'] = model_answers_count

    # Samples without a usable boxed answer do not take part in the vote.
    model_answers_count.pop('ERROR', None)

    # When every sample was unusable there is nothing to vote on.
    if model_answers_count:
        selected_answer, count = model_answers_count.most_common(1)[0]
    else:
        selected_answer, count = 'NO SOLUTION FOUND', 0

    result_logs[index]['selected_answer'] = selected_answer
    result_logs[index]['selected_answer_count'] = count

    result_logs['question_count'] += 1

    if selected_answer == correct_answer:
        result_logs['correct_count'] += 1
    else:
        result_logs['wrong_count'] += 1

# Open the result file only once everything is computed, so a failure above
# cannot leave a truncated file behind; `with` flushes and closes it.
with open(LOG_PATH + category_name + '.json', 'w') as result_log_file:
    json.dump(result_logs, result_log_file, indent=2)