In [1]:
import json
import os
import openai
from dotenv import load_dotenv
# import anthropic

In [2]:
# Read QA questions from the json file
with open('shuffled_QA_pairs_econ_part2.json') as f:
    data = json.load(f)

# Load the .env file
load_dotenv()
client = openai.OpenAI()
# claude_client = anthropic.Anthropic()

In [None]:
from tqdm import tqdm
from collections import defaultdict
log = defaultdict(dict)

for generated_qa in tqdm(data, desc="Generating answers"):
    question = generated_qa['question']
    correct_answer = generated_qa['correct_answer']
    choices = generated_qa['mcq']

    prompt = f"""
    Question: {choices}
    Which of the above choices best answers the question? Give a reason for your choice first, then give your choice.
    Reply a JSON object with the following format:
    {{
        "reason": "string",
        "choice": "int",
    }}
    """
    temperature = 0.0

    response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a student answering a textbook problem."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                response_format={"type": "json_object"}
            )
    
    answer_json = json.loads(response.choices[0].message.content)

    log[question]['reason'] = answer_json['reason']
    log[question]['gpt_choice'] = answer_json['choice']
    log[question]['correct_choice'] = correct_answer + 1

Generating answers: 100%|██████████| 161/161 [04:41<00:00,  1.75s/it]


In [94]:
correct_count = 0
total_count = len(log)

for _, answers in log.items():
    if answers['gpt_choice'] == answers['correct_choice'] + 1:
        correct_count += 1

percentage_correct = (correct_count / total_count) * 100
print(f"Percentage of correct choices: {percentage_correct:.2f}%")

Percentage of correct choices: 65.22%


In [2]:
# Read QA questions from the json file
with open('shuffled_QA_pairs_econ_part2.json') as f:
    data = json.load(f)

# Load the .env file
load_dotenv()
client = openai.OpenAI()
# claude_client = anthropic.Anthropic()

In [4]:
import re

for i, q in enumerate(data):    
    # remove the empty newlines and first few numbers followed by a period in the question by regex    
    clean_question = re.sub(r'^\s*\d+\.\s*', '', q['question'])    
    data[i]['clean_question'] = clean_question

In [7]:
from tqdm import tqdm
from collections import defaultdict
classification_log = defaultdict(dict)

for generated_qa in tqdm(data, desc="Generating answers"):
    question_key = generated_qa['question']
    question = generated_qa['clean_question']

    system_prompt = """
    You will be presented with a problem pulled from a finance textbook. 
    Your goal is to determine three things: whether the question has enough information to be answered, if it does, what type of question it is, and if it is fit for use as a multiple-choice question.
    
    When determining if the question has enough information, classify it as one of the following:
    1. Incomplete: Is the question complete? Are there any missing words or phrases, or references to other parts of the text that are not included?
    2. Missing reference: Does the question contain all the reference materials? Are there any missing tables, charts, or other information that is necessary to answer the question?
    3. Missing context: Does the question contain all the necessary context? Does it ask for information from sections in the textbook, which are not included in the question?
    9. Other: The question does not have enough information, but it does not fit into any of the above categories.
    0. Complete: The question has all the necessary information to be answered.

    When determining the type of question, classify the question as one of the following:
    1. Conceptual: The question asks for an explanation of a concept or theory.
    2. Computation: The question asks for a calculation or numerical answer.
    3. True/False: The question asks for a true or false answer.
    4. Graphical: The question asks for a graph or chart to be drawn.
    5. Reasoning: The question asks for a logical explanation or reasoning.
    9. Other: The question does not fit into any of the above categories.

    When determining if the question is fit for use as a multiple-choice question, simply answer yes or no by assigning a 1 or 0, respectively.

    You will also be asked to provide a reason for your answer. Please provide a detailed explanation for your choice.
    When you answer, use the associated number to indicate your choice for each of the three questions.
    For example, a complete question that is a conceptual question and is not fit for use as a multiple-choice question would be answered with:
    0, 1, 0
    """

    question_prompt = f"""
    Question: {question}
    For your answer, reply a JSON object with the following format:
    {{
        "completeness_reason": "string",
        "type_reason": "string",
        "mcq_reason": "string",
        "completeness_choice": "int",
        "type_choice": "int",
        "mcq_choice": "int"
    }}
    """
    temperature = 0.0

    response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question_prompt}
                ],
                temperature=temperature,
                response_format={"type": "json_object"}
            )
    
    answer_json = json.loads(response.choices[0].message.content)

    answer_dict = {
        "completeness_reason": answer_json['completeness_reason'],
        "type_reason": answer_json['type_reason'],
        "mcq_reason": answer_json['mcq_reason'],
        "completeness_choice": answer_json['completeness_choice'],
        "type_choice": answer_json['type_choice'],
        "mcq_choice": answer_json['mcq_choice']
    }

    classification_log[question_key] = answer_dict

Generating answers: 100%|██████████| 161/161 [04:56<00:00,  1.84s/it]


In [15]:
import pickle
with open('log.pkl', 'wb') as f:
    pickle.dump(log, f)

In [9]:
import pickle
with open('classification_log.pkl', 'wb') as f:
    pickle.dump(classification_log, f)

In [11]:
import pickle
with open('log.pkl', 'rb') as f:
    log = pickle.load(f)

In [12]:
for original_qa in data:
    question = original_qa['question']
    log[question]['question'] = original_qa['mcq']

In [14]:
for i, individual_data in enumerate(data):
    question = individual_data['question']
    log[question]['index'] = i

In [5]:
import random

for _ in range(10):
    question = random.choice(data)['question']
    print(f"Question: {log[question]['question']}")
    print(f"Correct choice: {log[question]['correct_choice'] + 1}")
    print(f"GPT choice: {log[question]['gpt_choice']}")
    print(f"Reason: {log[question]['reason']}")
    print()
    print()

Question: 
11. 

 Select the correct answer. A price floor will usually shift:

demand
supply
both
neither

Illustrate your answer with a diagram.


A: A price floor is a minimum price set by the government that must be paid for a good or service, and it typically shifts both the demand and supply curves, leading to changes in market equilibrium. When a price floor is established above the equilibrium price, it creates a condition of excess supply, where the quantity supplied exceeds the quantity demanded. This disruption of the balance at equilibrium can be illustrated with a diagram showing the demand and supply curves intersecting at the equilibrium point, with the price floor positioned above this intersection, highlighting the resulting excess supply in the market.
B: A price floor is a minimum price established by the government that must be paid for a good or service, and it does not shift the demand or supply curves; rather, it sets a minimum price that can disrupt market equil

In [None]:
from anthropic.types.beta.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.beta.messages.batch_create_params import Request
requests = []

for generated_qa in tqdm(data, desc="Generating answers"):
    question = generated_qa['question']
    correct_answer = generated_qa['correct_answer']
    choices = generated_qa['mcq']

    prompt = f"""
    Question: {choices}
    Which of the above choices best answers the question? Give a reason for your choice first, then give your choice as an integer between 0 and 3.
    Reply only a JSON object with the following format:
    {{
        "reason": "string",
        "choice": "int",
    }}
    """
    temperature = 0.0
    params = MessageCreateParamsNonStreaming(
                model="claude-3-5-haiku-latest",
                system = "You are a student answering a textbook problem.",
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                max_tokens=1000,
            )
    
    custom_id = str(hash(question))
    requests.append(Request(custom_id=custom_id, params=params))
    break
    
message_batch = claude_client.beta.messages.batches.create(requests=requests)

Generating answers:   0%|          | 0/161 [00:00<?, ?it/s]


BetaMessageBatch(id='msgbatch_01C2C1yfU2Bac2nApYcucpAf', archived_at=None, cancel_initiated_at=None, created_at=datetime.datetime(2024, 11, 18, 0, 16, 50, 99607, tzinfo=datetime.timezone.utc), ended_at=None, expires_at=datetime.datetime(2024, 11, 19, 0, 16, 50, 99607, tzinfo=datetime.timezone.utc), processing_status='in_progress', request_counts=BetaMessageBatchRequestCounts(canceled=0, errored=0, expired=0, processing=1, succeeded=0), results_url=None, type='message_batch')


In [133]:
anthropic_log = defaultdict(dict)

batch_id = message_batch.id

for result in claude_client.beta.messages.batches.results(batch_id):
    if result.result.type == "succeeded":
        print(f"Success! {result.custom_id}")


# answer_json = json.loads(response.content[0].text)

#     anthropic_log[question]['reason'] = answer_json['reason']
#     anthropic_log[question]['gpt_choice'] = answer_json['choice']
#     anthropic_log[question]['correct_choice'] = correct_answer

AnthropicError: No `results_url` for the given batch; Has it finished processing? in_progress

In [117]:
correct_count = 0
total_count = len(anthropic_log)

for _, answers in log.items():
    if answers['gpt_choice'] == answers['correct_choice'] + 1:
        correct_count += 1

percentage_correct = (correct_count / total_count) * 100
print(f"Percentage of correct choices: {percentage_correct:.2f}%")

Percentage of correct choices: 10500.00%
