In [48]:
import json
import os
import openai
from dotenv import load_dotenv
import anthropic

In [53]:
# Read QA questions from the json file
with open('shuffled_QA_pairs_econ_part2.json') as f:
    data = json.load(f)

# Load the .env file
load_dotenv()
client = openai.OpenAI()
claude_client = anthropic.Anthropic()

In [14]:
data[0]['choices']

['Nonexcludable goods and services include police protection and most roads, as it is challenging to prevent individuals from benefiting from these once they are provided. Public primary education also falls into this category, as it is generally accessible to all, despite the existence of private options that may be excludable. In contrast, streaming music services, cell phone service, and toll roads are excludable, as access is limited to paying customers or users.',
 'Among the options provided, roads and public primary education are considered nonexcludable. Most roads are nonexcludable because, once constructed, it is difficult to prevent individuals from using them, although toll roads can create exceptions. Public primary education is generally nonexcludable as it is available to all children regardless of payment. In contrast, police protection is excludable since access can be limited to those who pay for private security services. Similarly, streaming music services like Siri

In [None]:
from tqdm import tqdm
from collections import defaultdict
log = defaultdict(dict)

for generated_qa in tqdm(data, desc="Generating answers"):
    question = generated_qa['question']
    correct_answer = generated_qa['correct_answer']
    choices = generated_qa['mcq']

    prompt = f"""
    Question: {choices}
    Which of the above choices best answers the question? Give a reason for your choice first, then give your choice.
    Reply a JSON object with the following format:
    {{
        "reason": "string",
        "choice": "int",
    }}
    """
    temperature = 0.0

    response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a student answering a textbook problem."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                response_format={"type": "json_object"}
            )
    
    answer_json = json.loads(response.choices[0].message.content)

    log[question]['reason'] = answer_json['reason']
    log[question]['gpt_choice'] = answer_json['choice']
    log[question]['correct_choice'] = correct_answer + 1

Generating answers: 100%|██████████| 161/161 [04:41<00:00,  1.75s/it]


In [94]:
correct_count = 0
total_count = len(log)

for _, answers in log.items():
    if answers['gpt_choice'] == answers['correct_choice'] + 1:
        correct_count += 1

percentage_correct = (correct_count / total_count) * 100
print(f"Percentage of correct choices: {percentage_correct:.2f}%")

Percentage of correct choices: 65.22%


In [80]:
log[list(log.keys())[0]]

{'reason': 'Choice D best answers the question because it correctly identifies police protection and most roads as nonexcludable, acknowledging the difficulty in preventing individuals from benefiting from these services once provided. It also correctly notes that public primary education is generally nonexcludable, while distinguishing that streaming music services, cell phone service, and toll roads are excludable due to their access restrictions.',
 'gpt_choice': 4,
 'correct_choice': 3}

In [None]:
from anthropic.types.beta.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.beta.messages.batch_create_params import Request
requests = []

for generated_qa in tqdm(data, desc="Generating answers"):
    question = generated_qa['question']
    correct_answer = generated_qa['correct_answer']
    choices = generated_qa['mcq']

    prompt = f"""
    Question: {choices}
    Which of the above choices best answers the question? Give a reason for your choice first, then give your choice as an integer between 0 and 3.
    Reply only a JSON object with the following format:
    {{
        "reason": "string",
        "choice": "int",
    }}
    """
    temperature = 0.0
    params = MessageCreateParamsNonStreaming(
                model="claude-3-5-haiku-latest",
                system = "You are a student answering a textbook problem.",
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                max_tokens=1000,
            )
    
    custom_id = str(hash(question))
    requests.append(Request(custom_id=custom_id, params=params))
    break
    
message_batch = claude_client.beta.messages.batches.create(requests=requests)

Generating answers:   0%|          | 0/161 [00:00<?, ?it/s]


BetaMessageBatch(id='msgbatch_01C2C1yfU2Bac2nApYcucpAf', archived_at=None, cancel_initiated_at=None, created_at=datetime.datetime(2024, 11, 18, 0, 16, 50, 99607, tzinfo=datetime.timezone.utc), ended_at=None, expires_at=datetime.datetime(2024, 11, 19, 0, 16, 50, 99607, tzinfo=datetime.timezone.utc), processing_status='in_progress', request_counts=BetaMessageBatchRequestCounts(canceled=0, errored=0, expired=0, processing=1, succeeded=0), results_url=None, type='message_batch')


In [133]:
anthropic_log = defaultdict(dict)

batch_id = message_batch.id

for result in claude_client.beta.messages.batches.results(batch_id):
    if result.result.type == "succeeded":
        print(f"Success! {result.custom_id}")


# answer_json = json.loads(response.content[0].text)

#     anthropic_log[question]['reason'] = answer_json['reason']
#     anthropic_log[question]['gpt_choice'] = answer_json['choice']
#     anthropic_log[question]['correct_choice'] = correct_answer

AnthropicError: No `results_url` for the given batch; Has it finished processing? in_progress

In [117]:
correct_count = 0
total_count = len(anthropic_log)

for _, answers in log.items():
    if answers['gpt_choice'] == answers['correct_choice'] + 1:
        correct_count += 1

percentage_correct = (correct_count / total_count) * 100
print(f"Percentage of correct choices: {percentage_correct:.2f}%")

Percentage of correct choices: 10500.00%
