In [10]:
%load_ext autoreload
%autoreload 2

import json
import os
from evaluator import Evaluator
from universal_runner import ModelConfig, OpenAIModelRunner
from loaders import TaskLoader
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load API keys and task paths

In [11]:
# Get API keys
with open("utils/api_keys.json", "r") as f:
    api_keys = json.load(f)

# Load task paths
with open("test_specs/test_list.json", 'r') as file:
    all_task_paths = []
    for stage in json.load(file):
        all_task_paths.extend(stage['task_paths'])

print(f"Found {len(all_task_paths)} tasks")
print("Sample tasks:")
for i, path in enumerate(all_task_paths[:3]):
    print(f"  {i}: {path}")

Found 31 tasks
Sample tasks:
  0: test_specs/low/borb_orientation_meta.json
  1: test_specs/low/borb_line_length_comparison_meta.json
  2: test_specs/low/borb_size_comparison_meta.json


## Demo with OpenAI API

### Running a single task ###

In [14]:
# Configure OpenAI model
openai_config = ModelConfig(
    model_name="gpt-4o",
    api_key=api_keys["open_ai"],
    max_tokens=100,
    temperature=1.0
)

# Create runner using factory function
runner = OpenAIModelRunner(openai_config)

# Run a single task
task_index = 12 
print(f"Running task: {all_task_paths[task_index]}")

loader = TaskLoader(all_task_paths[task_index]) # Use TaskLoader to load the task
results = runner.generate_response(loader) # return task info and results (payload + model responses)

print(f"Task: {results[0]['task']}")
print(f"Stage: {results[0]['stage']}") # stage of the task: low, mid, or high
print(f"Process: {results[0]['process']}") # which finer-grained process the task belongs to
print(f"Number of trials: {len(results[1])}") # number of trials in the task

Running task: test_specs/mid/mindset_decomposition_meta.json


Getting model responses: 100%|██████████| 30/30 [00:41<00:00,  1.38s/it]

Task: mindset_decomposition
Stage: mid
Process: property_biases
Number of trials: 30





In [16]:
# Evaluate results
evaluator = Evaluator()
evaluator.evaluate(results)

# Display results
results_df = evaluator.get_result() # get pandas dataframe
results_df

Unnamed: 0,task,task_type,stage,process,num_trials,raw_score,percent_score
0,mindset_decomposition,match_to_sample,mid,property_biases,30,26,0.866667


## Inspect Payloads
Each task payload contains `task_info` and `results`
Task info is a dictionary containing the task information.
Results is a list of dictionaries containing the trial information (e.g., prompt, conversation content, answer key, and model response for each trial).

In [37]:
# Show first few responses
print("Sample Model Responses:")
print("=" * 50)

# Look at the results
for i, trial in enumerate(results[:3]):
    print(f"\nTrial {trial['trial_id']}:")
    print(f"Prompt: {trial['prompt'][:100]}...") # Prompts are same for all the trials within a task.
    print(f"Model Response: {trial['model_response']}") # Final model responses are those indicated within {}
    print(f"Correct Answer: {trial['answer_key']}")
    print("-" * 30)

Sample Model Responses:

Trial trial_000:
Prompt: Here's a task. I will present you with three pictures. The first one is the target/reference image. ...
Model Response: {2}
Correct Answer: 1
------------------------------

Trial trial_001:
Prompt: Here's a task. I will present you with three pictures. The first one is the target/reference image. ...
Model Response: The second option most closely resembles the target/reference image. 

{2}
Correct Answer: 2
------------------------------

Trial trial_002:
Prompt: Here's a task. I will present you with three pictures. The first one is the target/reference image. ...
Model Response: {2}
Correct Answer: 1
------------------------------


## Batch Processing Demo
Run multiple tasks

In [17]:
# Run multiple tasks
batch_evaluator = Evaluator()
batch_results = []

for i, task_path in enumerate(all_task_paths[:3]): 
    print(f"\nProcessing task {i+1}/3: {task_path}")
    
    loader = TaskLoader(task_path)
    runner = OpenAIModelRunner(openai_config)
    results = runner.generate_response(loader)
    batch_evaluator.evaluate(results)
    print(f"✓ Completed: {task_info['task']}")


Processing task 1/3: test_specs/low/borb_orientation_meta.json


Getting model responses: 100%|██████████| 30/30 [00:24<00:00,  1.24it/s]


✓ Completed: mindset_decomposition

Processing task 2/3: test_specs/low/borb_line_length_comparison_meta.json


Getting model responses: 100%|██████████| 30/30 [00:24<00:00,  1.22it/s]


✓ Completed: mindset_decomposition

Processing task 3/3: test_specs/low/borb_size_comparison_meta.json


Getting model responses: 100%|██████████| 30/30 [00:27<00:00,  1.08it/s]

✓ Completed: mindset_decomposition





In [None]:
# Show batch results
batch_evaluator.get_result() # get_results appends all the results evaluated after Evaluator initialization

Unnamed: 0,task,task_type,stage,process,num_trials,raw_score,percent_score
0,borb_orientation,yes_no,low,simple_element_judgements,30,21,0.7
1,borb_line_length_comparison,same_different,low,simple_element_judgements,30,20,0.666667
2,borb_size_comparison,same_different,low,simple_element_judgements,30,17,0.566667
