In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
from evaluator import Evaluator
from runner import ModelConfig, OpenAIModelRunner
from loaders import TaskLoader
import pandas as pd

## Load API keys and task paths

In [None]:
# Read the API credentials from the local JSON key file
with open("utils/api_keys.json", "r") as keys_file:
    api_keys = json.load(keys_file)

# The test spec is a list of stages; each stage lists its task paths.
with open("test_specs/test_list.json", 'r') as spec_file:
    stage_specs = json.load(spec_file)

# Flatten the per-stage lists into a single list, preserving file order
all_task_paths = [
    task_path
    for stage_spec in stage_specs
    for task_path in stage_spec['task_paths']
]

print(f"Found {len(all_task_paths)} tasks")
print("Sample tasks:")
preview_paths = all_task_paths[:3]  # only the first 3 tasks are printed
for i, path in enumerate(preview_paths):
    print(f"  {i}: {path}")

## Demo with OpenAI API

### Running a single task ###

In [None]:
# Settings for the OpenAI model used throughout this notebook
openai_config = ModelConfig(
    model_name="gpt-4o-2024-05-13",
    api_key=api_keys["open_ai"],
    max_tokens=1000,
    temperature=1.0,
)

# Instantiate the runner directly from the configuration
runner = OpenAIModelRunner(openai_config)

# Select one task by its position in all_task_paths
task_index = 12
single_task_path = all_task_paths[task_index]
print(f"Running task: {all_task_paths[task_index]}")

# TaskLoader wraps the task path; the runner consumes the loader and returns
# a payload whose first element is the task info and second the per-trial results.
loader = TaskLoader(single_task_path)
results = runner.generate_response(loader)

print(f"Task: {results[0]['task']}")
# Stage of the task (low, mid, or high, per the original author's note)
print(f"Stage: {results[0]['stage']}")
# Finer-grained process the task belongs to
print(f"Process: {results[0]['process']}")
# One entry in results[1] per trial
print(f"Number of trials: {len(results[1])}")

In [None]:
# Evaluate the payload produced by the single-task run in the previous cell.
# A fresh Evaluator is used so that only this task's trials are included.
evaluator = Evaluator()
evaluator.evaluate(results)

# Fetch the evaluation table (a pandas DataFrame, per the original note) and
# leave it as the last expression so the notebook renders it as the cell output.
results_df = evaluator.get_result() # get pandas dataframe
results_df

## Inspect Payloads
Each task payload contains `task_info` (`results[0]`) and `results` (`results[1]`).

- **Task info** is a dictionary containing the task information (e.g., `task`, `stage`, and `process`).
- **Results** is a list of dictionaries containing the trial information (e.g., prompt, conversation content, answer key, and model response for each trial).

In [None]:
# Header for the preview of model responses
print("Sample Model Responses:")
print("=" * 50)

# Only the first three trials of the single-task payload are previewed
sample_trials = results[1][:3]
for trial in sample_trials:
    print(f"\nTrial {trial['trial_id']}:")
    # The prompt is truncated to 100 characters; per the original note,
    # prompts are the same for all trials within a task.
    print(f"Prompt: {trial['prompt'][:100]}...")
    # Per the original note, the final model response is the part indicated within {}
    print(f"Model Response: {trial['model_response']}")
    print(f"Correct Answer: {trial['answer_key']}")
    print("-" * 30)

## Batch Processing Demo
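Run every task in `all_task_paths` with the same model configuration and accumulate the evaluations in a single `Evaluator`.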

In [None]:
# Run every task in all_task_paths and accumulate the evaluations in a single
# Evaluator. Loop-scoped objects use batch-specific names so that this cell does
# not overwrite `loader`, `runner`, and `results` from the single-task demo;
# re-running the earlier "Evaluate" / "Inspect Payloads" cells afterwards still
# shows the single task rather than the last task of the batch.
batch_evaluator = Evaluator()
batch_results = []  # raw payloads (task info + trial results), one entry per task

total_tasks = len(all_task_paths)
for task_number, task_path in enumerate(all_task_paths, start=1):
    print(f"\nProcessing task {task_number}/{total_tasks}: {task_path}")

    batch_loader = TaskLoader(task_path)
    # A fresh runner is built per task, exactly as before.
    # NOTE(review): could probably be hoisted out of the loop if the runner
    # keeps no per-task state — confirm against OpenAIModelRunner.
    batch_runner = OpenAIModelRunner(openai_config)
    task_results = batch_runner.generate_response(batch_loader)

    # Keep the raw payload for later inspection, then score it
    batch_results.append(task_results)
    batch_evaluator.evaluate(task_results)
    print(f"✓ Completed: {task_results[0]['task']}")

In [None]:
# Show batch results
# Per the original author's note, get_result() returns everything evaluated
# since batch_evaluator was created. Keep the table in a variable: a bare
# expression is only rendered when it is the LAST statement of a cell, so the
# previous un-assigned call was silently discarded and nothing was shown.
batch_results_df = batch_evaluator.get_result()

# Persist the evaluations; the model name is embedded in the file name
batch_evaluator.save_as_csv(f"results_{openai_config.model_name}.csv")

# Last expression of the cell -> rendered as the cell output
batch_results_df