In [58]:
import yaml
# Where the prompt collection lives and which entry of it this run evaluates.
prompt_path = "prompts.yaml"
prompt_name = "75_percent_prompt"

# Parse the YAML file into a mapping, then select the configured prompt from it.
with open(prompt_path, mode="r", encoding="utf-8") as prompt_file:
    prompt_dict = yaml.safe_load(prompt_file)

prompt = prompt_dict[prompt_name]

In [59]:
from datasets import load_dataset
import dspy
def init_dataset(prompt_text=None, max_examples=150):
    """Build the evaluation set from the ``AI-MO/aimo-validation-aime`` dataset.

    Every row of the dataset's ``train`` split is wrapped in a ``dspy.Example``
    with two fields: ``problem`` (the prompt text, a fixed separator line and
    the row's problem statement) and ``answer`` (the row's answer, unchanged).
    Only ``problem`` is marked as an input field.

    Args:
        prompt_text: Text placed in front of every problem statement. When
            omitted, the notebook-level ``prompt`` variable is used, which
            matches the previous behavior of this function.
        max_examples: Maximum number of examples to return, taken from the
            start of the split. ``None`` returns every row.

    Returns:
        A list of ``dspy.Example`` objects.

    Side effects:
        Prints the number of examples that were generated.
    """
    if prompt_text is None:
        # Fall back to the prompt loaded earlier in the notebook.
        prompt_text = prompt

    rows = load_dataset("AI-MO/aimo-validation-aime")['train']
    examples = [
        dspy.Example({
            "problem": prompt_text + "\nHere is problem:\n" + row['problem'],
            'answer': row['answer'],
        }).with_inputs("problem")
        for row in rows
    ]

    # A cap larger than the split is harmless: slicing simply returns all rows.
    test_set = examples if max_examples is None else examples[:max_examples]
    print(f"Generated test set with {len(test_set)} examples.")
    return test_set

In [60]:
# Build the evaluation examples once; the evaluation cell below reuses `test_set`.
test_set = init_dataset()

Generated test set with 90 examples.


In [61]:
from dotenv import load_dotenv
import dspy
import os

In [62]:
# Load settings from a .env file (if any), then read the OpenAI key from the
# process environment. `api_key` is None when the variable is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [63]:
# Language model shared by every DSPy module in this notebook.
lm = dspy.LM(
    "openai/gpt-4.1-mini",
    api_key=api_key,
    temperature=1,
    max_tokens=32000,
)

# Register it as the default LM for DSPy.
dspy.configure(lm=lm)

In [64]:
# NOTE: DSPy treats a Signature's docstring as the task instructions given to
# the model, so the docstring below is prompt text — editing it changes what
# the model is asked to do.
class GenerateResponse(dspy.Signature):
    """Solve the problem and provide the answer in the correct format."""
    # Input: the prompt text followed by the problem statement.
    problem = dspy.InputField()
    # Output: the model's final answer; the metric parses it as an integer.
    answer = dspy.OutputField()

# Chain-of-thought wrapper around the signature; its predictions carry a
# `reasoning` field in addition to `answer` (see the evaluation table).
program = dspy.ChainOfThought(GenerateResponse)

In [65]:
def metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """Exact-match metric on integer answers.

    Both the reference answer and the predicted answer are converted with
    ``int()`` before comparison, so leading zeros and surrounding whitespace
    do not matter (``"080"`` matches ``"80"``).

    Args:
        example: Item supporting ``example['answer']``; the value must be
            convertible to ``int``.
        prediction: Object with an ``answer`` attribute holding the model's
            answer.
        trace, pred_name, pred_trace: Accepted for compatibility with callers
            that pass them; not used here.

    Returns:
        1 if the predicted integer equals the reference integer, otherwise 0.
        A prediction that cannot be converted to an integer scores 0.

    Raises:
        ValueError, TypeError: If the reference answer itself is not
            convertible to ``int`` (a data problem that should not be hidden).
    """
    expected = int(example['answer'])
    try:
        predicted = int(prediction.answer)
    except (TypeError, ValueError):
        # ValueError: the text is not an integer literal.
        # TypeError: the answer is None or another non-numeric object.
        return 0
    return int(expected == predicted)

In [66]:
import dspy
# Evaluator that runs a program over every example in `test_set`, scores each
# prediction with `metric`, and shows a progress bar plus a results table.
evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metric,
    num_threads=32,
    display_table=True,
    display_progress=True
)

# In DSPy 3.x the evaluator returns a single result object rather than a
# tuple. Iterating that object yields its field names, so
# `score, results = evaluate(program)` would bind the strings "score" and
# "results". Read the attributes explicitly instead.
evaluation = evaluate(program)
score, results = evaluation.score, evaluation.results

Average Metric: 35.00 / 90 (38.9%): 100%|██████████| 90/90 [03:18<00:00,  2.21s/it]

2025/10/29 19:56:08 INFO dspy.evaluate.evaluate: Average Metric: 35 / 90 (38.9%)





Unnamed: 0,problem,example_answer,reasoning,pred_answer,metric
0,You are a helpful assistant. You are given one math question and m...,116,- Let P(x) = 2x^2 + bx + c and Q(x) = -2x^2 + dx + e. - Both pass ...,116,✔️ [1]
1,You are a helpful assistant. You are given one math question and m...,756,"- Let the spheres have centers O₁, O₂, O₃ and radii 11, 13, and 19...",756,✔️ [1]
2,You are a helpful assistant. You are given one math question and m...,150,- Let the parallelogram be \(ABCD\) with \(\angle BAD < 90^\circ\)...,95,
3,You are a helpful assistant. You are given one math question and m...,245,"- We have the set {1, ..., n}, and sum over all ordered pairs (A,B...",145,
4,You are a helpful assistant. You are given one math question and m...,392,Each number in S has a repeating decimal expansion with period 4 d...,999,
...,...,...,...,...,...
85,You are a helpful assistant. You are given one math question and m...,080,- We have a convex equilateral hexagon ABCDEF with all pairs of op...,370,
86,You are a helpful assistant. You are given one math question and m...,055,"- Let the set A = {a_1, a_2, ..., a_k} with positive integers. - W...",49,
87,You are a helpful assistant. You are given one math question and m...,699,We want the greatest four-digit number N = 1000Q + R (Q = thousand...,699,✔️ [1]
88,You are a helpful assistant. You are given one math question and m...,127,- The torus is formed by revolving a circle of radius 3 around an ...,5,


results
