In [2]:
from dotenv import load_dotenv
import dspy
import os

In [3]:
# Let python-dotenv populate the process environment first, then read the
# OpenAI key from it so the secret is never written into the notebook itself.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
# Task model used by every DSPy module in this notebook. It is registered as
# the global default below, so later cells do not pass an LM explicitly.
lm = dspy.LM(
    "openai/gpt-4.1-mini",
    api_key=api_key,
    temperature=1,
    max_tokens=32000,
)
dspy.configure(lm=lm)

In [5]:
import dspy
from datasets import load_dataset

def init_dataset(seed=0, train_fraction=0.5, test_repeats=5):
    """Build the train / validation / test splits used by this notebook.

    The ``'train'`` split of ``AI-MO/aimo-validation-aime`` is shuffled with a
    seeded RNG and cut into a training part and a validation part. The
    ``'train'`` split of ``MathArena/aime_2025`` keeps its original order and
    is repeated ``test_repeats`` times to form the test set.

    Args:
        seed: Seed for the ``random.Random`` instance that shuffles the
            train/validation pool. The default (0) reproduces the original
            fixed shuffle.
        train_fraction: Share of the shuffled pool that goes to the training
            set (truncated with ``int``); the remainder is the validation set.
            Must lie in ``[0, 1]``.
        test_repeats: How many times the test problems are repeated. The
            repetition is done by list multiplication, so the repeated entries
            are references to the same ``dspy.Example`` objects.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of lists of
        ``dspy.Example`` with ``"problem"`` marked as the input field.
        Train/validation examples carry ``problem``, ``solution`` and
        ``answer``; test examples carry only ``problem`` and ``answer``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]`` or
            ``test_repeats`` is negative.
    """
    # Function-scope import keeps this cell runnable on its own.
    import random

    # Validate before touching the network so bad arguments fail fast.
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be in [0, 1], got {train_fraction!r}")
    if test_repeats < 0:
        raise ValueError(f"test_repeats must be non-negative, got {test_repeats!r}")

    def _to_example(row, fields):
        # Copy only the requested columns and mark "problem" as the model input.
        return dspy.Example({name: row[name] for name in fields}).with_inputs("problem")

    pool = [
        _to_example(row, ("problem", "solution", "answer"))
        for row in load_dataset("AI-MO/aimo-validation-aime")['train']
    ]
    # A dedicated Random instance leaves the global RNG state untouched.
    random.Random(seed).shuffle(pool)
    cut = int(train_fraction * len(pool))

    held_out = [
        _to_example(row, ("problem", "answer"))
        for row in load_dataset("MathArena/aime_2025")['train']
    ]

    train_set = pool[:cut]
    val_set = pool[cut:]
    test_set = held_out * test_repeats

    return train_set, val_set, test_set

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Build the three splits once; later cells reuse these names.
train_set, val_set, test_set = init_dataset()

In [14]:
# Spot-check one training example: show its problem statement and its
# reference answer. `test_point` is the index into train_set.
test_point = 0
print("Problem:", train_set[test_point]['problem'], sep="\n")
# Uncomment to also show the worked solution stored with the example.
# print("\n\nSolution:", train_set[test_point]['solution'], sep="\n")
print("\n\nAnswer:", train_set[test_point]['answer'], sep="\n")

Problem:
In isosceles trapezoid $ABCD$, parallel bases $\overline{AB}$ and $\overline{CD}$ have lengths $500$ and $650$, respectively, and $AD=BC=333$. The angle bisectors of $\angle{A}$ and $\angle{D}$ meet at $P$, and the angle bisectors of $\angle{B}$ and $\angle{C}$ meet at $Q$. Find $PQ$.


Answer:
242


In [13]:
# Sanity check: report how many examples ended up in each split.
print(f"{len(test_set)=}, {len(train_set)=}, {len(val_set)=}")

len(test_set)=150, len(train_set)=45, len(val_set)=45


In [10]:
# DSPy signature for the task: one input field (`problem`) and one output
# field (`answer`).
# NOTE(review): the class docstring below is presumably used by DSPy as the
# initial instruction text of the prompt, so treat its wording as runtime
# behavior rather than as documentation. Confirm before editing it.
class GenerateResponse(dspy.Signature):
    """Solve the problem and provide the answer in the correct format."""
    # The AIME problem statement passed to the model.
    problem = dspy.InputField()
    # The model's final answer; the metrics in this notebook parse it with int().
    answer = dspy.OutputField()

# Baseline program: the signature wrapped in a chain-of-thought module. This is
# the object that gets evaluated and later handed to the optimizer.
program = dspy.ChainOfThought(GenerateResponse)

In [None]:
def metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """
    An evaluation metric for our evaluator: exact integer match.

    Args:
        example: Gold example; ``example['answer']`` must be convertible with
            ``int()``.
        prediction: Program output; ``prediction.answer`` is parsed with
            ``int()``.
        trace, pred_name, pred_trace: Accepted for signature compatibility and
            not used here.

    Returns:
        1 if the parsed prediction equals the gold answer, otherwise 0. An
        answer that cannot be parsed as an integer (including ``None``)
        scores 0.
    """
    correct_answer = int(example['answer'])
    try:
        llm_answer = int(prediction.answer)
    except (ValueError, TypeError):
        # ValueError: the text is not an integer literal (e.g. "\\boxed{42}").
        # TypeError: the answer is None or another non-string, non-numeric
        # object. Both are wrong answers, not reasons to abort the whole run.
        return 0
    return int(correct_answer == llm_answer)

In [12]:
import dspy
# Reusable evaluator over the test set, scored with the exact-match `metric`.
# The same `evaluate` object is called again later on the optimized program.
evaluate = dspy.Evaluate(
    metric=metric,
    devset=test_set,
    num_threads=32,
    display_progress=True,
    display_table=True,
)

# Baseline score of the unoptimized program.
evaluate(program)

Average Metric: 71.00 / 150 (47.3%): 100%|██████████| 150/150 [04:24<00:00,  1.76s/it]

2025/10/07 20:59:42 INFO dspy.evaluate.evaluate: Average Metric: 71 / 150 (47.3%)





Unnamed: 0,problem,example_answer,reasoning,pred_answer,metric
0,Find the sum of all integer bases $b>9$ for which $17_b$ is a divi...,70,We are given the problem of finding all integer bases \(b > 9\) fo...,70,✔️ [1]
1,"On $\triangle ABC$ points $A, D, E$, and $B$ lie in that order on ...",588,Let's analyze the problem step-by-step. **Given:** - On side \( AB...,588,✔️ [1]
2,The 9 members of a baseball team went to an ice-cream parlor after...,16,"We have 9 players, each choosing a flavor from {chocolate (C), van...",16,✔️ [1]
3,"Find the number of ordered pairs $(x,y)$, where both $x$ and $y$ a...",117,"We want to find integer pairs \((x,y)\) with \(-100 \leq x,y \leq ...",117,✔️ [1]
4,There are $8!= 40320$ eight-digit positive integers that use each ...,279,We are asked to find the number of eight-digit integers formed fro...,279,✔️ [1]
...,...,...,...,...,...
145,Let $S$ be the set of vertices of a regular $24$-gon. Find the num...,113,"We are given a regular 24-gon with vertex set \( S \), and we want...",4480,
146,Let $A_1 A_2 A_3 \ldots A_{11}$ be an $11$-sided non-convex simple...,19,We have an 11-sided polygon \( A_1 A_2 \ldots A_{11} \) with the f...,19,✔️ [1]
147,"Let $x_1, x_2, x_3, \ldots$ be a sequence of rational numbers defi...",248,We are given a sequence \((x_k)\) defined as: \[ x_1 = \frac{25}{1...,616,
148,Let $\triangle ABC$ be a right triangle with $\angle A = 90^\circ$...,104,Given a right triangle \(\triangle ABC\) with \(\angle A = 90^\cir...,104,✔️ [1]


EvaluationResult(score=47.33, results=<list of 150 results>)

In [15]:
def metric_with_feedback(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """Score a prediction and attach textual feedback to the score.

    Args:
        example: Gold example; ``example['answer']`` must be convertible with
            ``int()``. An optional ``'solution'`` entry, when non-empty, is
            quoted in the feedback.
        prediction: Program output; ``prediction.answer`` is parsed with
            ``int()``.
        trace, pred_name, pred_trace: Accepted for signature compatibility and
            not used here.

    Returns:
        ``dspy.Prediction`` with ``score`` (1 on exact integer match, else 0)
        and ``feedback`` (a message stating whether the answer was correct or
        unparseable, the correct answer, and the written solution if one is
        available).
    """
    correct_answer = int(example['answer'])
    written_solution = example.get('solution', '')
    try:
        llm_answer = int(prediction.answer)
    except (ValueError, TypeError):
        # ValueError: non-integer text. TypeError: the answer is None or some
        # other non-string object. Either way, report a formatting failure
        # instead of letting the exception abort the optimization run.
        feedback_text = f"The final answer must be a valid integer and nothing else. You responded with '{prediction.answer}', which couldn't be parsed as a python integer. Please ensure your answer is a valid integer without any additional text or formatting."
        feedback_text += f" The correct answer is '{correct_answer}'."
        if written_solution:
            feedback_text += f" Here's the full step-by-step solution:\n{written_solution}\n\nThink about what takeaways you can learn from this solution to improve your future answers and approach to similar problems and ensure your final answer is a valid integer."
        return dspy.Prediction(score=0, feedback=feedback_text)

    score = int(correct_answer == llm_answer)

    if score == 1:
        feedback_text = f"Your answer is correct. The correct answer is '{correct_answer}'."
    else:
        feedback_text = f"Your answer is incorrect. The correct answer is '{correct_answer}'."

    # The worked solution is appended for correct answers as well.
    if written_solution:
        feedback_text += f" Here's the full step-by-step solution:\n{written_solution}\n\nThink about what takeaways you can learn from this solution to improve your future answers and approach to similar problems."

    return dspy.Prediction(score=score, feedback=feedback_text)

In [None]:
from dspy import GEPA

# GEPA optimizer driven by the feedback-producing metric. A separate LM
# (`reflection_lm`) is supplied for the optimizer's reflection step.
optimizer = GEPA(
    metric=metric_with_feedback,
    reflection_lm=dspy.LM(
        model="gpt-5",
        api_key=api_key,
        temperature=1.0,
        max_tokens=32000,
    ),
    reflection_minibatch_size=3,
    # Changing this auto value to higher values will lead to better performance.
    auto="light",
    num_threads=32,
    track_stats=True,
)

# Optimize the baseline program on the training split, using the validation
# split as the optimizer's validation set.
optimized_program = optimizer.compile(
    program,
    valset=val_set,
    trainset=train_set,
)

2025/10/07 21:24:45 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 560 metric calls of the program. This amounts to 6.22 full evals on the train+val set.
2025/10/07 21:24:45 INFO dspy.teleprompt.gepa.gepa: Using 45 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/560 [00:00<?, ?rollouts/s]2025/10/07 21:28:34 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 45 (37.8%)
2025/10/07 21:28:34 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.37777777777777777
GEPA Optimization:   8%|▊         | 45/560 [03:49<43:46,  5.10s/rollouts]2025/10/07 21:28:34 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.37777777777777777


Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:03<00:00, 21.04s/it] 

2025/10/07 21:29:37 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/07 21:31:22 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: Task: Solve the given math problem and return your work in a strict two-part output format.

Input format:
- You will receive a single field named "problem" containing the full problem statement as plain text.

Output format:
- Provide exactly two top-level sections, in this order:
  1) reasoning
  2) answer
- The "reasoning" section should contain a concise, correct solution outline with key steps and justifications.
- The "answer" section must contain only the final result requested by the problem (e.g., a single integer or expression), with no extra words, symbols, or formatting.

General solution guidelines:
- Aim for a short, logically structured solution. Prefer algebraic identities, modular arithmetic, structural observations, and bounding arguments over brute force. If enumeration is required, keep it minimal and systematic, and justify completeness.
- Always verify candidate solutio

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:50<00:00, 16.99s/it]

2025/10/07 21:34:51 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/07 21:36:06 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: You are given a single math problem. Solve it correctly and return your work in a strict, parseable format.

Output format:
- Produce exactly two sections in this order:
  1) A section header "### reasoning" followed by your step-by-step solution and checks.
  2) A section header "### answer" followed by ONLY the final numeric answer as a plain integer (no LaTeX, no words, no punctuation, no boxes, no quotes, no units).
- The content under "### answer" must be a valid Python int literal. Do not include any additional text, symbols, or formatting. Example: 242

General guidelines:
- Use exact arithmetic whenever possible; avoid decimal approximations unless explicitly required. If the final result is an integer, do not present approximations.
- If the problem asks for a remainder modulo m, ensure your answer is the unique integer in [0, m−1].
- Do not include extra commentary or formatting ar

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [02:03<00:00, 41.31s/it] 

2025/10/07 21:58:11 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/07 22:00:02 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: You will be given a single “problem” to solve. Your task is to compute the correct result and output ONLY the final answer in the exact format the problem requests (typically a single integer). Do not include units, explanations, steps, or extra text.

General formatting rules:
- Output only the final value required (e.g., 756 or 751 or 73).
- If the problem asks for m+n after expressing a fraction m/n in lowest terms, compute that and output the integer m+n.
- If the problem asks for a squared distance, output the integer value of the square (do not simplify to radicals unless explicitly asked).
- No LaTeX, no “boxed”, no labels, no reasoning—just the answer.

Problem-solving guidance and domain-specific methods (use internally to ensure correctness):

A) Plane intersecting tangent spheres with congruent circular cross-sections:
- For spheres with radii r1, r2, r3 and centers mutually exter

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:50<00:00, 16.98s/it] 

2025/10/07 22:05:41 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/07 22:07:37 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: You will be given a single “problem” to solve. Your task is to compute the correct result and output ONLY the final answer in the exact format the problem requests (typically a single integer). Do not include units, explanations, steps, or extra text.

General formatting rules:
- Output only the final value required (e.g., 756 or 751 or 73).
- If the problem asks for m+n after expressing a fraction m/n in lowest terms, compute that and output the integer m+n.
- If the problem asks for a squared distance, output the integer value of the square (do not simplify to radicals unless explicitly asked).
- No LaTeX, no “boxed”, no labels, no reasoning—just the answer.

Domain-specific methods and robust workflows (use internally to ensure correctness):

A) Plane intersecting tangent spheres with congruent circular cross-sections:
- For spheres with radii r1, r2, r3 and centers mutually externally ta

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [01:49<00:00, 36.57s/it]

2025/10/07 22:12:08 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/07 22:13:43 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: You are given a single math problem. Solve it correctly and return your work in a strict, parseable format.

Output format:
- Produce exactly two sections in this order:
  1) A section header "### reasoning" followed by your step-by-step solution and checks.
  2) A section header "### answer" followed by ONLY the final numeric answer as a plain integer (no LaTeX, no words, no punctuation, no boxes, no quotes, no units).
- The content under "### answer" must be a valid Python int literal. Do not include any additional text, symbols, or formatting. Example: 242

General guidelines:
- Use exact arithmetic whenever possible; avoid decimal approximations unless explicitly required. If the final result is an integer, do not present approximations.
- If the problem asks for a remainder modulo m, ensure your answer is the unique integer in [0, m−1].
- If the problem asks for m+n where a quantity equ

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:27<00:00, 29.09s/it] 

2025/10/07 22:20:33 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/07 22:22:31 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: You are given a single math problem. Solve it correctly and return your work in a strict, parseable format.

Output format:
- Produce exactly two sections in this order:
  1) A section header "### reasoning" followed by your step-by-step solution and checks.
  2) A section header "### answer" followed by ONLY the final numeric answer as a plain integer (no LaTeX, no words, no punctuation, no boxes, no quotes, no units).
- The content under "### answer" must be a valid Python int literal. Do not include any additional text, symbols, or formatting. Example: 242

General guidelines:
- Use exact arithmetic throughout; avoid decimal approximations unless explicitly required. If surds or rational values occur mid-solution, keep them exact; simplify fully before the final answer.
- If the result is an integer (as in many contest problems), do not present approximations; finish with the exact intege

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [02:14<00:00, 45.00s/it]

2025/10/07 22:29:53 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/10/07 22:37:21 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: You are given a single math problem. Solve it correctly and return your work in a strict, parseable format.

Output format:
- Produce exactly two sections in this order:
  1) A section header "### reasoning" followed by your step-by-step solution and checks.
  2) A section header "### answer" followed by ONLY the final numeric answer as a plain integer (no LaTeX, no words, no punctuation, no boxes, no quotes, no units).
- The content under "### answer" must be a valid Python int literal. Do not include any additional text, symbols, or formatting. Example: 242

General guidelines:
- Use exact arithmetic whenever possible; avoid decimal approximations unless explicitly required. If the final result is an integer, do not present approximations.
- If the problem defines m, n (e.g., expression m√n with n squarefree, or a fraction p/q reduced), ensure you interpret the request correctly (e.g., rep

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:20<00:00, 26.70s/it] 

2025/10/07 22:44:01 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/07 22:45:41 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: You are given a single olympiad-style math problem. Solve it correctly and return your work in a strict, parseable format.

Output format:
- Produce exactly two sections in this order:
  1) A section header "### reasoning" followed by your step-by-step solution and checks.
  2) A section header "### answer" followed by ONLY the final numeric answer as a plain integer (no LaTeX, no words, no punctuation, no boxes, no quotes, no units).
- The content under "### answer" must be a valid Python int literal. Do not include any additional text, symbols, or formatting. Example: 242

General solution guidelines:
- Use exact arithmetic; avoid decimals unless required. If the final result is an integer, do not present approximations.
- If the problem asks for m+n or p+q from a canonical form (e.g., m√n with n squarefree, or a reduced fraction p/q), ensure you derive the correct canonical form before ex

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:53<00:00, 37.71s/it]

2025/10/07 22:53:47 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/07 22:56:04 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: You are given a single math problem. Solve it correctly and return your work in a strict, parseable format.

Output format:
- Produce exactly two sections in this order:
  1) A section header "### reasoning" followed by your step-by-step solution and checks.
  2) A section header "### answer" followed by ONLY the final numeric answer as a plain integer (no LaTeX, no words, no punctuation, no boxes, no quotes, no units).
- The content under "### answer" must be a valid Python int literal with no leading zeros (except 0 itself). Do not include any additional text, symbols, or formatting. Example: 242

General guidelines:
- Use exact arithmetic throughout; avoid decimal approximations unless explicitly required. If surds or rational values occur mid-solution, keep them exact; simplify fully before the final answer.
- If the result is an integer (as in many contest problems), do not present appr

Average Metric: 1.00 / 1 (100.0%):  33%|███▎      | 1/3 [00:27<00:55, 27.72s/it]

In [None]:
# Show the instruction text attached to the optimized program's `predict` signature.
print(optimized_program.predict.signature.instructions)

In [None]:
# Score the optimized program with the same evaluator used for the baseline run.
evaluate(optimized_program)