# Prompt Optimization Example

In [1]:
!pip install nest_asyncio



In [2]:
import sys
import os
import asyncio
import time
import json
from pathlib import Path

from typing import Optional, Any
from pydantic import BaseModel, Field

from dotenv import load_dotenv

from agentics.core.agentics import Agentics as AG
from agentics.core.utils import chunk_list

import loguru
from huggingface_hub import snapshot_download

2025-09-11 10:17:36.359 | DEBUG    | agentics.core.llm_connections:<module>:90 - AGENTICS is connecting to the following LLM API providers:
2025-09-11 10:17:36.360 | DEBUG    | agentics.core.llm_connections:<module>:93 - 0 - WatsonX
2025-09-11 10:17:36.361 | DEBUG    | agentics.core.llm_connections:<module>:104 - Please add API keys in .env file to add or disconnect providers.
2025-09-11 10:17:36.384 | DEBUG    | agentics.core.llm_connections:get_llm_provider:29 - No LLM provider specified. Using the first available provider.
2025-09-11 10:17:36.385 | DEBUG    | agentics.core.llm_connections:get_llm_provider:31 - Available LLM providers: ['watsonx']. Using 'watsonx'
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import nest_asyncio
nest_asyncio.apply()

## Define Data Model for GSM8K Dataset

* Each problem in GSM8K dataset has question and answer fields. 
* In data folder, we provide the post-processed dataset that separates the thought in the think filed and the integer answer  in the numeric field.
* The response_think and response_answer are the output field, and correct is a slot to store if the response answer was correct.

### async function modify_dataset
* We can use `modify_dataset` as a mapping function to asynchronous map to post process the dataset.

In [4]:
class GSM8K(BaseModel):
    question: Optional[str] = Field(None, description="a grade school math question.")
    answer: Optional[str] = Field(None, description="the ground-truth answer to the question including the reasoning, and #### formating.")
    think: Optional[str] = Field(None, description="the step by step reasoning process to derive answer.")
    numeric: Optional[str] = Field(None, description="the number extracted from the final answer to compare with the response answer.")
    response_think: Optional[str] = Field(None, description="the step by step reasoning of response, usually between <think> and </think> tags in CoT prompting.")
    response_answer: Optional[str] = Field(None, description="the number extracted from the final answer to the question that ignores units, etc.")
    correct: Optional[bool] = Field(None, description="place holder for storing True if the answer in the response was correct.")
    
    @staticmethod
    async def grade(state: "GSM8K")->"GSM8K":
        extracted_answer = GSM8K.regex_extract_answer(state.response_answer)
        state.correct = (state.numeric == extracted_answer)
        return state

    @staticmethod
    async def modify_dataset(state: "GSM8K")->"GSM8K":
        think_temp, num_temp = state.answer.split("####")
        state.think = think_temp.strip()
        state.numeric = GSM8K.regex_extract_answer(num_temp.strip())
        return state

    @staticmethod
    def regex_extract_answer(expr:str)->str:
        import re
        ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
        ANS_RE2 = re.compile(r"(\-?[0-9\.\,]+)")
        INVALID_ANS = "[invalid]"    
        match = ANS_RE.search(expr)
        if match:
            match_str = match.group(1).strip()
            match_str = match_str.replace(",", "")
            return match_str
        else:
            match2 = ANS_RE2.search(expr.strip())
            if match2:
                match_str = match2.group(1).strip()
                match_str = match_str.replace(",", "")
                return match_str
        return INVALID_ANS

## Meta Prompt for Prompt Optimization

* We demonstrate the prompt optimization method that uses meta-prompt (Large Language Model As Optimizers, Yang et al 2024).
* In this example, we optimzie the system prompt by searching the "role", "goal", "expected output", and "imperative" sentence.

In [5]:
OPT_META_INSTRUCTION = """Your proposed prompt template will be used in the following way.
* You are "role" -- this role must be suitable for solving the demo task.
* Your personal goal is: "goal" -- the goal achieves the outputs given inputs.
* This is the expected criteria for your final answer "expected_output" -- this constrains the output format.
* You can add a short imperative instruction "imperative" -- this comes after the input of the task.

[[Several demo tasks of input and outputs will be provided when you solve problem.]]

[[The previous optimized prompt templates with scores appear from the worst to the best.]]
{optimization_history}

* Given the previous optimization results, don't generate duplicate or similar prompt templates.
* Generate prompt template that achieves the best score, and succint and concise instructions.
"""


USER_PROMPT_TEMPLATE = """
You are {role}.
Your personal goal is: {goal}.
This is the expected criteria for your final answer: {expected_output}.

solve the following task.
{question}

{imperative}
"""

## Define Data Model for Prompt Optimization Task

* The following cell defines the data model for optimization task.
* The optimization task has demo tasks from the training set.
* The role, goal, expected output, imperative are the slots for storing the response from LLMs.
* The score stores the evaluated score for the proposed prompt.

### Type manipulations 

* In `create_optimization_demos`, the method takes the training data set.
* The demo data set is created by cloning it and making modification to the underlying aType.
* It create a subtype with two fields using `subset_atype` and rebind the type to the demo dataset using `rebind_atype`.
* When we need to modify the underlying data type for AG class, we can modify the data type and rebind it to the AG.


In [6]:
class OptimizationTask(BaseModel):
    demos: Optional[list[Any]] = Field(None, description="optimization demo tasks to undertand the problem domain")
    role: Optional[str] = Field(None, description="New role instruction suggested by LLM")
    goal: Optional[str] = Field(None, description="New goal instruction suggested by LLM")
    expected_output: Optional[str] = Field(None, description="New expected_output instruction suggested by LLM")
    imperative: Optional[str] = Field(None, description="New imperative suggested by LLM")
    score: Optional[int] = Field(None, description="evaluation score of optimization output")

    @staticmethod
    def create_optimization_demos(dataset: AG, num_demos:int)->list[list[BaseModel]]:
        demoset = dataset.clone()
        demoset = demoset.rebind_atype(new_atype=demoset.subset_atype(include_fields={"question", "numeric"}))
        return chunk_list(demoset.states, chunk_size=num_demos)
    
    @classmethod
    def create_optimization_tasks(cls, demo_list: list[list[BaseModel]])->list["OptimizationTask"]:
        # take the list of Demos from chunk list, make 1 task 
        optimization_tasks = []
        for demos in demo_list:
            optimization_tasks.append(OptimizationTask(demos=demos))
        return optimization_tasks

    @staticmethod
    def remove_duplicates(optimization_history: list["OptimizationTask"]):
        sorted_history = sorted(optimization_history, key=lambda x: x.score, reverse=False)      # ascending
        kept_history = []
        for current_best in reversed(sorted_history):       # keep better score
            for kept_task in kept_history:
                if current_best == kept_task:
                    break
            else:
                kept_history.append(current_best)
        return list(reversed(kept_history))         # return ascending order
        
    def __eq__(self, other):
        return self.role == other.role and \
               self.goal == other.goal and \
               self.expected_output == other.expected_output and \
               self.imperative == other.imperative and \
               self.score == other.score
    
    @staticmethod
    def get_history_string(optimization_history: list["OptimizationTask"]):
        history_str = ""
        for optimized_task in optimization_history:
            history_str += (
                optimized_task.model_dump_json(exclude={"demos"}, indent=2)
                + "\n"
            )
        return history_str

In [7]:
def report(dataset: AG, report_name:str="test", first_n:int=0, dump_report:bool=False):
    dataset = dataset.truncate_states(first_n, len(dataset))
    total = len(dataset)
    dataset.filter(func=lambda state: state.correct)
    correct = len(dataset)
    summary = {
        "report_name": report_name,
        "total": total,
        "fewshots": first_n,
        "correct": correct,
        "ratio": "{:.4f}".format(correct/total),
        "score": int(100*correct/total),
    }
    if dump_report:
        with open(Path(__file__).parent/"output"/"report.jsonl", 'a') as fp:
            fp.write(json.dumps(summary) + "\n")
        print(json.dumps(summary, indent=4))
    return summary

### Setting AG parameters

* For AG instances, we can directly modify its parameters by accessing its field.
* The prompt related parameters are `instructions`, `prompt_template`, and `crew_prompt_params`.
* In the following cell, we make all prompt strings empty as initialization step.

In [8]:
def set_default_params(args, agentic:AG)->AG:
    agentic.batch_size = args.batch_size
    agentic.verbose_agent = args.verbose
    agentic.verbose_transduction = args.verbose
    agentic.skip_intensional_definiton = True
    return agentic


def set_prompt_null(agentic:AG)->AG:
    agentic.instructions = ""
    agentic.prompt_template = ""
    agentic.crew_prompt_params = {
        "role": "",
        "goal": "",
        "backstory": "",
        "expected_output": ""
    }
    return agentic

## Initialize Arguments

The following cell introduces additional arguments for the prompt optimization.

In [9]:
class Args(BaseModel):
    num_opts: int = Field(2, description="Total number of optimizers")
    num_demos: int = Field(3, description="Number of demo tasks or problems to show")
    num_trains: int = Field(6, description="Number of train examples = num_opts * num_demos")
    num_devs: int = Field(20, description="Size of devsets to evaluate proposed prompts")
    test_size: Optional[int] = Field(None, description="Size of test set; None uses all")
    train_size: int = Field(500, description="Size of train set")
    llm_model: str = Field("watsonx/meta-llama/llama-3-3-70b-instruct", description="WatsonX LLM model name")
    verbose: bool = Field(False, description="Enable verbose output")
    batch_size: int = Field(5, description="Batch size for transduction")
    best_k: int = Field(8, description="Maintain best-k prompts during optimization")
    max_iter: int = Field(2, description="Maximum number of optimization iterations")
    prompt_file: str = Field("gsm8k_optimized_prompts.jsonl", description="Path to prompt file")
    best_m: int = Field(5, description="Store best-m prompts")
    max_tokens: int = Field(4000, description="Max output token length; input + output < total allowed tokens")
    exp_name: Optional[str] = Field(None, description="Experiment name")
    early_stop_iter: int = Field(2, description="esacpe optimization loop if score doesn't improve")

In [10]:
args = Args()
args.num_demos = 5
args.num_devs = 10
args.test_size = 10
args.verbose = True
args.batch_size = 2
args.best_k = 4
args.best_m = 2
args.early_stop_iter = 2

In [11]:
loguru.logger.remove()
loguru.logger.add(sys.stdout, format="{time} {level} {message}")
logger_file = "gsm8k_opt.logs" if args.exp_name is None else f"gsm8k_opt_{args.exp_name}.logs"
loguru.logger.add(f"logs/{logger_file}", format="{time} {level} {message}")

2

## Load Train Set to AG[GSM8K]

In [12]:
gsm8k_dir = snapshot_download(repo_id="junkyul/gsm8k", repo_type="dataset")
trainset = AG.from_jsonl(os.path.join(gsm8k_dir, "train.jsonl"), GSM8K, jsonl=True, max_rows=args.train_size)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 15087.42it/s]


In [13]:
set_default_params(args, trainset)
set_prompt_null(trainset)
trainset.prompt_template = USER_PROMPT_TEMPLATE

## Create LLM Clients for Generation and Evaluation

* Here, we assume that we use watsonx served models.
* The `.env` file stores necessary environment variables.
* We use temperature 1.0 for prompt generation and 0.0 for prompt evaluation.

In [14]:
eval_llm = AG.create_crewai_llm(model=args.llm_model, 
                                base_url=os.getenv("WATSONX_URL"),
                                project_id=os.getenv("WATSONX_PROJECTID"),
                                max_tokens=args.max_tokens,
                                temperature=0.0)
gen_llm = AG.create_crewai_llm(model=args.llm_model, 
                                base_url=os.getenv("WATSONX_URL"),
                                project_id=os.getenv("WATSONX_PROJECTID"),
                                max_tokens=args.max_tokens,
                                temperature=1.0)

In [15]:
loguru.logger.info("## start optimization.")
evaluation_time = 0
optimized_tasks = []
best_test_score = 0
current_best_score = 0
no_improvment_count = 0

2025-09-11T10:17:37.495276-0400 INFO ## start optimization.


## Prompt Optimization Loop

The prompt optimization loop has four parts
1. create optimizer AGs by shuffle the training set and create demo problems
2. transduce the prompt candidates
3. execute the generated prompts on dev set
4. grade the candidate prompts on the dev set

In this Notebook, we parallelize the optimization loop with logical transduction algebra
### Product of two AGs
* `opt_eval = optimizer.product(eval)` correspond to AG[OptimizationTask] x AG[GSM8K]
* This product AG maintains two state lists (internally it maintains a flattend list)
* This allows parallelizing the transduction in the next step

### Quotient of AGs
* `evalsets = eval.quotient(opt_eval)` correspond to AG[(OptimizationTask, GSM8K)/OptimizationTask]
* This allows grading the result of each prompt candidate in the next step using asynchronous MAP


In [16]:
t0 = time.time()
for iter_ind in range(args.max_iter):
    iter_t0 = time.time()
    loguru.logger.info(f"################################")
    loguru.logger.info(f"#### iter {iter_ind}")
    loguru.logger.info(f"#### 1. create optimizer AGs")
    # shuffle the train set and truncate num_trains for demonstration
    shuffled_trainset = trainset.get_random_sample(percent=1.0)
    demosets = shuffled_trainset.clone().truncate_states(0, args.num_trains)
    # create a list of OptimizationTask objects with a list of demos
    chunked_demos = OptimizationTask.create_optimization_demos(demosets, num_demos=args.num_demos)
    optimization_tasks = OptimizationTask.create_optimization_tasks(chunked_demos)
    optimizer = AG.from_states(optimization_tasks, atype=OptimizationTask)
    set_default_params(args, optimizer)
    optimizer.llm = gen_llm
    # set the prompts for the optimizer AG
    optimizer.instructions = OPT_META_INSTRUCTION.format(
        optimization_history = OptimizationTask.get_history_string(optimized_tasks)
    )
    optimizer.prompt_template = """{{"demo tasks":{demos}}}"""
    optimizer.crew_prompt_params = {
        "role": "Prompt optimizer.",
        "goal": "Propose diverse prompt templates that achieves high performance for the demo task given as input.",
        "backstory": "Understand the problem domain given the demo task example and propose what answer should be generated.",
        "expected_output": "the outputs are role, goal, and the expected output description, and imperative sentence for solving provided tasks."
    }

    loguru.logger.info(f"#### 2. generate {args.num_opts} prompts at iter {iter_ind}")
    optimizer = asyncio.run(optimizer.self_transduction(["demos"], ["role", "goal", "expected_output", "imperative"]))
        
    loguru.logger.info(f"#### 3. evalaute transduced {args.num_opts} prompts")
    eval = shuffled_trainset.clone().truncate_states(args.num_trains, args.num_trains + args.num_devs)
    # opt_eval AG is a product of the optimizer AG and the eval AG
    # we internally maintain the pair (optimizer states, eval states) as a flattened list
    opt_eval = optimizer.product(eval)
    set_default_params(args, opt_eval)
    opt_eval.llm = eval_llm
    opt_eval.prompt_template = USER_PROMPT_TEMPLATE
    # self-transduction applies to the combinations of the optimizer and eval states
    opt_eval = asyncio.run(opt_eval.self_transduction(["role", "goal", "expected_output", "imperative", "question"], ["response_think", "response_answer"]))

    loguru.logger.info(f"#### 4. grade responses from {args.num_opts} prompts")
    # quotient divide the evaluated opt_eval and returns the evaluated results as AG
    evalsets = eval.quotient(opt_eval)
    optimizer_scores = []
    for ind, evalset in enumerate(evalsets):
        # apply asychronous map to grade the responses
        evalset = asyncio.run(evalset.amap(GSM8K.grade))    
        summary = report(evalset, report_name=f"optimizer {ind+1}")
        optimizer_scores.append(summary["score"])
        setattr(optimizer[ind], "score", summary["score"])

    loguru.logger.info("#### store best_k prompts found so far")
    # keep the result of prompt optimization in a separte list
    # remove duplicated prompts if exists and sort them by the dev set evaluation score.
    optimized_tasks.extend(optimizer.states)
    optimized_tasks = OptimizationTask.remove_duplicates(optimized_tasks)
    optimized_tasks = optimized_tasks[-args.best_k:]
    current_best_score = optimized_tasks[-1].score

    loguru.logger.info(f"[[TIME]]::ITERATION::{iter_ind+1}={time.time()-iter_t0}")
    loguru.logger.info(f"[[DEV SCORE]]::ITERATION::{iter_ind+1}={current_best_score}")

2025-09-11T10:17:37.519527-0400 INFO ################################
2025-09-11T10:17:37.521412-0400 INFO #### iter 0
2025-09-11T10:17:37.522284-0400 INFO #### 1. create optimizer AGs
2025-09-11T10:17:37.544320-0400 INFO #### 2. generate 2 prompts at iter 0
2025-09-11T10:17:37.547792-0400 DEBUG Executing task: Your proposed prompt template will be used in the following way.
* You are "role" -- this role must be suitable for solving the demo task.
* Your personal goal is: "goal" -- the goal achieves the outputs given inputs.
* This is the expected criteria for your final answer "expected_output" -- this constrains the output format.
* You can add a short imperative instruction "imperative" -- this comes after the input of the task.

[[Several demo tasks of input and outputs will be provided when you solve problem.]]

[[The previous optimized prompt templates with scores appear from the worst to the best.]]


* Given the previous optimization results, don't generate duplicate or similar

2025-09-11T10:17:42.234890-0400 DEBUG Processed 2 states in 4.436641693115234 seconds
2025-09-11T10:17:42.236287-0400 DEBUG 2 states processed in 2.218320846557617 seconds average per state ...
2025-09-11T10:17:42.237739-0400 INFO #### 3. evalaute transduced 2 prompts
2025-09-11T10:17:42.258780-0400 DEBUG Executing task: Generate an object of the specified type from the following input.
20 states will be transduced
2025-09-11T10:17:42.267068-0400 DEBUG transducer class: <class 'agentics.abstractions.pydantic_transducer.PydanticTransducerCrewAI'>


2025-09-11T10:17:46.935617-0400 DEBUG Processed 2 states in 4.66737961769104 seconds
2025-09-11T10:17:46.936720-0400 DEBUG 2 states processed in 2.33368980884552 seconds average per state ...


2025-09-11T10:17:53.156450-0400 DEBUG Processed 2 states in 6.218783378601074 seconds
2025-09-11T10:17:53.157816-0400 DEBUG 4 states processed in 3.109391689300537 seconds average per state ...


2025-09-11T10:17:58.398871-0400 DEBUG Processed 2 states in 5.239844560623169 seconds
2025-09-11T10:17:58.400682-0400 DEBUG 6 states processed in 2.6199222803115845 seconds average per state ...


2025-09-11T10:18:03.632960-0400 DEBUG Processed 2 states in 5.230771780014038 seconds
2025-09-11T10:18:03.634426-0400 DEBUG 8 states processed in 2.615385890007019 seconds average per state ...


2025-09-11T10:18:08.070458-0400 DEBUG Processed 2 states in 4.434732913970947 seconds
2025-09-11T10:18:08.071667-0400 DEBUG 10 states processed in 2.2173664569854736 seconds average per state ...


2025-09-11T10:18:15.426374-0400 DEBUG Processed 2 states in 7.353562593460083 seconds
2025-09-11T10:18:15.428146-0400 DEBUG 12 states processed in 3.6767812967300415 seconds average per state ...


2025-09-11T10:18:23.214465-0400 DEBUG Processed 2 states in 7.785249710083008 seconds
2025-09-11T10:18:23.216082-0400 DEBUG 14 states processed in 3.892624855041504 seconds average per state ...


2025-09-11T10:18:33.132991-0400 DEBUG Processed 2 states in 9.916010618209839 seconds
2025-09-11T10:18:33.134441-0400 DEBUG 16 states processed in 4.958005309104919 seconds average per state ...


2025-09-11T10:18:43.093132-0400 DEBUG Processed 2 states in 9.9574875831604 seconds
2025-09-11T10:18:43.094666-0400 DEBUG 18 states processed in 4.9787437915802 seconds average per state ...


2025-09-11T10:18:50.555741-0400 DEBUG Processed 2 states in 7.459740161895752 seconds
2025-09-11T10:18:50.557014-0400 DEBUG 20 states processed in 3.729870080947876 seconds average per state ...
2025-09-11T10:18:50.559022-0400 INFO #### 4. grade responses from 2 prompts
2025-09-11T10:18:50.561059-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f30fe5b39c0>
2025-09-11T10:18:50.563871-0400 DEBUG 2 states processed. 0.0008177757263183594 seconds average per state in the last chunk ...
2025-09-11T10:18:50.565152-0400 DEBUG 4 states processed. 0.00013899803161621094 seconds average per state in the last chunk ...
2025-09-11T10:18:50.566415-0400 DEBUG 6 states processed. 0.00011348724365234375 seconds average per state in the last chunk ...
2025-09-11T10:18:50.567657-0400 DEBUG 8 states processed. 0.00010156631469726562 seconds average per state in the last chunk ...
2025-09-11T10:18:50.568823-0400 DEBUG 10 states processed. 9.989738464355469e-05 seconds average per state i

2025-09-11T10:18:53.382095-0400 DEBUG Processed 2 states in 2.766263484954834 seconds
2025-09-11T10:18:53.383393-0400 DEBUG 2 states processed in 1.383131742477417 seconds average per state ...
2025-09-11T10:18:53.384892-0400 INFO #### 3. evalaute transduced 2 prompts
2025-09-11T10:18:53.411898-0400 DEBUG Executing task: Generate an object of the specified type from the following input.
20 states will be transduced
2025-09-11T10:18:53.419908-0400 DEBUG transducer class: <class 'agentics.abstractions.pydantic_transducer.PydanticTransducerCrewAI'>


2025-09-11T10:18:57.990443-0400 DEBUG Processed 2 states in 4.569369077682495 seconds
2025-09-11T10:18:57.991836-0400 DEBUG 2 states processed in 2.2846845388412476 seconds average per state ...


2025-09-11T10:19:04.740154-0400 DEBUG Processed 2 states in 6.746839284896851 seconds
2025-09-11T10:19:04.741739-0400 DEBUG 4 states processed in 3.3734196424484253 seconds average per state ...


2025-09-11T10:19:11.573791-0400 DEBUG Processed 2 states in 6.830214500427246 seconds
2025-09-11T10:19:11.575010-0400 DEBUG 6 states processed in 3.415107250213623 seconds average per state ...


2025-09-11T10:19:16.231772-0400 DEBUG Processed 2 states in 4.655738830566406 seconds
2025-09-11T10:19:16.232974-0400 DEBUG 8 states processed in 2.327869415283203 seconds average per state ...


2025-09-11T10:19:22.320448-0400 DEBUG Processed 2 states in 6.086440563201904 seconds
2025-09-11T10:19:22.322066-0400 DEBUG 10 states processed in 3.043220281600952 seconds average per state ...


2025-09-11T10:19:31.776072-0400 DEBUG Processed 2 states in 9.452473402023315 seconds
2025-09-11T10:19:31.777235-0400 DEBUG 12 states processed in 4.726236701011658 seconds average per state ...


2025-09-11T10:19:38.803539-0400 DEBUG Processed 2 states in 7.024869203567505 seconds
2025-09-11T10:19:38.804952-0400 DEBUG 14 states processed in 3.5124346017837524 seconds average per state ...


2025-09-11T10:19:47.122127-0400 DEBUG Processed 2 states in 8.315857410430908 seconds
2025-09-11T10:19:47.123459-0400 DEBUG 16 states processed in 4.157928705215454 seconds average per state ...


2025-09-11T10:19:51.917651-0400 DEBUG Processed 2 states in 4.792920827865601 seconds
2025-09-11T10:19:51.919177-0400 DEBUG 18 states processed in 2.3964604139328003 seconds average per state ...


2025-09-11T10:19:57.062525-0400 DEBUG Processed 2 states in 5.14180064201355 seconds
2025-09-11T10:19:57.063745-0400 DEBUG 20 states processed in 2.570900321006775 seconds average per state ...
2025-09-11T10:19:57.065670-0400 INFO #### 4. grade responses from 2 prompts
2025-09-11T10:19:57.068274-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f30fe5b39c0>
2025-09-11T10:19:57.069785-0400 DEBUG 2 states processed. 0.0002835988998413086 seconds average per state in the last chunk ...
2025-09-11T10:19:57.071051-0400 DEBUG 4 states processed. 0.00011348724365234375 seconds average per state in the last chunk ...
2025-09-11T10:19:57.072576-0400 DEBUG 6 states processed. 0.00011587142944335938 seconds average per state in the last chunk ...
2025-09-11T10:19:57.073748-0400 DEBUG 8 states processed. 0.00011324882507324219 seconds average per state in the last chunk ...
2025-09-11T10:19:57.074704-0400 DEBUG 10 states processed. 0.00010216236114501953 seconds average per state i

## Display Optimized Prompts

In [17]:
optimized_tasks = OptimizationTask.remove_duplicates(optimized_tasks)
loguru.logger.info(f"## optimization results final {args.best_m} selections")
for ind, optimized_task in enumerate(optimized_tasks[-args.best_m:]):
    res = optimized_task.model_dump_json(exclude={"demos"}, indent=4)
    loguru.logger.info(f"## {ind+1}-th:\n{res}")
best_optimizers = AG.from_states(optimized_tasks[-args.best_m:], atype=OptimizationTask)

2025-09-11T10:19:57.095652-0400 INFO ## optimization results final 2 selections
2025-09-11T10:19:57.097702-0400 INFO ## 1-th:
{
    "role": "Math Problem Specialist",
    "goal": "To accurately calculate the numerical answer to the given mathematical problem",
    "expected_output": "A precise numerical value that solves the problem",
    "imperative": "Read the problem statement carefully, identify the key mathematical elements, and apply the relevant mathematical operations to arrive at the correct numerical answer",
    "score": 60
}
2025-09-11T10:19:57.098881-0400 INFO ## 2-th:
{
    "role": "Data Analyst",
    "goal": "To calculate the average value of a given set of numbers",
    "expected_output": "a numeric value representing the average",
    "imperative": "Add up all the numbers in the set and divide by the total count of numbers to find the average",
    "score": 70
}


## Evaluate Optimized Propmt In The Test Set

In [18]:
testset = AG.from_jsonl(os.path.join(gsm8k_dir, "test.jsonl"), GSM8K, jsonl=True, max_rows=args.test_size)
final_eval = best_optimizers.product(testset)
set_default_params(args, final_eval)
set_prompt_null(final_eval)

final_eval.llm = eval_llm
final_eval.llm.temperature = 0.0                
final_eval = asyncio.run(final_eval.self_transduction(["role", "goal", "expected_output", "imperative", "question"], ["response_think", "response_answer"]))

evalsets = testset.quotient(final_eval)
optimizer_scores = []
for ind, evalset in enumerate(evalsets):
    evalset = asyncio.run(evalset.amap(GSM8K.grade))    
    summary = report(evalset, report_name=f"optimizer-test {ind+1}")
    optimizer_scores.append(summary["score"])
    setattr(best_optimizers[ind], "score", summary["score"]) 

2025-09-11T10:19:57.136849-0400 DEBUG Executing task: 
20 states will be transduced
2025-09-11T10:19:57.138924-0400 DEBUG transducer class: <class 'agentics.abstractions.pydantic_transducer.PydanticTransducerCrewAI'>


2025-09-11T10:20:03.487740-0400 DEBUG Processed 2 states in 6.347774505615234 seconds
2025-09-11T10:20:03.488965-0400 DEBUG 2 states processed in 3.173887252807617 seconds average per state ...


2025-09-11T10:20:12.707005-0400 DEBUG Processed 2 states in 9.21710753440857 seconds
2025-09-11T10:20:12.708619-0400 DEBUG 4 states processed in 4.608553767204285 seconds average per state ...


2025-09-11T10:20:17.874345-0400 DEBUG Processed 2 states in 5.163955211639404 seconds
2025-09-11T10:20:17.875650-0400 DEBUG 6 states processed in 2.581977605819702 seconds average per state ...


2025-09-11T10:20:23.797195-0400 DEBUG Processed 2 states in 5.920292615890503 seconds
2025-09-11T10:20:23.798490-0400 DEBUG 8 states processed in 2.9601463079452515 seconds average per state ...


2025-09-11T10:20:32.750452-0400 DEBUG Processed 2 states in 8.95060682296753 seconds
2025-09-11T10:20:32.751776-0400 DEBUG 10 states processed in 4.475303411483765 seconds average per state ...


2025-09-11T10:20:37.858204-0400 DEBUG Processed 2 states in 5.105194807052612 seconds
2025-09-11T10:20:37.859521-0400 DEBUG 12 states processed in 2.552597403526306 seconds average per state ...


2025-09-11T10:20:43.291556-0400 DEBUG Processed 2 states in 5.4307026863098145 seconds
2025-09-11T10:20:43.292629-0400 DEBUG 14 states processed in 2.7153513431549072 seconds average per state ...


2025-09-11T10:20:51.427680-0400 DEBUG Processed 2 states in 8.13405466079712 seconds
2025-09-11T10:20:51.429046-0400 DEBUG 16 states processed in 4.06702733039856 seconds average per state ...


2025-09-11T10:20:57.498651-0400 DEBUG Processed 2 states in 6.068507194519043 seconds
2025-09-11T10:20:57.499698-0400 DEBUG 18 states processed in 3.0342535972595215 seconds average per state ...


2025-09-11T10:21:04.024955-0400 DEBUG Processed 2 states in 6.524186134338379 seconds
2025-09-11T10:21:04.026144-0400 DEBUG 20 states processed in 3.2620930671691895 seconds average per state ...
2025-09-11T10:21:04.029499-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f30fe5b39c0>
2025-09-11T10:21:04.031580-0400 DEBUG 10 states processed. 2.6237964630126952e-05 seconds average per state in the last chunk ...
2025-09-11T10:21:04.032790-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f30fe5b39c0>
2025-09-11T10:21:04.034504-0400 DEBUG 10 states processed. 2.8192996978759766e-05 seconds average per state in the last chunk ...


In [19]:
loguru.logger.info(f"################################")
loguru.logger.info(f"# best test score:{optimizer_scores}")
loguru.logger.info(f"################################")   

2025-09-11T10:21:04.044695-0400 INFO ################################
2025-09-11T10:21:04.046355-0400 INFO # best test score:[80, 70]
2025-09-11T10:21:04.047970-0400 INFO ################################
