In [1]:
%load_ext autoreload
%autoreload 2

from env.constants import ANTHROPIC_API_KEY

## Preparing async runner

In [2]:
from run import run_research_agent_episode
import pandas as pd
from env.logger import get_logger
import time
import asyncio

# Notebook-wide logger; used by the batch runner below and by the analysis cells.
logger = get_logger(__name__)


async def run_single_in_thread(api_key, prompt, sem):
    """Run one research-agent episode in a worker thread while holding ``sem``.

    The semaphore is acquired before the episode starts and released once it
    finishes (or raises), so it bounds how many episodes run at the same time.

    Args:
        api_key: Passed as the first positional argument to
            ``run_research_agent_episode``.
        prompt: Passed as the second positional argument to
            ``run_research_agent_episode``.
        sem: Semaphore shared by the episodes of one batch.

    Returns:
        Whatever ``run_research_agent_episode(api_key, prompt)`` returns.
    """
    await sem.acquire()
    try:
        # The episode function is a plain (non-async) callable, so it is handed
        # to a separate thread instead of being run on the event loop.
        episode_result = await asyncio.to_thread(run_research_agent_episode, api_key, prompt)
    finally:
        sem.release()
    return episode_result


async def run_batch_async(api_key: str, prompt: str, num_runs: int = 10, max_concurrent: int = 10):
    """Run ``num_runs`` episodes concurrently, at most ``max_concurrent`` at a time.

    Each episode is started through ``run_single_in_thread`` and all of them
    share one semaphore that enforces the concurrency limit.

    Args:
        api_key: API key forwarded to every episode.
        prompt: System prompt forwarded to every episode.
        num_runs: Number of episodes to launch.
        max_concurrent: Upper bound on simultaneously running episodes; must be >= 1.

    Returns:
        A list with one entry per episode, in launch order. Because the
        episodes are gathered with ``return_exceptions=True``, an episode that
        raised is represented by its exception instance rather than a result,
        so callers must filter those entries before treating them as results.

    Raises:
        ValueError: If ``max_concurrent`` is smaller than 1.

    NOTE: ``run_single_in_thread`` uses ``asyncio.to_thread``, which runs on the
    event loop's default thread pool; that pool's size may cap the effective
    concurrency below ``max_concurrent``.
    """
    if max_concurrent < 1:
        # A semaphore with value 0 never admits any task, so the batch would hang forever.
        raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

    logger.info(f"Running {num_runs} episodes. Max concurrent: {max_concurrent}")
    # perf_counter is monotonic, so the measured duration is immune to wall-clock adjustments.
    start_time = time.perf_counter()

    sem = asyncio.Semaphore(max_concurrent)
    tasks = [run_single_in_thread(api_key, prompt, sem) for _ in range(num_runs)]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    total_time = time.perf_counter() - start_time
    logger.info(f"Total time: {total_time:.1f}s")

    # Surface captured exceptions so failed episodes are not silently mixed into the results.
    failures = [r for r in results if isinstance(r, BaseException)]
    if failures:
        logger.warning(
            f"{len(failures)} of {num_runs} episodes raised an exception; first: {failures[0]!r}"
        )
    return results

## Start testing with small model (haiku-4-5)

Running with `env.constants.ANTHROPIC_MODEL = "claude-haiku-4-5"`

In [3]:
import env
from env.prompts import (
    system_instruction_default,
    system_instruction_with_hints,
    system_instruction_with_hints_and_guidance,
)

# Bare last expression: shows the model configured for this session as the cell output.
env.constants.ANTHROPIC_MODEL

'claude-sonnet-4-5'

In [None]:
results = await run_batch_async(api_key=ANTHROPIC_API_KEY, prompt=system_instruction_default)

In [5]:
# run_batch_async gathers with return_exceptions=True, so an episode that raised is
# stored in `results` as an exception object; json_normalize cannot process those.
completed = [r for r in results if not isinstance(r, BaseException)]
df = pd.json_normalize(completed)
# Crashed episodes stay in the denominator: they count as unsuccessful runs.
n_success = int((df['verdict'] == 'SUCCESS').sum()) if 'verdict' in df.columns else 0
success_rate = n_success * 100 / len(results) if len(results) else float('nan')
logger.info(f"Success rate for {env.constants.ANTHROPIC_MODEL}: {success_rate}%")

[INFO]: Success rate for claude-haiku-4-5: 0.0%


In [6]:
# Log the total reward of every episode in the batch.
for episode in results:
    if isinstance(episode, BaseException):
        # Entries produced by gather(return_exceptions=True) for episodes that raised
        # are not subscriptable; report them instead of crashing the loop.
        logger.warning(f"Episode raised instead of returning a result: {episode!r}")
        continue
    logger.info(episode['reward_breakdown']['total_episode_reward'])

[INFO]: 0.018
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.0277
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.0086
[INFO]: 0.0166


This environment seems to be too hard for a small model. Let's first help the model with hints in the prompt to achieve a better success rate.

I will use the `system_instruction_with_hints` prompt, which includes a few hints, a more detailed description of the task, and a hint on how to solve it.

In [None]:
results = await run_batch_async(api_key=ANTHROPIC_API_KEY, prompt=system_instruction_with_hints)

In [8]:
# run_batch_async gathers with return_exceptions=True, so an episode that raised is
# stored in `results` as an exception object; json_normalize cannot process those.
completed = [r for r in results if not isinstance(r, BaseException)]
df = pd.json_normalize(completed)
# Crashed episodes stay in the denominator: they count as unsuccessful runs.
n_success = int((df['verdict'] == 'SUCCESS').sum()) if 'verdict' in df.columns else 0
success_rate = n_success * 100 / len(results) if len(results) else float('nan')
logger.info(success_rate)

[INFO]: 0.0


In [9]:
# Log the total reward of every episode in the batch.
for episode in results:
    if isinstance(episode, BaseException):
        # Entries produced by gather(return_exceptions=True) for episodes that raised
        # are not subscriptable; report them instead of crashing the loop.
        logger.warning(f"Episode raised instead of returning a result: {episode!r}")
        continue
    logger.info(episode['reward_breakdown']['total_episode_reward'])

[INFO]: 0.4305
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.1847
[INFO]: 0.0
[INFO]: 0.8497
[INFO]: 0.0729
[INFO]: 0.3158
[INFO]: 0.0


In [21]:
# Log the `total_behaviour_reward` column (one value per episode) of the normalised frame.
logger.info(
    df['reward_breakdown.breakdown.total_behaviour_reward']
)

[INFO]: 0     True
1     True
2    False
3     True
4     True
5     True
6     True
7     True
8     True
9    False
Name: reward_breakdown.breakdown.total_behaviour_reward, dtype: bool


Even though all the episodes formally failed (since their rewards are below the 85% threshold), the model performed much better this time than in the first run. If we lowered the reward threshold to 80%, we would see a 10% success rate, which is good enough to start training the agent with PPO or other RL frameworks.

However, in some cases it even failed the behavioural tests, such as budget exhaustion and hallucinations (it didn't call the oracle or didn't use the Python tooling).

Now let's run the experiment with an even easier prompt that includes all the guidance techniques and pushes the model towards checking the budget every time and paying attention to the final budget constraint $C = 6 \cdot N \cdot D$.

In [None]:
results = await run_batch_async(api_key=ANTHROPIC_API_KEY, prompt=system_instruction_with_hints_and_guidance)

In [9]:
# run_batch_async gathers with return_exceptions=True, so an episode that raised is
# stored in `results` as an exception object; json_normalize cannot process those.
completed = [r for r in results if not isinstance(r, BaseException)]
df = pd.json_normalize(completed)
# Crashed episodes stay in the denominator: they count as unsuccessful runs.
n_success = int((df['verdict'] == 'SUCCESS').sum()) if 'verdict' in df.columns else 0
success_rate = n_success * 100 / len(results) if len(results) else float('nan')
logger.info(success_rate)

[INFO]: 0.0


In [10]:
# Repeat the success rate computed in the previous cell, then list per-episode rewards.
logger.info(success_rate)
for episode in results:
    if isinstance(episode, BaseException):
        # Entries produced by gather(return_exceptions=True) for episodes that raised
        # are not subscriptable; report them instead of crashing the loop.
        logger.warning(f"Episode raised instead of returning a result: {episode!r}")
        continue
    logger.info(episode['reward_breakdown']['total_episode_reward'])

[INFO]: 0.0
[INFO]: 0.4379
[INFO]: 0.8195
[INFO]: 0.0
[INFO]: 0.0204
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.0
[INFO]: 0.1172
[INFO]: 0.1837
[INFO]: 0.2367


## Running bigger model (sonnet-4-5)

Running with ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929"

In [3]:
import env
# Import every prompt variant used by the cells in this section, so they still run when
# the earlier import cell was not executed in the current kernel session (e.g. after a
# restart to switch models).
from env.prompts import (
    system_instruction_default,
    system_instruction_with_hints,
    system_instruction_with_hints_and_guidance,
)

# Bare last expression: shows the model configured for this session as the cell output.
env.constants.ANTHROPIC_MODEL

'claude-sonnet-4-5-20250929'

Now let's try to check how bigger models will react to this environment.

In [None]:
results = await run_batch_async(api_key=ANTHROPIC_API_KEY, prompt=system_instruction_default)

In [5]:
# run_batch_async gathers with return_exceptions=True, so an episode that raised is
# stored in `results` as an exception object; json_normalize cannot process those.
completed = [r for r in results if not isinstance(r, BaseException)]
df = pd.json_normalize(completed)
# Crashed episodes stay in the denominator: they count as unsuccessful runs.
n_success = int((df['verdict'] == 'SUCCESS').sum()) if 'verdict' in df.columns else 0
success_rate = n_success * 100 / len(results) if len(results) else float('nan')
logger.info(success_rate)

[INFO]: 0.0


In [6]:
# Repeat the success rate computed in the previous cell, then list per-episode rewards.
logger.info(success_rate)
for episode in results:
    if isinstance(episode, BaseException):
        # Entries produced by gather(return_exceptions=True) for episodes that raised
        # are not subscriptable; report them instead of crashing the loop.
        logger.warning(f"Episode raised instead of returning a result: {episode!r}")
        continue
    logger.info(episode['reward_breakdown']['total_episode_reward'])

[INFO]: 0.0
[INFO]: 0.0993
[INFO]: 0.0088
[INFO]: 0.4446
[INFO]: 0.026
[INFO]: 0.0949
[INFO]: 0.0
[INFO]: 0.5984
[INFO]: 0.0806
[INFO]: 0.5781
[INFO]: 0.7332


In [7]:
# Log the `total_behaviour_reward` column (one value per episode) of the normalised frame.
logger.info(
    df['reward_breakdown.breakdown.total_behaviour_reward']
)

[INFO]: 0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
8    True
9    True
Name: reward_breakdown.breakdown.total_behaviour_reward, dtype: bool


In [None]:
results = await run_batch_async(api_key=ANTHROPIC_API_KEY, prompt=system_instruction_with_hints)

In [6]:
# run_batch_async gathers with return_exceptions=True, so an episode that raised is
# stored in `results` as an exception object; json_normalize cannot process those.
completed = [r for r in results if not isinstance(r, BaseException)]
df = pd.json_normalize(completed)
# Crashed episodes stay in the denominator: they count as unsuccessful runs.
n_success = int((df['verdict'] == 'SUCCESS').sum()) if 'verdict' in df.columns else 0
success_rate = n_success * 100 / len(results) if len(results) else float('nan')
logger.info(success_rate)

[INFO]: 50.0


In [9]:
# Log the total reward of every episode in the batch.
for episode in results:
    if isinstance(episode, BaseException):
        # Entries produced by gather(return_exceptions=True) for episodes that raised
        # are not subscriptable; report them instead of crashing the loop.
        logger.warning(f"Episode raised instead of returning a result: {episode!r}")
        continue
    logger.info(episode['reward_breakdown']['total_episode_reward'])

[INFO]: 0.5917
[INFO]: 0.903
[INFO]: 0.1143
[INFO]: 0.9705
[INFO]: 0.445
[INFO]: 0.9814
[INFO]: 0.5726
[INFO]: 0.6836
[INFO]: 0.9793
[INFO]: 1.0


Now the prompt seems optimal for the bigger model: we see good reward figures and a success rate of 50%, which is a good starting point for PPO training.

In [None]:
results = await run_batch_async(api_key=ANTHROPIC_API_KEY, prompt=system_instruction_with_hints_and_guidance)

In [5]:
# run_batch_async gathers with return_exceptions=True, so an episode that raised is
# stored in `results` as an exception object; json_normalize cannot process those.
completed = [r for r in results if not isinstance(r, BaseException)]
df = pd.json_normalize(completed)
# Crashed episodes stay in the denominator: they count as unsuccessful runs.
n_success = int((df['verdict'] == 'SUCCESS').sum()) if 'verdict' in df.columns else 0
success_rate = n_success * 100 / len(results) if len(results) else float('nan')
logger.info(success_rate)

[INFO]: 70.0


In [6]:
# Log the total reward of every episode in the batch.
for episode in results:
    if isinstance(episode, BaseException):
        # Entries produced by gather(return_exceptions=True) for episodes that raised
        # are not subscriptable; report them instead of crashing the loop.
        logger.warning(f"Episode raised instead of returning a result: {episode!r}")
        continue
    logger.info(episode['reward_breakdown']['total_episode_reward'])

[INFO]: 0.5461
[INFO]: 0.9086
[INFO]: 0.994
[INFO]: 0.9603
[INFO]: 1.0
[INFO]: 0.9889
[INFO]: 0.75
[INFO]: 0.984
[INFO]: 0.7246
[INFO]: 1.0


Finally, we see that the agent achieved a 70% success rate on the task once we gave it almost the full solution and step-by-step guidance in the prompt. Even so, the model is not perfect, so it is possible to fine-tune it further.

At the same time, the prompt is already close to a fully guided instruction, and further prompt enhancement would only push the model towards the desired behaviour or leak the solution into the prompt. It's almost impossible to optimize the prompt further without adding more hints that would limit the model's freedom.