# Experiment Runner


- **Experiment 1 (Baseline)**: Zephyr 7B — Commander-only vs Commander+subagents
- **Experiment 2 (Model Comparison)**: Zephyr 7B, Mistral 7B v0.3, Llama 3.1 8B — 2 configurations each

**Prompting technique**: Step-Back Prompting + Chain of Thought

**Model settings**: temperature=0.2, top_k=30, top_p=0.95, max_tokens=800

## 0. Imports & Setup

In [1]:
import sys
import yaml
import time
import json
from pathlib import Path
from datetime import datetime

# Make the parent of the current working directory importable, so the
# project-local imports that follow can be resolved from it.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    # Prepend rather than append so this location is searched first.
    sys.path.insert(0, root_entry)

from core.Agent import Agent
from core.Logbook import Logbook
from core.Tools import ask_human, agent_as_tool, kb_search

from experiments.experiment_config import (
    OBJECTIVE, EXPERIMENT_SETTINGS, MODELS, SUBAGENT_CONFIGS,
    EXPERIMENT_1_RUNS, EXPERIMENT_2_RUNS, ALL_RUNS,
)
from experiments.experiment_results import ExperimentResult, save_result

# Quick sanity report: where imports resolve from and what is about to run.
model_keys = list(MODELS.keys())
planned_run_count = len(ALL_RUNS)
print(f"Project root: {project_root}")
print(f"Models: {model_keys}")
print(f"Total runs planned: {planned_run_count}")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ImportError: DLL load failed while importing _multiarray_umath: Kan opgegeven module niet vinden.

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
numpy._core.multiarray failed to import

## 1. Load Prompt Template

In [None]:
# Read the prompt template (path is relative to the current working directory)
# and substitute the experiment objective into its {objective} placeholder.
prompt_path = Path("prompts/stepback_cot_prompt.txt")
template = prompt_path.read_text(encoding="utf-8")
prompt = template.format(objective=OBJECTIVE)

# Echo the fully rendered prompt between two rules for visual inspection.
banner = "=" * 70
print(banner)
print("FINAL PROMPT (sent to Commander)")
print(banner)
print(prompt)

## 2. Define `run_experiment()` Function

In [None]:
def _load_agent_config(yaml_path, model_name: str) -> dict:
    """Load an agent YAML file and pin it to the experiment's model and settings."""
    config = yaml.safe_load(Path(yaml_path).read_text(encoding="utf-8"))
    config["model"] = model_name
    # Give every agent its own copy so that nothing downstream can mutate the
    # module-level EXPERIMENT_SETTINGS and silently affect later runs.
    config["model_settings"] = dict(EXPERIMENT_SETTINGS)
    return config


def run_experiment(run_config: dict, prompt: str) -> ExperimentResult:
    """
    Execute a single experiment run and return its structured result.

    Steps:
      1. Create a fresh Logbook shared by all agents of this run.
      2. Load the Commander YAML and override its model and model_settings.
      3. If ``run_config["use_subagents"]`` is truthy, build one subagent per
         entry in SUBAGENT_CONFIGS (same model/settings override, plus a
         ``kb_search`` tool with k=3) and expose each to the Commander as a tool.
      4. Call ``commander.query(prompt, max_steps=8)`` and time the call.
      5. Persist the result via ``save_result(result, logbook)``.

    Args:
        run_config: Run description. The keys read here are ``model_key``,
            ``commander_yaml``, ``use_subagents``, ``experiment_id``,
            ``phase`` and ``agent_config``.
        prompt: The fully rendered prompt passed to the Commander.

    Returns:
        The ExperimentResult for this run, including the serialized logbook
        trace and the elapsed time in seconds rounded to two decimals.

    Raises:
        KeyError: If ``run_config`` (or a SUBAGENT_CONFIGS entry) lacks one of
            the keys accessed here, or ``model_key`` is not in MODELS.
    """
    model_name = MODELS[run_config["model_key"]]

    # One logbook per run, so traces of different runs never mix.
    logbook = Logbook()

    cmd_config = _load_agent_config(run_config["commander_yaml"], model_name)

    # The Commander can always ask the human; subagents are added on demand.
    tools = [ask_human]

    if run_config["use_subagents"]:
        for sub_cfg in SUBAGENT_CONFIGS:
            sub_config = _load_agent_config(sub_cfg["yaml_path"], model_name)
            search_tool = kb_search(sub_cfg["kb_path"], k=3)
            sub_agent = Agent(
                config=sub_config,
                logbook=logbook,
                tools=[ask_human, search_tool],
            )
            tools.append(agent_as_tool(sub_agent, sub_cfg["tool_name"]))

    commander = Agent(config=cmd_config, logbook=logbook, tools=tools)

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments while the query is running.
    start = time.perf_counter()
    output = commander.query(prompt, max_steps=8)
    duration = time.perf_counter() - start

    # Flatten each logbook step into a plain dict.
    trace = [
        {
            "agent": s.agent,
            "prompt": s.prompt,
            "thought": s.thought,
            "action": s.action,
            "line": s.line,
            "parse_error": s.parse_error,
            "raw": s.raw,
        }
        for s in logbook.trace
    ]

    result = ExperimentResult(
        experiment_id=run_config["experiment_id"],
        phase=run_config["phase"],
        model_name=model_name,
        agent_config=run_config["agent_config"],
        prompt=prompt,
        # Snapshot of the settings, independent of the shared module-level dict.
        model_settings=dict(EXPERIMENT_SETTINGS),
        output=output,
        trace=trace,
        timestamp=datetime.now().isoformat(timespec="seconds"),
        duration_seconds=round(duration, 2),
    )

    save_result(result, logbook)
    return result

---
## 3. Experiment 1 — Baseline (Zephyr 7B)

| ID | Configuration |
|---|---|
| EXP1-001 | Commander-only |
| EXP1-002 | Commander + subagents |

In [None]:
# Run every Experiment 1 configuration in order and collect the results.
exp1_results = []
divider = "=" * 70

for run_cfg in EXPERIMENT_1_RUNS:
    # Header identifying the run: id | model | agent configuration.
    print(f"\n{divider}")
    run_id = run_cfg["experiment_id"]
    model_label = MODELS[run_cfg["model_key"]]
    config_label = run_cfg["agent_config"]
    print(f"  {run_id} | {model_label} | {config_label}")
    print(f"{divider}\n")

    result = run_experiment(run_cfg, prompt)
    exp1_results.append(result)

    # Short footer: elapsed time and the first 120 characters of the output.
    output_preview = result.output[:120]
    print(f"\n  Completed in {result.duration_seconds}s")
    print(f"  Output: {output_preview}...")

### Experiment 1 — Results Summary

In [None]:
# Fixed-width overview of the Experiment 1 runs: one row per result.
header_row = f"{'ID':<12} {'Config':<25} {'Duration (s)':<15} {'Steps':<8} {'Output (first 80 chars)'}"
print(header_row)
print("-" * 120)
for run in exp1_results:
    step_count = len(run.trace)
    # Collapse newlines so each run stays on a single table row.
    preview = run.output[:80].replace("\n", " ")
    print(f"{run.experiment_id:<12} {run.agent_config:<25} {run.duration_seconds:<15} {step_count:<8} {preview}")

---
## 4. Experiment 2 — Model Comparison

| ID | Model | Configuration |
|---|---|---|
| EXP2-001 | Zephyr 7B | Commander-only |
| EXP2-002 | Zephyr 7B | Commander + subagents |
| EXP2-003 | Mistral 7B v0.3 | Commander-only |
| EXP2-004 | Mistral 7B v0.3 | Commander + subagents |
| EXP2-005 | Llama 3.1 8B | Commander-only |
| EXP2-006 | Llama 3.1 8B | Commander + subagents |

In [None]:
# Run every Experiment 2 configuration in order and collect the results.
exp2_results = []
divider = "=" * 70

for run_cfg in EXPERIMENT_2_RUNS:
    # Header identifying the run: id | model | agent configuration.
    print(f"\n{divider}")
    run_id = run_cfg["experiment_id"]
    model_label = MODELS[run_cfg["model_key"]]
    config_label = run_cfg["agent_config"]
    print(f"  {run_id} | {model_label} | {config_label}")
    print(f"{divider}\n")

    result = run_experiment(run_cfg, prompt)
    exp2_results.append(result)

    # Short footer: elapsed time and the first 120 characters of the output.
    output_preview = result.output[:120]
    print(f"\n  Completed in {result.duration_seconds}s")
    print(f"  Output: {output_preview}...")

### Experiment 2 — Results Summary

In [None]:
# Fixed-width overview of the Experiment 2 runs: one row per result.
header_row = f"{'ID':<12} {'Model':<45} {'Config':<25} {'Duration (s)':<15} {'Steps':<8}"
print(header_row)
print("-" * 120)
for run in exp2_results:
    step_count = len(run.trace)
    # Keep only the part after the last "/" of the model name for display.
    model_short = run.model_name.split("/")[-1]
    print(f"{run.experiment_id:<12} {model_short:<45} {run.agent_config:<25} {run.duration_seconds:<15} {step_count:<8}")

---
## 5. Evaluation

Score each run on the first three criteria below (1-5 scale); record **Differences** as a free-text note:

| Criterium | Beschrijving |
|---|---|
| **Output Quality** | Is de beslissing helder, goed onderbouwd en uitvoerbaar? |
| **Consistency** | Zijn de stappen logisch consistent en vrij van tegenspraken? |
| **Reasoning Depth** | Hoe diep en breed analyseert het model de scenario-dimensies? |
| **Differences** | Welke opvallende verschillen zijn er t.o.v. andere runs? |

In [None]:
# Manual evaluation step: after reviewing the traces, attach observations and
# scores to each result by copying and adjusting the template below.

# Combined list of all runs: Experiment 1 first, then Experiment 2.
all_results = [*exp1_results, *exp2_results]

# Template (uncomment and adapt per run):
# all_results[0].observations = "Commander reasoned directly without experts and missed legal nuance."
# all_results[0].evaluation = {
#     "output_quality": 3,
#     "consistency": 4,
#     "reasoning_depth": 2,
#     "differences": "Less detailed than the commander+subagents variant.",
# }

print("Evaluation cells ready. Update scores above after reviewing traces.")

---
## 6. Export Summary

In [None]:
from dataclasses import asdict

# Bundle the shared settings, the model table and every run into a single
# JSON-serializable document.
summary = {
    "experiment_settings": EXPERIMENT_SETTINGS,
    "models": MODELS,
    "runs": [asdict(r) for r in all_results],
}

# Path is relative to the current working directory.
summary_path = Path("results/summary.json")
# write_text does not create missing directories, so make sure the target
# folder exists before exporting.
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_path.write_text(
    # default=str turns any value json cannot encode natively into its str() form.
    json.dumps(summary, indent=2, ensure_ascii=False, default=str),
    encoding="utf-8",
)

print(f"Summary exported to {summary_path.resolve()}")
print(f"Total runs: {len(all_results)}")