Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,12 @@ resolved relative to that fixture file and may not escape that root:
cargo run --release -- --eval ./evals/local/external-readme-check.json --json
```

The interactive agent eval command accepts the same fixture path:

```text
/eval agent ./evals/local/external-readme-check.json
```

## A Good First Session

Here is a simple sequence that exercises the whole product:
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -836,10 +836,12 @@ runtime.
- **One-shot mode** — `small-harness --print "summarize this repo"` or
`printf '…\n' | small-harness` for scripts and CI. Approval-gated tools
are denied by default; pass `--allow-tools` to allow them.
- **Agent eval CLI** — `small-harness --eval fix-failing-test [--model M] [--json]`
- **Agent eval** — `small-harness --eval fix-failing-test [--model M] [--json]`
runs a bundled eval fixture and exits 0/1 (for CI scripts). `--eval` can
also point at a data-only fixture JSON file; its workspace is resolved
relative to that file and rejected if it escapes the fixture root.
relative to that file and rejected if it escapes the fixture root. In the
interactive TUI, `/eval agent <fixture.json>` accepts the same external
fixture path.
- **Warmup.** Small Harness sends a 1-token request with the full system
prompt + tools at startup so llama.cpp-derived engines have a hot
prompt-eval cache before your first prompt. Disable with `WARMUP=false`.
Expand Down
59 changes: 54 additions & 5 deletions src/commands/workflow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ async fn cmd_eval_agent(parts: &[&str], state: &AppState) -> Result<()> {
};
(vec![fixture], models)
};
let fixtures = resolve_eval_agent_fixtures(&fixture_ids)?;

let eval_dir = Path::new(&state.session_dir).join("evals");
fs::create_dir_all(&eval_dir)?;
Expand All @@ -262,16 +263,14 @@ async fn cmd_eval_agent(parts: &[&str], state: &AppState) -> Result<()> {
for spec in model_specs {
let (backend_desc, model) = parse_eval_model(&spec, state);
validate(&backend_desc)?;
for fixture_id in &fixture_ids {
let fixture = crate::agent_eval::fixture_by_id(fixture_id)
.ok_or_else(|| anyhow!("unknown agent eval fixture: {fixture_id}"))?;
for fixture in &fixtures {
println!(
" {DIM}agent eval {} · {} · {}{RESET}",
fixture_id,
fixture.id,
backend_desc.name.as_str(),
model
);
let result = run_agent_eval(&state.config, &backend_desc, &model, &fixture).await?;
let result = run_agent_eval(&state.config, &backend_desc, &model, fixture).await?;
println!(
" {} {}{RESET}",
if result.passed {
Expand All @@ -297,6 +296,15 @@ async fn cmd_eval_agent(parts: &[&str], state: &AppState) -> Result<()> {
Ok(())
}

/// Turns the fixture specs given to `/eval agent` into loaded fixtures.
///
/// Each spec is passed, in order, to `crate::agent_eval::fixture_by_spec`.
///
/// # Errors
///
/// Stops at the first spec that fails to resolve and returns that error;
/// later specs are not attempted.
fn resolve_eval_agent_fixtures(
    fixture_specs: &[String],
) -> Result<Vec<crate::agent_eval::AgentEvalFixture>> {
    let mut resolved = Vec::with_capacity(fixture_specs.len());
    for spec in fixture_specs {
        resolved.push(crate::agent_eval::fixture_by_spec(spec)?);
    }
    Ok(resolved)
}

pub(super) fn cmd_batch(args: &str, state: &AppState) -> Result<()> {
let parts: Vec<&str> = args.split_whitespace().collect();
if parts.is_empty() {
Expand Down Expand Up @@ -870,3 +878,44 @@ fn truncate_string(s: &str, max_len: usize) -> String {
format!("{}...", &s[..max_len.saturating_sub(3)])
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Resolving a path to an external fixture JSON file yields exactly one
    /// fixture carrying the id from that file and a populated `fixture_root`.
    #[test]
    fn eval_agent_resolves_external_fixture_specs() {
        // Lay out a tiny crate for the fixture's `workspace` entry to point at.
        let scratch = tempfile::tempdir().unwrap();
        let root = scratch.path();
        let manifest = "[package]\nname=\"x\"\nversion=\"0.1.0\"\nedition=\"2021\"\n";
        fs::create_dir_all(root.join("workspace/basic/src")).unwrap();
        fs::write(root.join("workspace/basic/Cargo.toml"), manifest).unwrap();
        fs::write(root.join("workspace/basic/src/lib.rs"), "pub fn add() {}\n").unwrap();

        // The fixture file sits next to the `workspace` directory it names.
        let fixture_json = r#"{
"id": "external-basic",
"prompt": "Read the library.",
"workspace": "workspace/basic",
"checks": [
{ "type": "fileContains", "path": "src/lib.rs", "needle": "add" }
]
}"#;
        let json_path = root.join("external.json");
        fs::write(&json_path, fixture_json).unwrap();

        let specs = vec![json_path.to_str().unwrap().to_string()];
        let resolved = resolve_eval_agent_fixtures(&specs).unwrap();

        assert_eq!(resolved.len(), 1);
        let fixture = &resolved[0];
        assert_eq!(fixture.id, "external-basic");
        assert!(fixture.fixture_root.is_some());
    }
}
Loading