Merged
26 commits
da6bed4
autoresearch: verify script + baseline scaffolding
Gradata Apr 21, 2026
59ac572
autoresearch: reduce context_inject snippet 500→200 chars, max_contex…
Gradata Apr 21, 2026
6c92926
autoresearch: compact context_inject prefix 'brain context: '→'ctx:' …
Gradata Apr 21, 2026
305f9d0
autoresearch: strip XML comments from brain_prompt, abbreviate JIT st…
Gradata Apr 21, 2026
c1c8b0d
autoresearch: reduce context_inject top_k 3→2 (-48 tokens/turn)
Gradata Apr 21, 2026
1aa7ce3
autoresearch: compact jit/agent wrappers to single-header, abbreviate…
Gradata Apr 21, 2026
a6667af
autoresearch: strip YAML frontmatter from context_inject snippets (-3…
Gradata Apr 21, 2026
d0e39d3
Revert "autoresearch: strip YAML frontmatter from context_inject snip…
Gradata Apr 21, 2026
d2d20f9
autoresearch: strip frontmatter+compact separator, suppress empty age…
Gradata Apr 21, 2026
9ba385d
autoresearch: suppress bm25s Windows stdout noise during import (-7 t…
Gradata Apr 21, 2026
96ba088
autoresearch: harden verify harness against threshold-gaming (4-promp…
Gradata Apr 21, 2026
b973340
autoresearch: compact JIT prefix [P:0.83]→[P83] saves 3 tok/rule
Gradata Apr 21, 2026
e382769
autoresearch: dedup JIT rules by description text (same desc, differe…
Gradata Apr 21, 2026
98278a6
autoresearch: drop [jit] header, compact IFB prefix, strip bold+colla…
Gradata Apr 21, 2026
d372132
autoresearch: drop JIT category label (desc is self-explanatory, save…
Gradata Apr 21, 2026
50b63d1
autoresearch: drop JIT state+confidence prefix [Pxx] saves ~3 tok/rule
Gradata Apr 21, 2026
4a446c7
autoresearch: reduce JIT DEFAULT_MAX_RULES 5→3 saves ~3.25 tok/turn
Gradata Apr 21, 2026
958cfb7
autoresearch: reduce JIT DEFAULT_MAX_RULES 3→2 saves ~8.75 tok/turn
Gradata Apr 21, 2026
dfabcf1
autoresearch: reduce JIT DEFAULT_MAX_RULES 2→1 saves ~16 tok/turn
Gradata Apr 21, 2026
d387de9
autoresearch: strip Active guidance/disposition sections from wisdom …
Gradata Apr 21, 2026
c35bc2e
autoresearch: dedup JIT against session wisdom block (Jaccard 0.25)
Gradata Apr 21, 2026
699827a
autoresearch: raise JIT DEFAULT_MIN_CONFIDENCE 0.60→0.90
Gradata Apr 21, 2026
61b43c8
autoresearch: compress wisdom headers + limit 9 rules + suppress impl…
Gradata Apr 21, 2026
0bb2de9
autoresearch: reduce GRADATA_WISDOM_MAX_RULES default 9→6 saves 53 tok
Gradata Apr 21, 2026
5eabc48
autoresearch: reduce GRADATA_WISDOM_MAX_RULES default 6→3 saves ~59 tok
Gradata Apr 21, 2026
f5e2ed7
tests: align assertions with compressed JIT output format
Gradata Apr 22, 2026
371 changes: 371 additions & 0 deletions Gradata/scripts/autoresearch_verify_tokens.py
@@ -0,0 +1,371 @@
"""Autoresearch verify script — measures Gradata per-session token emissions.

Simulates 3 scenarios (minimal / typical / heavy) and sums the tokens Gradata
emits into model context via its 10 identified emit surfaces (SessionStart,
UserPromptSubmit, PreToolUse, PostToolUse, PreCompact hooks). Counts tokens
with tiktoken cl100k_base.

Gates (all must pass for the sample to be valid):

1. correctness_gate — fast pytest subset passes
2. semantic_gate — no diff vs branch parent in frozen paths (domain/, lessons.md)
3. retrieval_integrity_gate — Jaccard of injected rule IDs vs baseline ≥ 0.8

Prints on success (exit 0)::

    weighted_tokens=<median_total>
    session_once=<tokens>
    per_turn=<tokens>
    per_edit=<tokens>
    per_agent=<tokens>
    scenario=<name> total=<tokens> ...  (one line per scenario)

On gate failure prints the failing gate name and exits non-zero.
"""

from __future__ import annotations

import json
import os
import re
import statistics
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
PYTHON = sys.executable
TMP = REPO_ROOT / ".tmp" / "autoresearch"
TMP.mkdir(parents=True, exist_ok=True)

# Frozen paths — semantic gate fails if any of these have a diff vs branch parent.
FROZEN_GLOBS = [
    "domain/",
    "brain/lessons.md",
    "lessons.md",
]

# Branch parent — fork point of autoresearch/token-budget.
BRANCH_PARENT = "feat/token-optimization-autoresearch"
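# Note: `git diff <ref>` compares the working tree against that branch's
# current tip, not the historical fork point; if the parent branch advances,
# pin the fork point via `git merge-base HEAD feat/token-optimization-autoresearch`.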

# Scenarios: (turns, edits, agents) per simulated session.
SCENARIOS = {
    "minimal": {"turns": 1, "edits": 1, "agents": 0},
    "typical": {"turns": 10, "edits": 10, "agents": 2},
    "heavy": {"turns": 40, "edits": 40, "agents": 5},
}
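# A scenario's total is once_tokens + turns*turn_tokens + edits*edit_tokens
# + agents*agent_tokens; weighted_tokens is the median of the three totals
# (computed in measure_weighted_tokens below).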

# Rule-ID pattern for retrieval-integrity gate. Matches lines like
# `[RULE:0.91 r:a3f2] CODE: ...` or `[CLUSTER:0.85 r:b1c2] ...`.
RULE_ID_PATTERN = re.compile(r"\br:([a-f0-9]{4,})\b")
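# e.g. RULE_ID_PATTERN.findall("[RULE:0.91 r:a3f2] CODE: ...") == ["a3f2"]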

# Enable optional injection paths so we measure the full blast radius.
HOOK_ENV = {
    "GRADATA_CONTEXT_INJECT": "1",
    "GRADATA_JIT_ENABLED": "1",
    "GRADATA_RULE_ENFORCEMENT": "1",
}


def _tiktoken_encoding():
    import tiktoken

    return tiktoken.get_encoding("cl100k_base")


def _count(text: str, enc) -> int:
    return len(enc.encode(text)) if text else 0


def _run_hook(module: str, data: dict) -> str:
    """Invoke a hook's `main(data)` in a subprocess; return the 'result' string."""
    code = (
        "import json, sys\n"
        f"sys.path.insert(0, {str(REPO_ROOT / 'src')!r})\n"
        f"from {module} import main\n"
        f"data = json.loads({json.dumps(json.dumps(data))})\n"
        "out = main(data)\n"
        "if out and isinstance(out, dict):\n"
        "    print(out.get('result', ''))\n"
    )
    env = {**os.environ, **HOOK_ENV}
    proc = subprocess.run(
        [PYTHON, "-c", code],
        capture_output=True,
        text=True,
        timeout=30,
        cwd=str(REPO_ROOT),
        env=env,
    )
    return proc.stdout if proc.returncode == 0 else ""
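
# NOTE: a non-zero hook exit collapses to "" above, so a crashing hook is
# counted as zero tokens. Keep the correctness gate green before trusting
# low numbers; a silent hook failure looks like a saving.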


def _collect_once_strings() -> dict[str, str]:
    """Return strings emitted once per session (SessionStart hooks)."""
    data = {
        "hook_event_name": "SessionStart",
        "session_id": "autoresearch",
        "source": "startup",
        "cwd": str(REPO_ROOT),
    }
    return {
        "inject_brain_rules": _run_hook("gradata.hooks.inject_brain_rules", data),
        "inject_handoff": _run_hook("gradata.hooks.inject_handoff", data),
    }


# Four prompt lengths probe the per-turn surface. Threshold-gaming (raising
# MIN_MESSAGE_LEN / MIN_DRAFT_LEN so short prompts silently skip injection)
# now moves the average by at most the short prompt's quarter-weight share,
# because the three longer prompts still trigger injection.
_PROBE_PROMPTS = [
    # ~40 chars — short turn
    "fix this null pointer in the auth handler",
    # ~250 chars — medium
    (
        "Help me debug an authentication flow where tokens keep expiring before "
        "requests complete. I've already tried increasing the TTL but users still "
        "hit 401s intermittently — what could be causing this?"
    ),
    # ~700 chars — long
    (
        "Walk me through how the rule-graduation pipeline decides when an INSTINCT "
        "promotes to a PATTERN. I see the threshold is 0.60 but I'm seeing rules with "
        "confidence 0.62 stuck as INSTINCT for days. Is there a survival-count "
        "requirement on top? And if I force-graduate one manually through brain.patch_rule, "
        "does that re-enter the dedup pipeline or is it treated as hand-curated content "
        "that bypasses clustering? I want to make sure I don't accidentally create "
        "duplicates when I manually promote rules from the dashboard."
    ),
    # ~1800 chars — very long (multi-paragraph prompt)
    (
        "I'm designing a new cold-start path for Gradata where the first Brain() "
        "instantiation in a fresh temp dir needs to be under 200ms. Currently it's "
        "~250ms and the culprit is eager schema probes in _db.init_schema plus the "
        "module-level bm25s import which pulls in numpy. Questions: (1) Can I lazy-"
        "defer init_schema until the first DB read? The concern is that test fixtures "
        "create a Brain and immediately call .correct() — so 'first read' is essentially "
        "'first operation'. (2) For bm25s, is there a way to make its import side-effect-"
        "free on Windows? I noticed it spits diagnostic text to stdout during import on "
        "3.12. (3) More broadly — is there a pattern in the codebase where heavy "
        "enhancements register themselves via entry_points so the Brain doesn't have to "
        "eagerly import everything under enhancements/? I want to know if the SDK has "
        "a plugin protocol I should be using instead of the current hard imports. This "
        "matters because downstream projects have complained about import time and "
        "we've already shipped batch 7-10 performance fixes but import is still the "
        "long pole. Looking for architectural guidance not just micro-optimization."
    ),
]


def _collect_per_turn_strings() -> list[dict[str, str]]:
    """Return emissions for each probe prompt — preserves variance across lengths."""
    turns: list[dict[str, str]] = []
    for prompt in _PROBE_PROMPTS:
        data = {
            "hook_event_name": "UserPromptSubmit",
            "session_id": "autoresearch",
            "prompt": prompt,
        }
        turns.append(
            {
                "context_inject": _run_hook("gradata.hooks.context_inject", data),
                "implicit_feedback": _run_hook("gradata.hooks.implicit_feedback", data),
                "jit_inject": _run_hook("gradata.hooks.jit_inject", data),
            }
        )
    return turns


def _collect_per_edit_strings() -> dict[str, str]:
    pre = {
        "hook_event_name": "PreToolUse",
        "tool_name": "Edit",
        "tool_input": {
            "file_path": "src/foo.py",
            "old_string": "x = 1",
            "new_string": "x = 2",
        },
    }
    post = {
        "hook_event_name": "PostToolUse",
        "tool_name": "Edit",
        "tool_input": pre["tool_input"],
        "tool_response": {"success": True},
    }
    return {
        "rule_enforcement": _run_hook("gradata.hooks.rule_enforcement", pre),
        "auto_correct": _run_hook("gradata.hooks.auto_correct", post),
    }


def _collect_per_agent_strings() -> dict[str, str]:
    data = {
        "hook_event_name": "PreToolUse",
        "tool_name": "Agent",
        "tool_input": {
            "subagent_type": "general-purpose",
            "prompt": "Investigate why authentication tokens expire early.",
            "description": "auth token investigation",
        },
    }
    return {"agent_precontext": _run_hook("gradata.hooks.agent_precontext", data)}


def measure_weighted_tokens() -> dict:
    enc = _tiktoken_encoding()

    once = _collect_once_strings()
    turn = _collect_per_turn_strings()
    edit = _collect_per_edit_strings()
    agent = _collect_per_agent_strings()

    once_tokens = sum(_count(s, enc) for s in once.values())
    # turn is a list of dicts (one per probe prompt) — average across lengths
    # so threshold-gaming on one length doesn't dominate.
    per_prompt_turn_tokens = [
        sum(_count(s, enc) for s in prompt_group.values()) for prompt_group in turn
    ]
    turn_tokens = (
        sum(per_prompt_turn_tokens) / len(per_prompt_turn_tokens) if per_prompt_turn_tokens else 0
    )
    edit_tokens = sum(_count(s, enc) for s in edit.values())
    agent_tokens = sum(_count(s, enc) for s in agent.values())

    samples = []
    for name, cfg in SCENARIOS.items():
        total = (
            once_tokens
            + turn_tokens * cfg["turns"]
            + edit_tokens * cfg["edits"]
            + agent_tokens * cfg["agents"]
        )
        samples.append(
            {
                "scenario": name,
                "session_once": once_tokens,
                "turn_tokens": turn_tokens,
                "edit_tokens": edit_tokens,
                "agent_tokens": agent_tokens,
                "turns": cfg["turns"],
                "edits": cfg["edits"],
                "agents": cfg["agents"],
                "total": total,
            }
        )

    weighted_median = statistics.median(s["total"] for s in samples)
    return {
        "weighted_tokens": weighted_median,
        "samples": samples,
        "per_turn": turn_tokens,
        "per_edit": edit_tokens,
        "per_agent": agent_tokens,
        "once": once_tokens,
        "raw_strings": {
            "once": once,
            "turn": turn,
            "edit": edit,
            "agent": agent,
        },
    }


def correctness_gate() -> bool:
    proc = subprocess.run(
        [
            PYTHON,
            "-m",
            "pytest",
            "tests/test_brain.py",
            "tests/test_core_behavioral.py",
            "-q",
            "--tb=no",
            "-x",
        ],
        capture_output=True,
        text=True,
        timeout=300,
        cwd=str(REPO_ROOT),
    )
    if proc.returncode != 0:
        sys.stderr.write(proc.stdout[-2000:])
        sys.stderr.write(proc.stderr[-2000:])
        return False
    return True


def semantic_gate() -> bool:
    for path in FROZEN_GLOBS:
        proc = subprocess.run(
            ["git", "diff", "--name-only", BRANCH_PARENT, "--", path],
            capture_output=True,
            text=True,
            cwd=str(REPO_ROOT),
        )
        if proc.stdout.strip():
            sys.stderr.write(f"semantic_gate violation in {path}:\n{proc.stdout}\n")
            return False
    return True


def _extract_rule_ids(raw_strings: dict) -> set[str]:
    ids: set[str] = set()
    for group in raw_strings.values():
        iterable = group if isinstance(group, list) else [group]
        for bucket in iterable:
            for emitted in bucket.values():
                ids.update(RULE_ID_PATTERN.findall(emitted))
    return ids


def retrieval_integrity_gate(raw_strings: dict) -> bool:
    baseline_path = TMP / "baseline_rules.json"
    current = _extract_rule_ids(raw_strings)
    if not baseline_path.exists():
        baseline_path.write_text(json.dumps(sorted(current)), encoding="utf-8")
        sys.stderr.write(f"baseline_rules captured ({len(current)} ids)\n")
        return True
    baseline = set(json.loads(baseline_path.read_text(encoding="utf-8")))
    if not baseline and not current:
        return True
    union = baseline | current
    inter = baseline & current
    jaccard = len(inter) / len(union) if union else 1.0
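    # e.g. baseline={a, b, c} vs current={a, b, d}: jaccard = 2/4 = 0.50 < 0.8 -> FAIL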
    if jaccard < 0.8:
        sys.stderr.write(
            f"retrieval_integrity_gate FAIL: jaccard={jaccard:.2f} "
            f"baseline={len(baseline)} current={len(current)} "
            f"intersection={len(inter)}\n"
        )
        return False
    return True


def main() -> int:
    if not correctness_gate():
        print("correctness_gate=FAIL")
        return 2
    if not semantic_gate():
        print("semantic_gate=FAIL")
        return 3
    result = measure_weighted_tokens()
    if not retrieval_integrity_gate(result["raw_strings"]):
        print("retrieval_integrity_gate=FAIL")
        return 4

    print(f"weighted_tokens={result['weighted_tokens']:.0f}")
    print(f"session_once={result['once']}")
    print(f"per_turn={result['per_turn']}")
    print(f"per_edit={result['per_edit']}")
    print(f"per_agent={result['per_agent']}")
    for s in result["samples"]:
        print(
            f"scenario={s['scenario']} total={s['total']} "
            f"once={s['session_once']} "
            f"turns={s['turns']}×{s['turn_tokens']} "
            f"edits={s['edits']}×{s['edit_tokens']} "
            f"agents={s['agents']}×{s['agent_tokens']}"
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
10 changes: 8 additions & 2 deletions Gradata/src/gradata/hooks/agent_precontext.py
@@ -206,11 +206,17 @@ def main(data: dict) -> dict | None:
         if parent_ids:
             top = [r for r in top if _compute_lesson_id(r) not in parent_ids]
 
+        if not top:
+            return None
+
+        _STATE_ABBREV = {"PATTERN": "P", "INSTINCT": "I", "RULE": "R"}
         lines = []
         for r in top:
-            lines.append(f"[{r.state.name}:{r.confidence:.2f}] {r.category}: {r.description}")
+            abbrev = _STATE_ABBREV.get(r.state.name, r.state.name)
+            lines.append(f"[{abbrev}:{r.confidence:.2f}] {r.category}: {r.description}")
 
-        block = "<agent-rules>\n" + "\n".join(lines) + "\n</agent-rules>"
+        # Compact header saves ~10 tokens vs XML open/close wrapper.
+        block = "[agent-rules]\n" + "\n".join(lines)
         return {"result": block}
     except Exception:
         return None