#!/usr/bin/env python3
"""Aggregate benchmark results from an iteration directory.

Reads grading.json and timing.json from each eval-N/{with_skill,without_skill}/
subdirectory and produces benchmark.json + benchmark.md in the iteration directory.

Usage:
    python aggregate_benchmark.py <iteration-dir> --skill-name <skill-name>

Example:
    python aggregate_benchmark.py workspace/iteration-1 --skill-name test-driven-development

Requirements:
    Python 3.8+ (standard library only)
"""

from __future__ import annotations

import argparse
import json
import math
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with ``iteration_dir`` (str) and ``skill_name`` (str).
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark results from a skill evaluation iteration.",
        epilog=(
            "Example:\n"
            "  python aggregate_benchmark.py workspace/iteration-1 "
            "--skill-name test-driven-development"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "iteration_dir",
        type=str,
        help="Path to the iteration directory (e.g. workspace/iteration-1)",
    )
    parser.add_argument(
        "--skill-name",
        required=True,
        help="Skill name in kebab-case (e.g. test-driven-development)",
    )
    return parser.parse_args(argv)


def _warn(message: str) -> None:
    """Print a ``[WARN]``-prefixed message to stderr."""
    print(f"[WARN] {message}", file=sys.stderr)


def _load_json(path: Path) -> Optional[Dict[str, Any]]:
    """Load a JSON object from *path*, returning None on any failure.

    A warning is printed to stderr when the file is missing, unreadable,
    not valid JSON, or valid JSON whose top-level value is not an object.
    The last case matters because every caller uses ``.get`` on the result.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        _warn(f"File not found, skipping: {path}")
        return None
    except json.JSONDecodeError as exc:
        _warn(f"Invalid JSON in {path}: {exc}")
        return None
    except (OSError, UnicodeDecodeError) as exc:
        # e.g. permission denied, path is a directory, or non-UTF-8 bytes.
        _warn(f"Cannot read {path}: {exc}")
        return None

    if not isinstance(data, dict):
        _warn(f"Expected a JSON object in {path}, got {type(data).__name__}")
        return None
    return data


def _pass_rate(grading: Dict[str, Any]) -> float:
    """Return the fraction of ``grading["expectations"]`` entries that passed.

    An entry counts as passed when it is a dict with a truthy ``"passed"``
    value. A missing, empty, or non-list ``expectations`` yields 0.0.
    """
    expectations = grading.get("expectations", [])
    if not isinstance(expectations, list) or not expectations:
        return 0.0
    passed = sum(
        1 for e in expectations if isinstance(e, dict) and e.get("passed", False)
    )
    return passed / len(expectations)


def _numeric(data: Dict[str, Any], key: str, default: Any) -> Any:
    """Return ``data[key]`` if it is an int or float, otherwise *default*.

    Booleans are rejected even though ``bool`` subclasses ``int``; a
    ``true`` token count would otherwise be summed as 1.
    """
    value = data.get(key, default)
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return default
    return value


def _mean(values: List[float]) -> float:
    """Return the arithmetic mean of *values*, or 0.0 for an empty list."""
    if not values:
        return 0.0
    return sum(values) / len(values)


def _stddev(values: List[float]) -> float:
    """Return the population standard deviation of *values*.

    Divides by ``len(values)`` (not ``len(values) - 1``). Fewer than two
    values yields 0.0.
    """
    if len(values) < 2:
        return 0.0
    m = _mean(values)
    variance = sum((x - m) ** 2 for x in values) / len(values)
    return math.sqrt(variance)


def _extract_iteration_number(iteration_dir: Path) -> int:
    """Extract the iteration number from a directory name like 'iteration-3'.

    Falls back to 1 when the name does not start with ``iteration-`` or the
    suffix is not an integer.
    """
    name = iteration_dir.name
    if name.startswith("iteration-"):
        try:
            return int(name.split("-", 1)[1])
        except ValueError:
            pass
    return 1


def _discover_eval_dirs(iteration_dir: Path) -> List[Tuple[int, Path]]:
    """Return ``(eval_id, directory)`` pairs for eval-N subdirectories.

    The real directory path is kept alongside the parsed id so that names
    such as ``eval-01`` are read from where they actually live instead of
    being re-derived as ``eval-1``. When two directories parse to the same
    id, the first one in name order wins and the other is reported on
    stderr. Pairs are returned sorted by id.
    """
    if not iteration_dir.is_dir():
        return []

    found: Dict[int, Path] = {}
    for entry in sorted(iteration_dir.iterdir(), key=lambda p: p.name):
        if not (entry.is_dir() and entry.name.startswith("eval-")):
            continue
        try:
            eval_id = int(entry.name.split("-", 1)[1])
        except ValueError:
            continue
        if eval_id in found:
            _warn(
                f"Duplicate eval id {eval_id}: ignoring {entry.name}, "
                f"using {found[eval_id].name}"
            )
            continue
        found[eval_id] = entry
    return sorted(found.items())


def _discover_evals(iteration_dir: Path) -> List[int]:
    """Discover eval-N directories and return their sorted eval IDs."""
    return [eval_id for eval_id, _ in _discover_eval_dirs(iteration_dir)]


def _load_variant(variant_dir: Path) -> Optional[Dict[str, Any]]:
    """Load grading.json and timing.json from one variant directory.

    Both files are always attempted so that a warning is emitted for each
    one that is missing or invalid.

    Returns:
        ``{"pass_rate", "tokens", "duration"}`` or None if either file
        could not be loaded.
    """
    grading = _load_json(variant_dir / "grading.json")
    timing = _load_json(variant_dir / "timing.json")
    if grading is None or timing is None:
        return None
    return {
        "pass_rate": round(_pass_rate(grading), 4),
        "tokens": _numeric(timing, "total_tokens", 0),
        "duration": _numeric(timing, "total_duration_seconds", 0.0),
    }


def _collect_eval_result(
    eval_dir: Path, eval_id: int
) -> Optional[Dict[str, Any]]:
    """Collect with_skill vs baseline results for a single eval.

    Args:
        eval_dir: Directory containing ``with_skill/`` and ``without_skill/``.
        eval_id: Numeric id recorded in the result and used in warnings.

    Returns:
        A dict with ``eval_id``, ``with_skill`` and ``baseline`` entries, or
        None (after a warning) when either side is incomplete.
    """
    with_skill = _load_variant(eval_dir / "with_skill")
    baseline = _load_variant(eval_dir / "without_skill")

    if with_skill is None:
        _warn(f"Incomplete with_skill data for eval-{eval_id}, skipping.")
        return None
    if baseline is None:
        _warn(f"Incomplete baseline data for eval-{eval_id}, skipping.")
        return None

    return {
        "eval_id": eval_id,
        "with_skill": with_skill,
        "baseline": baseline,
    }


def _build_summary(
    eval_results: List[Dict[str, Any]],
) -> Dict[str, Dict[str, float]]:
    """Build mean/stddev summary statistics from the with_skill results.

    Only the ``with_skill`` side is summarised; baseline figures appear
    per-eval in ``eval_results`` but not here.
    """
    pass_rates = [r["with_skill"]["pass_rate"] for r in eval_results]
    tokens = [float(r["with_skill"]["tokens"]) for r in eval_results]
    durations = [r["with_skill"]["duration"] for r in eval_results]

    return {
        "pass_rate": {
            "mean": round(_mean(pass_rates), 4),
            "stddev": round(_stddev(pass_rates), 4),
        },
        "tokens": {
            "mean": round(_mean(tokens), 2),
            "stddev": round(_stddev(tokens), 2),
        },
        "duration_seconds": {
            "mean": round(_mean(durations), 2),
            "stddev": round(_stddev(durations), 2),
        },
    }


def _generate_markdown(benchmark: Dict[str, Any]) -> str:
    """Generate a human-readable markdown report from benchmark data.

    Expects the keys ``skill_name``, ``iteration``, ``summary`` and
    ``eval_results`` in the shape built by ``main``. The returned text ends
    with a single trailing newline.
    """
    lines: List[str] = []
    skill = benchmark["skill_name"]
    iteration = benchmark["iteration"]
    summary = benchmark["summary"]

    lines.append(f"# Benchmark Report: {skill}")
    lines.append(f"\n**Iteration:** {iteration}")
    lines.append("")

    lines.append("## Summary")
    lines.append("")
    lines.append("| Metric | Mean | Std Dev |")
    lines.append("|--------|------|---------|")
    lines.append(
        f"| Pass Rate | {summary['pass_rate']['mean']:.2%} "
        f"| {summary['pass_rate']['stddev']:.4f} |"
    )
    lines.append(
        f"| Tokens | {summary['tokens']['mean']:.0f} "
        f"| {summary['tokens']['stddev']:.0f} |"
    )
    lines.append(
        f"| Duration (s) | {summary['duration_seconds']['mean']:.2f} "
        f"| {summary['duration_seconds']['stddev']:.2f} |"
    )
    lines.append("")

    lines.append("## Eval Results")
    lines.append("")
    lines.append(
        "| Eval | With Skill (pass) | Baseline (pass) | "
        "With Skill (tokens) | Baseline (tokens) | "
        "With Skill (dur) | Baseline (dur) |"
    )
    lines.append(
        "|------|-------------------|-----------------|"
        "--------------------|-------------------|"
        "-----------------|----------------|"
    )

    for r in benchmark["eval_results"]:
        ws = r["with_skill"]
        bl = r["baseline"]
        lines.append(
            f"| eval-{r['eval_id']} "
            f"| {ws['pass_rate']:.2%} | {bl['pass_rate']:.2%} "
            f"| {ws['tokens']} | {bl['tokens']} "
            f"| {ws['duration']:.2f}s | {bl['duration']:.2f}s |"
        )

    # Trailing empty element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)


def main(argv: Optional[List[str]] = None) -> int:
    """Aggregate one iteration directory and write benchmark.json / benchmark.md.

    Args:
        argv: Argument list; ``None`` means use ``sys.argv``.

    Returns:
        0 on success. 1 when the iteration directory is missing, contains
        no eval-N directories, or none of them has complete data.
    """
    args = parse_args(argv)
    iteration_dir = Path(args.iteration_dir).resolve()

    if not iteration_dir.is_dir():
        print(f"[ERROR] Not a directory: {iteration_dir}", file=sys.stderr)
        return 1

    eval_dirs = _discover_eval_dirs(iteration_dir)
    if not eval_dirs:
        print(
            f"[ERROR] No eval-N directories found in {iteration_dir}",
            file=sys.stderr,
        )
        return 1

    eval_results: List[Dict[str, Any]] = []
    for eval_id, eval_dir in eval_dirs:
        # Use the discovered path, not a rebuilt "eval-{id}" name.
        result = _collect_eval_result(eval_dir, eval_id)
        if result is not None:
            eval_results.append(result)

    if not eval_results:
        print(
            "[ERROR] No complete eval results found. "
            "Check warnings above for details.",
            file=sys.stderr,
        )
        return 1

    iteration_number = _extract_iteration_number(iteration_dir)

    benchmark: Dict[str, Any] = {
        "skill_name": args.skill_name,
        "iteration": iteration_number,
        "summary": _build_summary(eval_results),
        "eval_results": eval_results,
    }

    # Write benchmark.json
    json_path = iteration_dir / "benchmark.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)
    print(f"[OK] Written: {json_path}")

    # Write benchmark.md
    md_path = iteration_dir / "benchmark.md"
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(_generate_markdown(benchmark))
    print(f"[OK] Written: {md_path}")

    # Print summary
    s = benchmark["summary"]
    print(
        f"\n--- Iteration {iteration_number} Summary ---\n"
        f"  Evals collected: {len(eval_results)}\n"
        f"  Pass rate:       {s['pass_rate']['mean']:.2%} "
        f"(stddev {s['pass_rate']['stddev']:.4f})\n"
        f"  Tokens:          {s['tokens']['mean']:.0f} "
        f"(stddev {s['tokens']['stddev']:.0f})\n"
        f"  Duration:        {s['duration_seconds']['mean']:.2f}s "
        f"(stddev {s['duration_seconds']['stddev']:.2f}s)"
    )

    return 0


if __name__ == "__main__":
    sys.exit(main())
+# +# Usage: +# ./init_skill.sh [--path ] [--help] +# +# Example: +# ./init_skill.sh my-awesome-skill +# ./init_skill.sh my-awesome-skill --path /custom/skills/dir +# +# Creates: +# // +# ├── SKILL.md # Skill template with frontmatter +# └── references/ # Supporting files directory + +set -euo pipefail + +VERSION="1.0.0" + +# ────────────────────────────────────────────── +# Help +# ────────────────────────────────────────────── + +usage() { + cat <<'HELP' +init_skill.sh — Scaffold a new skill directory + +USAGE + ./init_skill.sh [OPTIONS] + +ARGUMENTS + skill-name Skill name in kebab-case (e.g. my-awesome-skill) + Must start with a letter, contain only lowercase + letters, digits, and hyphens. + +OPTIONS + --path Parent directory for the skill (default: current directory) + --help Show this help message + --version Show version + +EXAMPLES + ./init_skill.sh test-driven-development + ./init_skill.sh my-skill --path ./skills + ./init_skill.sh code-review --path /absolute/path/to/skills + +OUTPUT + Creates the following structure: + + / + ├── SKILL.md Skill definition with frontmatter template + └── references/ Directory for supporting files +HELP +} + +# ────────────────────────────────────────────── +# Argument parsing +# ────────────────────────────────────────────── + +SKILL_NAME="" +PARENT_DIR="." + +while [[ $# -gt 0 ]]; do + case "$1" in + --help|-h) + usage + exit 0 + ;; + --version|-v) + echo "init_skill.sh v${VERSION}" + exit 0 + ;; + --path) + if [[ -z "${2:-}" ]]; then + echo "[ERROR] --path requires a directory argument" >&2 + exit 1 + fi + PARENT_DIR="$2" + shift 2 + ;; + -*) + echo "[ERROR] Unknown option: $1" >&2 + echo "Run './init_skill.sh --help' for usage." >&2 + exit 1 + ;; + *) + if [[ -n "$SKILL_NAME" ]]; then + echo "[ERROR] Unexpected argument: $1" >&2 + echo "Only one skill name is allowed." 
>&2 + exit 1 + fi + SKILL_NAME="$1" + shift + ;; + esac +done + +# ────────────────────────────────────────────── +# Validation +# ────────────────────────────────────────────── + +if [[ -z "$SKILL_NAME" ]]; then + echo "[ERROR] Skill name is required." >&2 + echo "Run './init_skill.sh --help' for usage." >&2 + exit 1 +fi + +# Validate kebab-case: starts with letter, lowercase + digits + hyphens only +if ! echo "$SKILL_NAME" | grep -qE '^[a-z][a-z0-9-]*$'; then + echo "[ERROR] Invalid skill name: '$SKILL_NAME'" >&2 + echo "Must be kebab-case: start with a letter, contain only lowercase letters, digits, and hyphens." >&2 + exit 1 +fi + +if [[ ! -d "$PARENT_DIR" ]]; then + echo "[ERROR] Parent directory does not exist: $PARENT_DIR" >&2 + exit 1 +fi + +SKILL_DIR="${PARENT_DIR}/${SKILL_NAME}" + +# Prevent overwriting existing directory +if [[ -d "$SKILL_DIR" ]]; then + echo "[ERROR] Directory already exists: $SKILL_DIR" >&2 + echo "Remove it first or choose a different name." >&2 + exit 1 +fi + +# ────────────────────────────────────────────── +# Scaffold +# ────────────────────────────────────────────── + +mkdir -p "${SKILL_DIR}/references" "${SKILL_DIR}/examples" "${SKILL_DIR}/scripts" + +cat > "${SKILL_DIR}/SKILL.md" <