From e33b4335040dac697d35051f9c097a795b49de23 Mon Sep 17 00:00:00 2001
From: JeremyDev87 <soundbrokaz@kakao.com>
Date: Sat, 21 Mar 2026 22:53:17 +0900
Subject: [PATCH] feat(skills): add skill-creator scripts for benchmarking,
 optimization, and scaffolding (#742)

- aggregate_benchmark.py: aggregates grading.json + timing.json into benchmark.json/md
- run_loop.py: interactive description optimization loop with 60/40 train/test split
- init_skill.sh: skill directory scaffolding with SKILL.md template

Closes #742
---
 .../scripts/aggregate_benchmark.py            | 302 ++++++++++++++++
 .../skill-creator/scripts/init_skill.sh       | 196 +++++++++++
 .../skills/skill-creator/scripts/run_loop.py  | 327 ++++++++++++++++++
 3 files changed, 825 insertions(+)
 create mode 100644 packages/rules/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py
 create mode 100755 packages/rules/.ai-rules/skills/skill-creator/scripts/init_skill.sh
 create mode 100644 packages/rules/.ai-rules/skills/skill-creator/scripts/run_loop.py
diff --git a/packages/rules/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py b/packages/rules/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py
new file mode 100644
index 00000000..4c762ddc
--- /dev/null
+++ b/packages/rules/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+"""Aggregate benchmark results from an iteration directory.
+
+Reads grading.json and timing.json from each eval-N/{with_skill,without_skill}/
+subdirectory and produces benchmark.json + benchmark.md in the iteration directory.
+
+Usage:
+    python aggregate_benchmark.py <iteration-dir> --skill-name <name>
+
+Example:
+    python aggregate_benchmark.py workspace/iteration-1 --skill-name test-driven-development
+
+Requirements:
+    Python 3.8+ (standard library only)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
+    """Parse the CLI: positional iteration_dir plus required --skill-name.
+
+    With argv=None, argparse falls back to sys.argv[1:].
+    """
+    example = (
+        "Example:\n"
+        "  python aggregate_benchmark.py workspace/iteration-1 "
+        "--skill-name test-driven-development"
+    )
+    # RawDescriptionHelpFormatter keeps the epilog's line break intact.
+    cli = argparse.ArgumentParser(
+        description="Aggregate benchmark results from a skill evaluation iteration.",
+        epilog=example,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    dir_help = "Path to the iteration directory (e.g. workspace/iteration-1)"
+    cli.add_argument("iteration_dir", type=str, help=dir_help)
+    name_help = "Skill name in kebab-case (e.g. test-driven-development)"
+    cli.add_argument("--skill-name", required=True, help=name_help)
+    return cli.parse_args(argv)
+
+
+def _load_json(path: Path) -> Optional[Dict[str, Any]]:
+    """Load the JSON object at *path*; warn on stderr and return None on failure."""
+    try:
+        if not isinstance(data := json.loads(path.read_text(encoding="utf-8")), dict):
+            raise ValueError(f"expected a JSON object, got {type(data).__name__}")
+        return data
+    except FileNotFoundError:
+        print(f"[WARN] File not found, skipping: {path}", file=sys.stderr)
+    except ValueError as exc:  # also covers JSONDecodeError and UnicodeDecodeError
+        print(f"[WARN] Invalid JSON in {path}: {exc}", file=sys.stderr)
+    return None
+
+
+def _pass_rate(grading: Dict[str, Any]) -> float:
+    """Fraction of grading["expectations"] entries flagged passed; 0.0 when there are none."""
+    checks = grading.get("expectations", [])
+    if not checks:
+        return 0.0
+    hits = [check for check in checks if check.get("passed", False)]
+    return len(hits) / len(checks)
+
+
+def _mean(values: List[float]) -> float:
+    """Arithmetic mean of *values*; 0.0 for an empty list."""
+    count = len(values)
+    return sum(values) / count if count else 0.0
+
+
+def _stddev(values: List[float]) -> float:
+    """Population standard deviation (divides by N); 0.0 for fewer than two values."""
+    if len(values) < 2:
+        return 0.0
+    center = sum(values) / len(values)  # same value _mean() yields for a non-empty list
+    return math.sqrt(sum((v - center) ** 2 for v in values) / len(values))
+
+
+def _extract_iteration_number(iteration_dir: Path) -> int:
+    """Parse N from a directory named 'iteration-N'; 1 when the name does not fit."""
+    label, dash, suffix = iteration_dir.name.partition("-")
+    if (label, dash) != ("iteration", "-"):
+        return 1
+    try:
+        return int(suffix)
+    except ValueError:
+        return 1
+
+
+def _discover_evals(iteration_dir: Path) -> List[int]:
+    """Return sorted IDs of subdirectories named exactly f"eval-{N}" ([] if there are none)."""
+    eval_ids: List[int] = []
+    entries = iteration_dir.iterdir() if iteration_dir.is_dir() else []
+    for entry in entries:
+        if entry.is_dir() and entry.name.startswith("eval-"):
+            try:
+                eval_id = int(entry.name.split("-", 1)[1])
+            except ValueError:
+                continue
+            if entry.name == f"eval-{eval_id}":  # main() reopens f"eval-{id}"; skip 'eval-01'
+                eval_ids.append(eval_id)
+    return sorted(eval_ids)
+
+
+def _collect_eval_result(
+    eval_dir: Path, eval_id: int
+) -> Optional[Dict[str, Any]]:
+    """Pair the with_skill and without_skill measurements of one eval directory.
+
+    Both variants need a loadable grading.json and timing.json. All four files
+    are read up front, so each unreadable file produces its own warning; the
+    first incomplete variant then gets a summary warning on stderr and the
+    whole eval is dropped by returning None.
+
+    Missing timing keys fall back to 0 tokens / 0.0 seconds.
+    """
+    # (subdirectory on disk, key in the returned record)
+    variants = (("with_skill", "with_skill"), ("without_skill", "baseline"))
+
+    loaded = {
+        key: (
+            _load_json(eval_dir / subdir / "grading.json"),
+            _load_json(eval_dir / subdir / "timing.json"),
+        )
+        for subdir, key in variants
+    }
+
+    for key, (grading, timing) in loaded.items():
+        if grading is None or timing is None:
+            print(
+                f"[WARN] Incomplete {key} data for eval-{eval_id}, skipping.",
+                file=sys.stderr,
+            )
+            return None
+
+    record: Dict[str, Any] = {"eval_id": eval_id}
+    for key, (grading, timing) in loaded.items():
+        record[key] = {
+            "pass_rate": round(_pass_rate(grading), 4),
+            "tokens": timing.get("total_tokens", 0),
+            "duration": timing.get("total_duration_seconds", 0.0),
+        }
+    return record
+
+
+def _build_summary(
+    eval_results: List[Dict[str, Any]],
+) -> Dict[str, Dict[str, float]]:
+    """Summarise the with_skill runs as mean and standard deviation per metric.
+
+    Baseline numbers are not included. Pass-rate statistics are rounded to
+    4 decimal places, token and duration statistics to 2.
+    """
+    runs = [result["with_skill"] for result in eval_results]
+
+    def _stats(samples: List[float], digits: int) -> Dict[str, float]:
+        # One summary cell: both statistics rounded to the same precision.
+        return {
+            "mean": round(_mean(samples), digits),
+            "stddev": round(_stddev(samples), digits),
+        }
+
+    return {
+        "pass_rate": _stats([run["pass_rate"] for run in runs], 4),
+        "tokens": _stats([float(run["tokens"]) for run in runs], 2),
+        "duration_seconds": _stats([run["duration"] for run in runs], 2),
+    }
+
+
+def _generate_markdown(benchmark: Dict[str, Any]) -> str:
+    """Render *benchmark* as a markdown report and return it as one string.
+
+    Layout: title, iteration number, a summary table (mean / std dev of pass
+    rate, tokens and duration) and a per-eval table comparing with_skill to
+    baseline. The returned text ends with a newline.
+    """
+    stats = benchmark["summary"]
+    rate = stats["pass_rate"]
+    toks = stats["tokens"]
+    secs = stats["duration_seconds"]
+
+    report: List[str] = [
+        f"# Benchmark Report: {benchmark['skill_name']}",
+        f"\n**Iteration:** {benchmark['iteration']}",
+        "",
+        "## Summary",
+        "",
+        "| Metric | Mean | Std Dev |",
+        "|--------|------|---------|",
+        f"| Pass Rate | {rate['mean']:.2%} | {rate['stddev']:.4f} |",
+        f"| Tokens | {toks['mean']:.0f} | {toks['stddev']:.0f} |",
+        f"| Duration (s) | {secs['mean']:.2f} | {secs['stddev']:.2f} |",
+        "",
+        "## Eval Results",
+        "",
+        (
+            "| Eval | With Skill (pass) | Baseline (pass) | "
+            "With Skill (tokens) | Baseline (tokens) | "
+            "With Skill (dur) | Baseline (dur) |"
+        ),
+        (
+            "|------|-------------------|-----------------|"
+            "--------------------|-------------------|"
+            "-----------------|----------------|"
+        ),
+    ]
+
+    # One table row per eval, with_skill and baseline side by side.
+    for result in benchmark["eval_results"]:
+        skill_run = result["with_skill"]
+        base_run = result["baseline"]
+        cells = [
+            f"eval-{result['eval_id']}",
+            f"{skill_run['pass_rate']:.2%}",
+            f"{base_run['pass_rate']:.2%}",
+            f"{skill_run['tokens']}",
+            f"{base_run['tokens']}",
+            f"{skill_run['duration']:.2f}s",
+            f"{base_run['duration']:.2f}s",
+        ]
+        report.append("| " + " | ".join(cells) + " |")
+    report.append("")
+    return "\n".join(report)
+
+
+def main(argv: Optional[List[str]] = None) -> int:
+    """Aggregate an iteration dir into benchmark.json/.md; return 0 on success, 1 on error."""
+    args = parse_args(argv)
+    iteration_dir = Path(args.iteration_dir).resolve()
+    if not iteration_dir.is_dir():
+        print(f"[ERROR] Not a directory: {iteration_dir}", file=sys.stderr)
+        return 1
+
+    eval_ids = _discover_evals(iteration_dir)
+    if not eval_ids:
+        print(
+            f"[ERROR] No eval-N directories found in {iteration_dir}",
+            file=sys.stderr,
+        )
+        return 1
+
+    eval_results: List[Dict[str, Any]] = []
+    for eval_id in eval_ids:
+        eval_dir = iteration_dir / f"eval-{eval_id}"
+        result = _collect_eval_result(eval_dir, eval_id)
+        if result is not None:  # None means the eval was incomplete and already warned about
+            eval_results.append(result)
+
+    if not eval_results:
+        print(
+            "[ERROR] No complete eval results found. "
+            "Check warnings above for details.",
+            file=sys.stderr,
+        )
+        return 1
+
+    iteration_number = _extract_iteration_number(iteration_dir)
+
+    benchmark: Dict[str, Any] = {
+        "skill_name": args.skill_name,
+        "iteration": iteration_number,
+        "summary": _build_summary(eval_results),
+        "eval_results": eval_results,
+    }
+
+    # Write benchmark.json into the iteration directory (mode "w" replaces an existing file)
+    json_path = iteration_dir / "benchmark.json"
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(benchmark, f, indent=2, ensure_ascii=False)
+    print(f"[OK] Written: {json_path}")
+
+    # Write benchmark.md, the markdown rendering of the same data
+    md_path = iteration_dir / "benchmark.md"
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write(_generate_markdown(benchmark))
+    print(f"[OK] Written: {md_path}")
+
+    # Print the with_skill summary on stdout
+    s = benchmark["summary"]
+    print(
+        f"\n--- Iteration {iteration_number} Summary ---\n"
+        f"  Evals collected: {len(eval_results)}\n"
+        f"  Pass rate:  {s['pass_rate']['mean']:.2%} "
+        f"(stddev {s['pass_rate']['stddev']:.4f})\n"
+        f"  Tokens:     {s['tokens']['mean']:.0f} "
+        f"(stddev {s['tokens']['stddev']:.0f})\n"
+        f"  Duration:   {s['duration_seconds']['mean']:.2f}s "
+        f"(stddev {s['duration_seconds']['stddev']:.2f}s)"
+    )
+
+    return 0
+
+
+if __name__ == "__main__":  # script entry point
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/packages/rules/.ai-rules/skills/skill-creator/scripts/init_skill.sh b/packages/rules/.ai-rules/skills/skill-creator/scripts/init_skill.sh
new file mode 100755
index 00000000..3d027778
--- /dev/null
+++ b/packages/rules/.ai-rules/skills/skill-creator/scripts/init_skill.sh
@@ -0,0 +1,196 @@
+#!/usr/bin/env bash
+# init_skill.sh — Scaffold a new skill directory with SKILL.md template.
+#
+# Usage:
+#   ./init_skill.sh <skill-name> [--path <dir>] [--help]
+#
+# Example:
+#   ./init_skill.sh my-awesome-skill
+#   ./init_skill.sh my-awesome-skill --path /custom/skills/dir
+#
+# Creates <dir>/<skill-name>/ containing:
+#   ├── SKILL.md          # Skill template with frontmatter
+#   ├── references/       # Supporting files directory
+#   └── examples/, scripts/   # Empty directories, created alongside references/
+
+set -euo pipefail
+
+VERSION="1.0.0"
+
+# ──────────────────────────────────────────────
+# Help
+# ──────────────────────────────────────────────
+
+usage() {  # Print the help text on stdout (quoted heredoc: nothing inside is expanded).
+    cat <<'HELP'
+init_skill.sh — Scaffold a new skill directory
+
+USAGE
+    ./init_skill.sh <skill-name> [OPTIONS]
+
+ARGUMENTS
+    skill-name    Skill name in kebab-case (e.g. my-awesome-skill)
+                  Starts with a letter; lowercase letters, digits, hyphens only.
+
+OPTIONS
+    --path <dir>  Parent directory for the skill (default: current directory)
+    -h, --help    Show this help message
+    -v, --version Show version
+
+EXAMPLES
+    ./init_skill.sh test-driven-development
+    ./init_skill.sh my-skill --path ./skills
+    ./init_skill.sh code-review --path /absolute/path/to/skills
+
+OUTPUT
+    Creates the following structure:
+    <skill-name>/
+    ├── SKILL.md          Skill definition with frontmatter template
+    ├── references/       Directory for supporting files
+    ├── examples/         Directory for examples
+    └── scripts/          Directory for scripts
+HELP
+}
+
+# ──────────────────────────────────────────────
+# Argument parsing
+# ──────────────────────────────────────────────
+
+SKILL_NAME=""   # the single positional argument; stays empty until one is seen
+PARENT_DIR="."  # replaced by the value given to --path
+
+while [[ $# -gt 0 ]]; do  # consume the arguments left to right
+    case "$1" in
+        --help|-h)  # prints the help and exits at once; later arguments are never examined
+            usage
+            exit 0
+            ;;
+        --version|-v)
+            echo "init_skill.sh v${VERSION}"
+            exit 0
+            ;;
+        --path)
+            if [[ -z "${2:-}" ]]; then  # ${2:-} keeps `set -u` from aborting when no value follows
+                echo "[ERROR] --path requires a directory argument" >&2
+                exit 1
+            fi
+            PARENT_DIR="$2"
+            shift 2  # drop both the flag and its value
+            ;;
+        -*)  # any other dash-prefixed word is rejected instead of being taken as the skill name
+            echo "[ERROR] Unknown option: $1" >&2
+            echo "Run './init_skill.sh --help' for usage." >&2
+            exit 1
+            ;;
+        *)  # first bare word becomes the skill name; a second bare word is an error
+            if [[ -n "$SKILL_NAME" ]]; then
+                echo "[ERROR] Unexpected argument: $1" >&2
+                echo "Only one skill name is allowed." >&2
+                exit 1
+            fi
+            SKILL_NAME="$1"
+            shift
+            ;;
+    esac
+done
+
+# ──────────────────────────────────────────────
+# Validation
+# ──────────────────────────────────────────────
+
+if [[ -z "$SKILL_NAME" ]]; then
+    echo "[ERROR] Skill name is required." >&2
+    echo "Run './init_skill.sh --help' for usage." >&2
+    exit 1
+fi
+
+# Validate kebab-case on the WHOLE string (a line-based grep accepted "ok<newline>../x").
+if [[ ! "$SKILL_NAME" =~ ^[a-z][a-z0-9-]*$ ]]; then
+    echo "[ERROR] Invalid skill name: '$SKILL_NAME'" >&2
+    echo "Must be kebab-case: start with a letter, contain only lowercase letters, digits, and hyphens." >&2
+    exit 1
+fi
+
+if [[ ! -d "$PARENT_DIR" ]]; then
+    echo "[ERROR] Parent directory does not exist: $PARENT_DIR" >&2
+    exit 1
+fi
+
+SKILL_DIR="${PARENT_DIR}/${SKILL_NAME}"
+
+# Prevent overwriting existing directory
+if [[ -d "$SKILL_DIR" ]]; then
+    echo "[ERROR] Directory already exists: $SKILL_DIR" >&2
+    echo "Remove it first or choose a different name." >&2
+    exit 1
+fi
+
+# ──────────────────────────────────────────────
+# Scaffold
+# ──────────────────────────────────────────────
+
+mkdir -p "${SKILL_DIR}/references" "${SKILL_DIR}/examples" "${SKILL_DIR}/scripts"  # -p also creates the skill directory itself
+
+cat > "${SKILL_DIR}/SKILL.md" <<TEMPLATE  # unquoted delimiter: ${SKILL_NAME} is expanded and \` yields a literal backtick
+---
+name: ${SKILL_NAME}
+description: >-
+  TODO: One-line description of what this skill does and when to use it.
+  Be specific — this text drives trigger matching in recommend_skills.
+---
+
+# ${SKILL_NAME}
+
+## Overview
+
+TODO: Brief explanation of the problem this skill solves.
+
+**Core principle:** TODO: The single most important rule this skill enforces.
+
+**Iron Law:**
+\`\`\`
+TODO: The non-negotiable constraint. One line.
+\`\`\`
+
+## When to Use
+
+- TODO: Specific scenario 1
+- TODO: Specific scenario 2
+- TODO: Specific scenario 3
+
+## When NOT to Use
+
+- TODO: Anti-scenario 1
+- TODO: Anti-scenario 2
+
+## Process
+
+### Phase 1: TODO
+
+1. Step 1
+2. Step 2
+3. Step 3
+
+### Phase 2: TODO
+
+1. Step 1
+2. Step 2
+3. Step 3
+
+## Checklist
+
+- [ ] TODO: Verification item 1
+- [ ] TODO: Verification item 2
+- [ ] TODO: Verification item 3
+TEMPLATE
+
+echo "[OK] Skill scaffolded: ${SKILL_DIR}"  # report everything that was created, then the follow-up steps
+echo "     - ${SKILL_DIR}/SKILL.md"
+echo "     - ${SKILL_DIR}/references/"
+echo "     - ${SKILL_DIR}/examples/"
+echo "     - ${SKILL_DIR}/scripts/"
+echo ""
+echo "Next steps:"
+echo "  1. Edit ${SKILL_DIR}/SKILL.md — fill in the TODOs"
+echo "  2. Add supporting files to ${SKILL_DIR}/references/"
+echo "  3. Test with: recommend_skills(prompt='your test prompt')"
diff --git a/packages/rules/.ai-rules/skills/skill-creator/scripts/run_loop.py b/packages/rules/.ai-rules/skills/skill-creator/scripts/run_loop.py
new file mode 100644
index 00000000..ec78dc76
--- /dev/null
+++ b/packages/rules/.ai-rules/skills/skill-creator/scripts/run_loop.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+"""Run a description optimization loop for skill trigger evaluation.
+
+Loads trigger_eval.json, splits cases into 60/40 train/test sets,
+and iterates to optimize the skill description for better trigger accuracy.
+
+LLM calls are replaced with CLI guidance — the script logs scores and
+prompts the user to manually refine the description between iterations.
+
+Usage:
+    python run_loop.py <trigger-eval-json> --skill-name <name> [--iterations N] [--seed S]
+
+Example:
+    python run_loop.py workspace/trigger_eval.json --skill-name tdd --iterations 5
+
+Requirements:
+    Python 3.8+ (standard library only)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+
+def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
+    """Parse the CLI (argv=None -> sys.argv[1:]); bad input, incl. --iterations < 1, exits 2."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Description optimization loop for skill trigger evaluation. "
+            "Splits trigger_eval.json into train/test sets and guides "
+            "iterative description refinement."
+        ),
+        epilog=(
+            "Example:\n"
+            "  python run_loop.py workspace/trigger_eval.json "
+            "--skill-name tdd --iterations 5\n\n"
+            "The script will guide you through each iteration, prompting\n"
+            "you to run `recommend_skills` manually and enter results."
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "trigger_eval_json", type=str, help="Path to trigger_eval.json file"
+    )
+    parser.add_argument(
+        "--skill-name", required=True, help="Target skill name in kebab-case"
+    )
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=5,
+        help="Number of optimization iterations (default: 5)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for train/test split (default: 42)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Directory to write iteration logs (default: same as trigger_eval.json)",
+    )
+    args = parser.parse_args(argv)
+    if args.iterations < 1:  # zero or negative would silently run nothing and log nothing
+        parser.error("--iterations must be at least 1")
+    return args
+
+
+def _load_trigger_eval(path: Path) -> List[Dict[str, Any]]:
+    """Load trigger_eval.json; raise ValueError unless it is a non-empty array of valid cases."""
+    with open(path, encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, list) or len(data) == 0:
+        raise ValueError(
+            "trigger_eval.json must be a non-empty array of test cases"
+        )
+
+    for i, case in enumerate(data):
+        # Non-object entries are rejected here too: `in` on e.g. an int raises TypeError.
+        fields = case if isinstance(case, dict) else ()
+        if "query" not in fields or "should_trigger" not in fields:
+            raise ValueError(
+                f"Case {i} missing required fields: 'query' and 'should_trigger'"
+            )
+    return data
+
+
+def _split_train_test(
+    cases: List[Dict[str, Any]], seed: int
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Shuffle a copy of *cases* with a seeded RNG and cut it into (train, test) at 60%."""
+    pool = cases[:]
+    random.Random(seed).shuffle(pool)
+    # int() truncates, so max() guarantees the train side at least one case.
+    cut = max(1, int(len(pool) * 0.6))
+    return pool[:cut], pool[cut:]
+
+
+def _compute_metrics(
+    results: List[Dict[str, Any]],
+) -> Dict[str, float]:
+    """Score trigger results against the ground truth.
+
+    Every result supplies two truthy/falsy fields:
+        - should_trigger: the expected outcome
+        - triggered: the outcome the user reported
+
+    Returns precision, recall, F1 and accuracy (each rounded to 4 places,
+    0.0 whenever its denominator is zero) plus the raw tp/fp/fn/tn counts.
+    """
+    tp = fp = fn = tn = 0
+    for outcome in results:
+        if outcome["should_trigger"]:
+            if outcome["triggered"]:
+                tp += 1
+            else:
+                fn += 1
+        elif outcome["triggered"]:
+            fp += 1
+        else:
+            tn += 1
+
+    predicted, actual = tp + fp, tp + fn
+    precision = tp / predicted if predicted else 0.0
+    recall = tp / actual if actual else 0.0
+    harmonic_base = precision + recall
+    f1 = 2 * precision * recall / harmonic_base if harmonic_base > 0 else 0.0
+    accuracy = (tp + tn) / len(results) if results else 0.0
+
+    ratios = {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}
+    report: Dict[str, float] = {k: round(v, 4) for k, v in ratios.items()}
+    report.update(tp=tp, fp=fp, fn=fn, tn=tn)
+    return report
+
+
+def _prompt_user_results(
+    cases: List[Dict[str, Any]], skill_name: str, label: str
+) -> List[Dict[str, Any]]:
+    """Interactively collect, per case, whether *skill_name* was triggered.
+
+    The script cannot call recommend_skills itself, so each query is shown and
+    the user answers y / n / s on stdin. Skipped cases are left out of the
+    returned list; every other case yields a dict with query, should_trigger,
+    triggered and match (triggered == should_trigger).
+    """
+    rule = "=" * 60
+    total = len(cases)
+    question = "           Result? (y=triggered / n=not triggered / s=skip): "
+
+    print(f"\n{rule}")
+    print(f"  Evaluating {label} set ({total} cases)")
+    print(f"  Target skill: {skill_name}")
+    print(rule)
+    print()
+    print("For each query below, run:")
+    print("  recommend_skills(prompt=<query>)")
+    print(f"and check if '{skill_name}' appears in the results.")
+    print()
+
+    collected: List[Dict[str, Any]] = []
+    for number, case in enumerate(cases, start=1):
+        query, expected = case["query"], case["should_trigger"]
+        print(f"  [{number}/{total}] Query: {query}")
+        print(f"           Expected: {'TRIGGER' if expected else 'NO TRIGGER'}")
+
+        # Keep asking until the answer is one of the three accepted letters.
+        answer = input(question).strip().lower()
+        while answer not in ("y", "n", "s"):
+            print("           Invalid input. Enter y, n, or s.")
+            answer = input(question).strip().lower()
+
+        if answer == "s":
+            print("           -> Skipped")
+            continue
+
+        triggered = answer == "y"
+        outcome = {
+            "query": query,
+            "should_trigger": expected,
+            "triggered": triggered,
+            "match": triggered == expected,
+        }
+        collected.append(outcome)
+        print(f"           -> {'MATCH' if outcome['match'] else 'MISMATCH'}")
+    return collected
+
+
+def _print_metrics(metrics: Dict[str, float], label: str) -> None:
+    """Print the four ratio metrics as percentages, then the raw TP/FP/FN/TN counts."""
+    print(f"\n--- {label} Metrics ---")
+    for title, key in (
+        ("Precision:", "precision"),
+        ("Recall:   ", "recall"),
+        ("F1 Score: ", "f1"),
+        ("Accuracy: ", "accuracy"),
+    ):
+        print(f"  {title} {metrics[key]:.2%}")
+    print("  (TP={tp} FP={fp} FN={fn} TN={tn})".format(**metrics))
+
+
+def main(argv: Optional[List[str]] = None) -> int:
+    """Run the interactive train/refine/test loop and write the JSON log; 0 on success, 1 on error."""
+    args = parse_args(argv)
+    trigger_path = Path(args.trigger_eval_json).resolve()
+    if not trigger_path.is_file():
+        print(f"[ERROR] File not found: {trigger_path}", file=sys.stderr)
+        return 1
+
+    output_dir = (Path(args.output_dir) if args.output_dir else trigger_path.parent).resolve()
+    try:  # create the log directory up front rather than failing after the interactive session
+        output_dir.mkdir(parents=True, exist_ok=True)
+    except OSError as exc:
+        print(f"[ERROR] Cannot create output directory {output_dir}: {exc}", file=sys.stderr)
+        return 1
+
+    try:
+        cases = _load_trigger_eval(trigger_path)
+    except (json.JSONDecodeError, ValueError) as exc:
+        print(f"[ERROR] Failed to load trigger_eval.json: {exc}", file=sys.stderr)
+        return 1
+
+    train_set, test_set = _split_train_test(cases, args.seed)
+    print(f"Loaded {len(cases)} cases: {len(train_set)} train / {len(test_set)} test")
+    print(f"Skill: {args.skill_name}")
+    print(f"Iterations: {args.iterations}")
+    print(f"Seed: {args.seed}")
+
+    iteration_log: List[Dict[str, Any]] = []
+    for iteration in range(1, args.iterations + 1):
+        tag = f"Iteration {iteration}"
+        print(f"\n{'#' * 60}")
+        print(f"  ITERATION {iteration}/{args.iterations}")
+        print(f"{'#' * 60}")
+
+        # Step 1: Evaluate on train set
+        print("\n[Step 1] Evaluate current description on TRAIN set")
+        train_results = _prompt_user_results(train_set, args.skill_name, f"{tag} TRAIN")
+        train_metrics = _compute_metrics(train_results)
+        _print_metrics(train_metrics, f"{tag} TRAIN")
+
+        # Step 2: Guide description refinement
+        print("\n[Step 2] Refine the skill description")
+        print("  Based on the train results above, update the skill's")
+        print("  'description' field in SKILL.md frontmatter to improve")
+        print("  trigger accuracy.")
+        print()
+        print("  Mismatched cases to focus on:")
+        mismatches = [r for r in train_results if not r.get("match", True)]
+        if mismatches:
+            for m in mismatches:
+                direction = "should trigger but didn't" if m["should_trigger"] else "triggered but shouldn't"
+                print(f"    - \"{m['query']}\" ({direction})")
+        else:
+            print("    (none — perfect score on train set)")
+        print()
+        input("  Press Enter when description has been updated...")
+
+        # Step 3: Evaluate on test set
+        print("\n[Step 3] Evaluate updated description on TEST set")
+        test_results = _prompt_user_results(test_set, args.skill_name, f"{tag} TEST")
+        test_metrics = _compute_metrics(test_results)
+        _print_metrics(test_metrics, f"{tag} TEST")
+
+        # Log iteration (total_cases counts answered cases only; skipped ones are excluded)
+        entry = {
+            "iteration": iteration,
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "train": {
+                "metrics": train_metrics,
+                "total_cases": len(train_results),
+            },
+            "test": {
+                "metrics": test_metrics,
+                "total_cases": len(test_results),
+            },
+        }
+        iteration_log.append(entry)
+
+        # Check convergence
+        if test_metrics["f1"] >= 1.0:
+            print("\n[INFO] Perfect F1 on test set. Stopping early.")
+            break
+
+    # Write iteration log
+    log_path = output_dir / f"optimization_log_{args.skill_name}.json"
+    with open(log_path, "w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "skill_name": args.skill_name,
+                "seed": args.seed,
+                "train_size": len(train_set),
+                "test_size": len(test_set),
+                "iterations": iteration_log,
+            },
+            f,
+            indent=2,
+            ensure_ascii=False,
+        )
+    print(f"\n[OK] Optimization log written: {log_path}")
+
+    # Final summary
+    print(f"\n{'=' * 60}")
+    print("  OPTIMIZATION SUMMARY")
+    print(f"{'=' * 60}")
+    print(f"\n  {'Iter':>4}  {'Train F1':>10}  {'Test F1':>10}  {'Test Acc':>10}")
+    print(f"  {'----':>4}  {'--------':>10}  {'-------':>10}  {'--------':>10}")
+    for entry in iteration_log:
+        it = entry["iteration"]
+        tf1 = entry["train"]["metrics"]["f1"]
+        sf1 = entry["test"]["metrics"]["f1"]
+        sacc = entry["test"]["metrics"]["accuracy"]
+        print(f"  {it:>4}  {tf1:>10.2%}  {sf1:>10.2%}  {sacc:>10.2%}")
+
+    return 0
+
+
+if __name__ == "__main__":  # script entry point
+    sys.exit(main())  # main()'s return value becomes the process exit status