24 changes: 15 additions & 9 deletions apps/memos-local-plugin/core/llm/prompts/skill-crystallize.ts
@@ -8,24 +8,27 @@ import type { PromptDef } from "./index.js";
* it into a callable "Skill" with a stable name, parameter schema, and a
* small SKILL.md authored from the evidence.
*
* **v2** (this version) extends the schema with `decision_guidance`:
* preference + anti-pattern lists distilled from past failures + fixes
* (V7 §2.4.6). The crystallizer now sees the policy's `@repair` block
* (parsed by the orchestrator from `policy.boundary`) plus high-V vs
* low-V evidence traces, so the LLM can write concrete "prefer X /
* avoid Y" lines that ship with the skill. Bumping the version captures
* that shape change so the LLM-mock op tags refresh too.
* **v3** adds an explicit `tools` output field: the LLM must declare which
* tools/commands the skill invokes, constrained to the `EVIDENCE_TOOLS`
* whitelist extracted from evidence trace `toolCalls`. This replaces the
* old regex-based command-token heuristic in the verifier — coverage is now
* a clean set-containment check (`draft.tools ⊆ evidenceTools`).
*
* v2 history: added `decision_guidance` (preference + anti-pattern).
*/
export const SKILL_CRYSTALLIZE_PROMPT: PromptDef = {
id: "skill.crystallize",
version: 2,
version: 3,
description:
"Turn a graduated L2 policy into a callable Skill definition, including decision guidance distilled from past prefer/avoid signals.",
system: `You crystallize a skill an agent should be able to call.

Input:
- POLICY: the L2 policy being promoted (trigger / action / rationale / caveats).
- EVIDENCE: 3..10 successful traces that support the policy.
- EVIDENCE_TOOLS: the exhaustive list of tool/command names that actually
appeared in the evidence traces' tool calls. This is the ground-truth
whitelist — your \`tools\` output MUST be a subset of this list.
- COUNTER_EXAMPLES (optional): traces with V < 0 from the same context —
failures the policy is meant to prevent.
- REPAIR_HINTS (optional): a JSON block { preference: [...], antiPattern: [...] }
@@ -50,6 +53,7 @@ Return JSON:
"examples": [
{ "input": "...", "expected": "..." }
],
"tools": ["tool_or_command_name", ...],
"decision_guidance": {
"preference": ["Prefer: …", ...], // concrete actions to favour, ≤ 5
"anti_pattern": ["Avoid: …", ...] // concrete actions to avoid, ≤ 5
@@ -58,7 +62,9 @@ Return JSON:
}

Rules:
- Only reference tools/APIs that appear in EVIDENCE.
- \`tools\` MUST only contain names from EVIDENCE_TOOLS. Never invent tool
names that are not in the whitelist. Include every tool the skill's
procedure actually invokes — omit tools not referenced in your steps.
- Keep "steps" short (2-6 items).
- \`summary\` must be self-contained so the agent can decide whether to
call this skill without reading the full SKILL.md.
32 changes: 18 additions & 14 deletions apps/memos-local-plugin/core/skill/ALGORITHMS.md
@@ -111,22 +111,26 @@ short-circuit with `skill.failed { reason: "llm_disabled" }`.
`verifyDraft` (see [`verifier.ts`](./verifier.ts)) runs two
deterministic checks on the draft:

### Consistency coverage
### Tool coverage

```
commandLike = tokens that look like shell commands / module paths
(e.g. "apk add", "docker.build", "rg", "pip install")
coverage = |commandLike ∩ actionBlob| / |commandLike|
evidenceTools = extractToolNames(evidence) // from trace.toolCalls
draftTools = draft.tools // declared by LLM
coverage = |draftTools ∩ evidenceTools| / |draftTools|
```

* `actionBlob` is the concatenation of every trace's
`userText + agentText + reflection`, lowercased.
* `commandLike` is extracted from the draft's `steps` and `examples`
via a regex that catches backticked tokens and `name.method`-style
identifiers of length ≥ 3.
* Stopwords (`the`, `with`, `steps`, …) are filtered out.
* `evidenceTools` is built from the structured `toolCalls` field on
each evidence trace — `tc.name` (tool-level, e.g. "shell",
"pip.install") plus the first token of `tc.input` when it's a string
(command-level, e.g. "apk" from "apk add openssl-dev"). See
[`tool-names.ts`](./tool-names.ts).
* `draftTools` is the `tools: string[]` array the LLM outputs during
crystallization (prompt v3). The LLM is constrained to pick from an
`EVIDENCE_TOOLS` whitelist injected into the prompt payload.
* No regex heuristics, no stopwords — tool identity comes from
  ground-truth structured data, not from guessing in natural-language text.

Verdict: `ok = coverage ≥ 0.5 || commandLike.length === 0`.
Verdict: `ok = coverage ≥ 0.5 || draftTools.length === 0`.
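
The coverage verdict above reduces to a small pure function. A minimal sketch of the idea (the `toolCoverage` name and signature are illustrative, not the actual `verifier.ts` API):

```typescript
// Sketch of the tool-coverage check: what fraction of the LLM-declared
// tools actually appeared in the evidence traces' structured toolCalls?
function toolCoverage(
  draftTools: string[],
  evidenceTools: Set<string>,
): { coverage: number; ok: boolean } {
  // Comparison is case-insensitive, matching the lowercased extraction.
  const tools = draftTools.map((t) => t.toLowerCase());
  const matched = tools.filter((t) => evidenceTools.has(t));
  // An empty declaration is vacuously covered (coverage = 1).
  const coverage = tools.length === 0 ? 1 : matched.length / tools.length;
  return { coverage, ok: coverage >= 0.5 || tools.length === 0 };
}
```

For example, a draft declaring `["shell", "sqlite3"]` against evidence tools `{"shell", "apk"}` scores coverage 0.5 and passes at the default threshold, while a draft made entirely of invented names fails.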

### Evidence resonance

@@ -137,9 +141,9 @@ resonance = |{ trace : |tokens(trace) ∩ draftTokens| ≥ 2 }| / |evidence|

Verdict: `ok = resonance ≥ minResonance` (default `0.5`).
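
The resonance rule can be sketched the same way. This simplified version assumes a bare ASCII tokenizer; the `tokensOf` helper here is illustrative and omits the stopword filtering and CJK bigrams the real tokenizer adds:

```typescript
// Simplified tokenizer: identifier-ish ASCII runs of length >= 4.
function tokensOf(s: string): Set<string> {
  return new Set(s.toLowerCase().match(/[a-z0-9_][a-z0-9_./-]{3,}/g) ?? []);
}

// Fraction of evidence texts sharing at least 2 tokens with the draft.
function resonance(draftText: string, evidenceTexts: string[]): number {
  const draftTokens = tokensOf(draftText);
  const hits = evidenceTexts.filter((t) => {
    let shared = 0;
    for (const tok of tokensOf(t)) if (draftTokens.has(tok)) shared++;
    return shared >= 2; // a trace "resonates" on >= 2 shared tokens
  });
  return evidenceTexts.length === 0 ? 0 : hits.length / evidenceTexts.length;
}
```

A draft whose summary names the same commands as half the evidence traces lands exactly at the default `minResonance` of 0.5.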

Both checks are cheap string matching by design — this is not a
security gate, it is a first-line defence against hallucinated tool
names. A skill can still be wrong about intent; the trial cycle is
Both checks are cheap and deterministic — no LLM calls. Tool coverage
catches hallucinated tool/command names; resonance catches narrative
drift. A skill can still be wrong about intent; the trial cycle is
what catches that.

A rejected draft emits `skill.verification.failed` and the policy
7 changes: 7 additions & 0 deletions apps/memos-local-plugin/core/skill/crystallize.ts
@@ -17,6 +17,7 @@ import {
import { SKILL_CRYSTALLIZE_PROMPT } from "../llm/prompts/skill-crystallize.js";
import type { Logger } from "../logger/types.js";
import type { PolicyRow, SkillRow, TraceRow } from "../types.js";
import { extractToolNames } from "./tool-names.js";
import type {
SkillConfig,
SkillCrystallizationDraft,
@@ -158,9 +159,12 @@ function packPrompt(input: CrystallizeInput, config: SkillConfig): string {
tags: t.tags,
}));

const evidenceTools = Array.from(extractToolNames(input.evidence));

const payload: Record<string, unknown> = {
policy,
evidence,
evidence_tools: evidenceTools,
naming_space: input.namingSpace,
};
if (counterExamples.length > 0) payload.counter_examples = counterExamples;
@@ -202,6 +206,8 @@ function normaliseDraft(
// to keep the skill body skimmable and the prompt budget bounded.
const decisionGuidance = coerceDecisionGuidance(raw.decision_guidance ?? raw.decisionGuidance);

const tools = dedupeLc(asStringArray(raw.tools));

return {
name,
displayTitle,
@@ -212,6 +218,7 @@
examples,
tags,
decisionGuidance,
tools,
};
}

1 change: 1 addition & 0 deletions apps/memos-local-plugin/core/skill/index.ts
@@ -42,6 +42,7 @@ export {
} from "./skill.js";
export { attachSkillSubscriber, type SkillSubscriberDeps, type SkillSubscriberHandle } from "./subscriber.js";
export { createSkillEventBus } from "./events.js";
export { extractToolNames } from "./tool-names.js";
export {
verifyDraft,
type VerifyDeps,
10 changes: 6 additions & 4 deletions apps/memos-local-plugin/core/skill/packager.ts
@@ -133,12 +133,9 @@ function buildProcedure(draft: SkillCrystallizationDraft): SkillProcedure {
preconditions: draft.preconditions,
steps: draft.steps,
examples: draft.examples,
// V7 §2.4.6 — `decisionGuidance` now flows from the LLM draft
// (which folded in the policy's `@repair` block + V-contrast
// signals). Older code path used to hard-code `[] / []` here,
// dropping every prefer / avoid line on the floor.
decisionGuidance: draft.decisionGuidance ?? { preference: [], antiPattern: [] },
tags: draft.tags,
tools: draft.tools ?? [],
};
}

@@ -184,6 +181,11 @@ function renderInvocationGuide(
}
lines.push("");
}
if (draft.tools && draft.tools.length > 0) {
lines.push(`**Tools used**`);
for (const t of draft.tools) lines.push(`- \`${t}\``);
lines.push("");
}
const dg = draft.decisionGuidance;
if (dg && (dg.preference.length > 0 || dg.antiPattern.length > 0)) {
lines.push(`**Decision guidance**`);
30 changes: 30 additions & 0 deletions apps/memos-local-plugin/core/skill/tool-names.ts
@@ -0,0 +1,30 @@
/**
* Extract the set of tool / command names actually invoked in a batch of
* traces, using the structured `ToolCallDTO` data rather than regex
* heuristics on natural-language text.
*
* Two levels of extraction:
* 1. `tc.name` — the tool-level identifier (e.g. "shell", "pip.install").
* 2. First token of `tc.input` when input is a string — the command-level
* identifier for shell-like tools (e.g. "apk" from "apk add openssl-dev").
*/

import type { TraceRow } from "../types.js";

const IGNORED_NAMES = new Set(["unknown", "unknown_tool"]);

export function extractToolNames(traces: readonly TraceRow[]): Set<string> {
const out = new Set<string>();
for (const t of traces) {
for (const tc of t.toolCalls) {
const name = tc.name?.trim().toLowerCase();
if (name && !IGNORED_NAMES.has(name)) out.add(name);

if (typeof tc.input === "string") {
const first = tc.input.trim().split(/\s+/)[0]?.toLowerCase();
if (first && first.length >= 2) out.add(first);
}
}
}
return out;
}
4 changes: 4 additions & 0 deletions apps/memos-local-plugin/core/skill/types.ts
Expand Up @@ -54,6 +54,8 @@ export interface SkillProcedure {
examples: SkillExampleDraft[];
decisionGuidance: { preference: string[]; antiPattern: string[] };
tags: string[];
/** Tool names this skill references (from evidence toolCalls). */
tools: string[];
}

/**
@@ -76,6 +78,8 @@ export interface SkillCrystallizationDraft {
examples: SkillExampleDraft[];
tags: string[];
decisionGuidance: { preference: string[]; antiPattern: string[] };
/** Tool names this skill references. Must be a subset of evidence tool names. */
tools: string[];
}

/**
68 changes: 23 additions & 45 deletions apps/memos-local-plugin/core/skill/verifier.ts
@@ -2,16 +2,16 @@
* V7 §2.5.3 — Consistency + integration verification for a freshly minted
* skill.
*
* Two checks, both heuristic and deterministic — no LLM calls. The goal is
* to catch obvious drafts that should never surface in Tier-1 retrieval
* (e.g. the LLM invented a tool name not present in any evidence, or the
* steps don't cover the originating sub-problem).
* Two checks, both deterministic — no LLM calls:
*
* 1. **Tool coverage**: every tool name declared in `draft.tools` must
* appear in the evidence traces' structured `toolCalls`. Coverage is a
* simple set-containment check — `draft.tools ⊆ evidenceTools`. This
* catches the most common LLM hallucination: inventing tool/command
* names that never appeared in any evidence trace.
*
* 1. **Consistency coverage**: every non-empty step title / body token that
* looks like a tool / command identifier (`rg`, `git_diff`, `docker.ps`)
* must appear in at least one evidence trace's action text.
* 2. **Evidence resonance**: at least `minResonance` fraction of the
* evidence traces should share ≥ one token with the skill's summary or
* evidence traces should share ≥ 2 tokens with the skill's summary or
* steps. Prevents a skill whose narrative contradicts the examples.
*
* The check returns a verdict; the caller (orchestrator) decides whether to
@@ -22,6 +22,7 @@
import type { Logger } from "../logger/types.js";
import type { TraceRow } from "../types.js";
import type { SkillCrystallizationDraft } from "./types.js";
import { extractToolNames } from "./tool-names.js";

export interface VerifyInput {
draft: SkillCrystallizationDraft;
@@ -59,23 +60,22 @@ export function verifyDraft(
};
}

const actionBlob = evidence
.flatMap((t) => [t.agentText, t.userText, t.reflection ?? ""])
.join("\n")
.toLowerCase();

const commandLike = collectCommandTokens(draft);
// --- Tool coverage (structured set comparison) ---
const evidenceTools = extractToolNames(evidence);
const draftTools = (draft.tools ?? []).map((t) => t.toLowerCase());
const matched: string[] = [];
const unmapped: string[] = [];
for (const tok of commandLike) {
if (actionBlob.includes(tok)) matched.push(tok);
for (const tok of draftTools) {
if (evidenceTools.has(tok)) matched.push(tok);
else unmapped.push(tok);
}
const coverage = commandLike.length === 0 ? 1 : matched.length / commandLike.length;
const coverage =
draftTools.length === 0 ? 1 : matched.length / draftTools.length;

// --- Evidence resonance (unchanged) ---
const resonance = computeResonance(draft, evidence);

if (coverage < 0.5 && commandLike.length > 0) {
if (coverage < 0.5 && draftTools.length > 0) {
deps.log.warn("skill.verify.fail", { reason: "coverage-low", coverage });
return {
ok: false,
@@ -100,25 +100,9 @@
return { ok: true, coverage, resonance, unmappedTokens: unmapped };
}

function collectCommandTokens(draft: SkillCrystallizationDraft): string[] {
const fields = [
...draft.steps.flatMap((s) => [s.title, s.body]),
...draft.examples.flatMap((e) => [e.input, e.expected]),
].join(" ");
const matches = fields.match(/`([^`]+)`|([a-z][a-z0-9_]{1,}\.[a-z][a-z0-9_]+|[a-z_]{3,}\b)/gi) ?? [];
const out: string[] = [];
const seen = new Set<string>();
for (const raw of matches) {
const tok = raw.replace(/`/g, "").toLowerCase().trim();
if (tok.length < 3) continue;
if (STOPWORDS.has(tok)) continue;
if (!seen.has(tok)) {
seen.add(tok);
out.push(tok);
}
}
return out;
}
// ---------------------------------------------------------------------------
// Resonance
// ---------------------------------------------------------------------------

function computeResonance(
draft: SkillCrystallizationDraft,
@@ -145,18 +129,12 @@

function tokensOf(s: string): Set<string> {
const out = new Set<string>();
// ASCII identifier-ish tokens (length ≥ 4 incl. leading char).
const asciiMatches = s.match(/[a-z0-9_][a-z0-9_./-]{3,}/g) ?? [];
for (const m of asciiMatches) {
const tok = m.toLowerCase();
if (STOPWORDS.has(tok)) continue;
if (RESONANCE_STOPWORDS.has(tok)) continue;
out.add(tok);
}
// CJK support: Chinese / Japanese / Korean text isn't whitespace-separated,
// so the ASCII-only tokenizer above produces an empty set on Chinese
// evidence and the resonance check rejects every skill (the famous
// "resonance=0.00<0.5" failure). Add character bigrams from any CJK runs
// to give the verifier a reasonable signal in non-Latin contexts.
const cjkRuns = s.match(/[\u4e00-\u9fff\u3040-\u30ff\u3400-\u4dbf]{2,}/g) ?? [];
for (const run of cjkRuns) {
for (let i = 0; i + 1 < run.length; i++) {
Expand All @@ -166,7 +144,7 @@ function tokensOf(s: string): Set<string> {
return out;
}

const STOPWORDS = new Set([
const RESONANCE_STOPWORDS = new Set([
"the", "and", "for", "with", "that", "this", "from", "will", "then",
"into", "when", "what", "where", "your", "user", "agent", "null", "true",
"false", "none", "let", "new", "old", "use", "used", "have", "has", "its",