Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion code/src/composition/facades/mcp-server-facades.ts
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,16 @@ export class GetContextFacadeAdapter implements GetContextFacade {
*/
export class RecallMemoryFacadeAdapter implements RecallMemoryFacade {
private static readonly DEFAULT_TOP_K = 8;
private static readonly DEFAULT_MAX_TOKENS = 4000;
/**
* Default token budget for `mem.recall` when the caller omits
* `max_tokens`. Aligned with `GetContextFacadeAdapter` (8000) for
* consistency: a recall request typically returns full ranked entries
* with previews, so a tighter budget than the bundle would force the
* use case to drop hits that the user expected. See B-MCP-8 (issue
* #31) — the previous 4000 default left zero hits on the dogfood DB
* for queries whose top-ranked entry exceeded the budget alone.
*/
private static readonly DEFAULT_MAX_TOKENS = 8000;

public constructor(
private readonly useCase: RecallMemory,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,14 +357,30 @@ export class RecallMemoryUseCase implements RecallMemory {

const limited = ranked.slice(0, input.filters.limit);

// Token budget application (B-MCP-8 fix):
//
// 1. The top-ranked hit is ALWAYS included even if it solo exceeds
// `maxTokens`. Returning zero hits when there are candidates
// surprises callers ("recall said total_candidates=2 but hits=0")
// and degrades the semantic-recall promise; a slightly oversized
// single result is strictly more useful than no result.
// 2. Subsequent hits use `continue` (not `break`) so a mid-loop
// oversized hit doesn't suppress smaller hits later in the
// ranking. Candidates are ordered by relevance, not size, so a
// big hit at rank 3 should not hide a fitting hit at rank 4.
const out: RankedEntry[] = [];
let runningTokens = 0;
const max = input.maxTokens.maxTokens;
for (const candidate of limited) {
const tokens = this.tokenCounter
.count(this.renderTokenInput(candidate.entry))
.toNumber();
if (runningTokens + tokens > max) break;
if (out.length === 0) {
out.push(candidate.entry);
runningTokens += tokens;
continue;
}
if (runningTokens + tokens > max) continue;
runningTokens += tokens;
out.push(candidate.entry);
}
Expand Down
58 changes: 58 additions & 0 deletions code/tests/integration/D-mem-recall.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -259,4 +259,62 @@ describe("integration / D / mem.recall — hybrid retrieval", () => {
expect(r.score).toBeGreaterThanOrEqual(median);
}
});

// B-MCP-8 (issue #31): the recall pipeline used to return
// `total_candidates>0` but `hits=0` whenever the top-ranked entry
// alone exceeded `max_tokens`. The dogfood DB tripped this because
// learning rows store the full content un-truncated (decisions and
// entities truncate at 600 chars; learnings and turns do not). The
// fix in `RecallMemoryUseCase.rankAndSlice` always includes the top
// hit and uses `continue` (not `break`) on overflow.
it("returns the top hit even when its preview alone exceeds maxTokens (B-MCP-8)", async () => {
  // Regression coverage for B-MCP-8 (issue #31): the pipeline used to
  // report total_candidates > 0 with zero hits whenever the #1-ranked
  // entry alone blew the token budget. The dogfood DB tripped this
  // because learning rows store their content un-truncated.
  //
  // Seed a learning that stays under the 2000-char LearningText cap
  // yet dwarfs a deliberately tiny budget. With cl100k_base roughly
  // every 3-4 chars is one token, so ~1800 chars is 500+ tokens; a
  // max_tokens of 50 therefore guarantees the top hit overflows solo.
  const oversizedText = `${"GitFlow".padEnd(200, " strict ")}\n`.repeat(9);
  expect(oversizedText.length).toBeGreaterThan(1500);
  expect(oversizedText.length).toBeLessThanOrEqual(2000);
  await ctx.memory.recordLearning.record({
    workspaceId: ctx.workspaceId,
    text: oversizedText,
    severity: LearningSeverity.warning(),
    tags: Tags.create(["gitflow"]),
    scope: Scope.project(),
  });

  const response = await ctx.mcpServer.useCases.recall.recall({
    workspace_id: ctx.workspaceId.toString(),
    query: "GitFlow",
    top_k: 5,
    max_tokens: 50,
  });

  // Pre-fix behaviour was exactly what the issue describes:
  // total_candidates >= 1 but hits.length == 0. The fix guarantees
  // the #1-ranked hit always comes back.
  expect(response.total_candidates).toBeGreaterThanOrEqual(1);
  expect(response.results.length).toBeGreaterThanOrEqual(1);
  // Guard against a coincidental match on an unrelated entry: the
  // surfaced preview must actually contain the query token.
  const topPreview = response.results[0]?.content.toLowerCase() ?? "";
  expect(topPreview).toContain("gitflow");
});

it("default max_tokens (8000) lets multiple hits surface for a literal query (B-MCP-8)", async () => {
  // Omitting max_tokens exercises the wire-facade default, which the
  // B-MCP-8 fix raised from 4000 to 8000 (the old value silently
  // capped results on the dogfood DB). The seeded corpus carries two
  // `hexagonal` matches; the caller must see the full set, not just
  // the top one.
  const response = await ctx.mcpServer.useCases.recall.recall({
    workspace_id: ctx.workspaceId.toString(),
    query: "hexagonal",
    top_k: 5,
    // max_tokens deliberately omitted to exercise the default.
  });

  expect(response.results.length).toBeGreaterThanOrEqual(2);
  expect(response.total_candidates).toBeGreaterThanOrEqual(
    response.results.length,
  );
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ describe("RecallMemoryUseCase.recall", () => {
weights: RelevanceWeights.defaults(),
});

expect(result.getEntries().length).toBeLessThanOrEqual(2);
expect(result.getEntries().length).toBe(2);
expect(result.totalCandidates).toBe(5);
});

Expand Down Expand Up @@ -414,10 +414,91 @@ describe("RecallMemoryUseCase.recall", () => {
weights: RelevanceWeights.defaults(),
});

// Token counter is len/4 → ~200 tokens per entry; with 250 budget we
// should fit at most 1.
expect(result.getEntries().length).toBeLessThanOrEqual(1);
expect(result.totalTokens.toNumber()).toBeLessThanOrEqual(250);
// Each entry is ~200 tokens (title 400 + "\n" + preview 400 = 801
// chars / 4). Budget is 250. Top-ranked hit is always included
// (B-MCP-8 guarantee), and the next would put cumulative over 250
// (200 + 200 = 400 > 250) so the rest are skipped via `continue`.
expect(result.getEntries().length).toBe(1);
expect(result.getEntries()[0]?.id).toBe(
"01952f3b-7d8c-7000-8000-000000000001",
);
});

it("always includes the top-ranked hit even when it solo exceeds maxTokens (B-MCP-8)", async () => {
  // Mirrors the dogfood failure observed on `@netzi/recall@0.1.2-beta.4`:
  // a query found total_candidates > 0 yet returned zero hits because
  // the #1-ranked candidate's preview alone outweighed the default
  // token budget. The stub counter here prices len/4, so rendering
  // the 4000-char strings below as title + "\n" + preview costs about
  // 2000 tokens — double the 1000-token budget used in this test.
  const oversized = "Z".repeat(4000);
  const small = "ok";
  projections.projections = [
    projection({
      kind: "decision",
      id: ID_A,
      title: oversized,
      preview: oversized,
    }),
    projection({ kind: "decision", id: ID_B, title: small, preview: small }),
  ];
  lexical.hits = [
    { kind: "decision", id: ID_A, score: BM25Score.of(2.0) },
    { kind: "decision", id: ID_B, score: BM25Score.of(0.5) },
  ];

  const result = await useCase.recall({
    workspaceId: makeWorkspaceId(),
    query: makeQuery(),
    filters: makeFilters(5),
    maxTokens: TokenBudget.withMax(1000),
    weights: RelevanceWeights.defaults(),
  });

  expect(result.totalCandidates).toBe(2);
  const surfaced = result.getEntries();
  // The top-ranked hit must surface even though it alone busts the
  // budget — zero hits with candidates present was precisely the
  // B-MCP-8 regression. No further hit is expected here: the
  // oversized top hit already drove runningTokens past the budget.
  // The continue-vs-break case (top hit fits, a mid-ranked hit is the
  // one that overflows) is covered by the "skips a mid-ranking
  // oversized hit ..." test.
  expect(surfaced.length).toBe(1);
  expect(surfaced[0]?.id).toBe(ID_A);
});

it("skips a mid-ranking oversized hit and still includes smaller ones behind it (B-MCP-8)", async () => {
  // Three candidates ranked by BM25:
  //   1. ID_A — small (fits)
  //   2. ID_B — huge (does NOT fit once ID_A is admitted)
  //   3. ID_C — small (would still fit if we don't `break` on ID_B)
  // The pre-fix `break` semantics returned only [ID_A]; the fix uses
  // `continue` so ID_C surfaces too.
  const ID_C = "01952f3b-7d8c-7000-8000-aaaaaaaaaa03";
  // Rendered as title + "\n" + preview this entry is 16001 chars,
  // which the len/4 stub counter prices at ~4000 tokens — far over
  // the 500-token budget below. (NOTE: an earlier comment claimed
  // "8001 chars → ~2001 tokens", which only counted one of the two
  // fields; either way the entry overflows the budget on its own.)
  const huge = "Z".repeat(8000);
  const tiny = "ok";
  projections.projections = [
    projection({ kind: "decision", id: ID_A, title: tiny, preview: tiny }),
    projection({ kind: "decision", id: ID_B, title: huge, preview: huge }),
    projection({ kind: "decision", id: ID_C, title: tiny, preview: tiny }),
  ];
  lexical.hits = [
    { kind: "decision", id: ID_A, score: BM25Score.of(3.0) },
    { kind: "decision", id: ID_B, score: BM25Score.of(2.0) },
    { kind: "decision", id: ID_C, score: BM25Score.of(1.0) },
  ];

  const result = await useCase.recall({
    workspaceId: makeWorkspaceId(),
    query: makeQuery(),
    filters: makeFilters(5),
    maxTokens: TokenBudget.withMax(500),
    weights: RelevanceWeights.defaults(),
  });

  const entries = result.getEntries();
  const ids = entries.map((e) => e.id);
  expect(ids).toContain(ID_A);
  expect(ids).toContain(ID_C);
  expect(ids).not.toContain(ID_B);
  expect(entries.length).toBe(2);
});

it("calls bumpUsage after a successful recall", async () => {
Expand Down
Loading