Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion code/src/composition/facades/mcp-server-facades.ts
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,16 @@ export class GetContextFacadeAdapter implements GetContextFacade {
*/
export class RecallMemoryFacadeAdapter implements RecallMemoryFacade {
private static readonly DEFAULT_TOP_K = 8;
private static readonly DEFAULT_MAX_TOKENS = 4000;
/**
* Default token budget for `mem.recall` when the caller omits
* `max_tokens`. Aligned with `GetContextFacadeAdapter` (8000) for
* consistency: a recall request typically returns full ranked entries
* with previews, so a tighter budget than the bundle would force the
* use case to drop hits that the user expected. See B-MCP-8 (issue
* #31) — the previous 4000 default left zero hits on the dogfood DB
* for queries whose top-ranked entry exceeded the budget alone.
*/
private static readonly DEFAULT_MAX_TOKENS = 8000;

public constructor(
private readonly useCase: RecallMemory,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,14 +357,30 @@ export class RecallMemoryUseCase implements RecallMemory {

const limited = ranked.slice(0, input.filters.limit);

// Token budget application (B-MCP-8 fix):
//
// 1. The top-ranked hit is ALWAYS included even if it solo exceeds
// `maxTokens`. Returning zero hits when there are candidates
// surprises callers ("recall said total_candidates=2 but hits=0")
// and degrades the semantic-recall promise; a slightly oversized
// single result is strictly more useful than no result.
// 2. Subsequent hits use `continue` (not `break`) so a mid-loop
// oversized hit doesn't suppress smaller hits later in the
// ranking. Candidates are ordered by relevance, not size, so a
// big hit at rank 3 should not hide a fitting hit at rank 4.
const out: RankedEntry[] = [];
let runningTokens = 0;
const max = input.maxTokens.maxTokens;
for (const candidate of limited) {
const tokens = this.tokenCounter
.count(this.renderTokenInput(candidate.entry))
.toNumber();
if (runningTokens + tokens > max) break;
if (out.length === 0) {
out.push(candidate.entry);
runningTokens += tokens;
continue;
}
if (runningTokens + tokens > max) continue;
runningTokens += tokens;
out.push(candidate.entry);
}
Expand Down
58 changes: 58 additions & 0 deletions code/tests/integration/D-mem-recall.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -259,4 +259,62 @@ describe("integration / D / mem.recall — hybrid retrieval", () => {
expect(r.score).toBeGreaterThanOrEqual(median);
}
});

// B-MCP-8 (issue #31): the recall pipeline used to return
// `total_candidates>0` but `hits=0` whenever the top-ranked entry
// alone exceeded `max_tokens`. The dogfood DB tripped this because
// learning rows store the full content un-truncated (decisions and
// entities truncate at 600 chars; learnings and turns do not). The
// fix in `RecallMemoryUseCase.rankAndSlice` always includes the top
// hit and uses `continue` (not `break`) on overflow.
it("returns the top hit even when its preview alone exceeds maxTokens (B-MCP-8)", async () => {
  // Regression coverage for B-MCP-8 (issue #31): the pipeline used to
  // report total_candidates > 0 with zero hits whenever the #1-ranked
  // entry alone blew the token budget. The dogfood DB tripped this
  // because learning rows store their content un-truncated.
  //
  // Seed a learning that stays under the 2000-char LearningText cap
  // yet dwarfs a deliberately tiny budget. With cl100k_base roughly
  // every 3-4 chars is one token, so ~1800 chars is 500+ tokens; a
  // max_tokens of 50 therefore guarantees the top hit overflows solo.
  const oversizedText = `${"GitFlow".padEnd(200, " strict ")}\n`.repeat(9);
  expect(oversizedText.length).toBeGreaterThan(1500);
  expect(oversizedText.length).toBeLessThanOrEqual(2000);
  await ctx.memory.recordLearning.record({
    workspaceId: ctx.workspaceId,
    text: oversizedText,
    severity: LearningSeverity.warning(),
    tags: Tags.create(["gitflow"]),
    scope: Scope.project(),
  });

  const response = await ctx.mcpServer.useCases.recall.recall({
    workspace_id: ctx.workspaceId.toString(),
    query: "GitFlow",
    top_k: 5,
    max_tokens: 50,
  });

  // Pre-fix behaviour was exactly what the issue describes:
  // total_candidates >= 1 but hits.length == 0. The fix guarantees
  // the #1-ranked hit always comes back.
  expect(response.total_candidates).toBeGreaterThanOrEqual(1);
  expect(response.results.length).toBeGreaterThanOrEqual(1);
  // Guard against a coincidental match on an unrelated entry: the
  // surfaced preview must actually contain the query token.
  const topPreview = response.results[0]?.content.toLowerCase() ?? "";
  expect(topPreview).toContain("gitflow");
});

it("default max_tokens (8000) lets multiple hits surface for a literal query (B-MCP-8)", async () => {
  // Omitting max_tokens exercises the wire-facade default, which the
  // B-MCP-8 fix raised from 4000 to 8000 (the old value silently
  // capped results on the dogfood DB). The seeded corpus carries two
  // `hexagonal` matches; the caller must see the full set, not just
  // the top one.
  const response = await ctx.mcpServer.useCases.recall.recall({
    workspace_id: ctx.workspaceId.toString(),
    query: "hexagonal",
    top_k: 5,
    // max_tokens deliberately omitted to exercise the default.
  });

  expect(response.results.length).toBeGreaterThanOrEqual(2);
  expect(response.total_candidates).toBeGreaterThanOrEqual(
    response.results.length,
  );
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ describe("RecallMemoryUseCase.recall", () => {
weights: RelevanceWeights.defaults(),
});

expect(result.getEntries().length).toBeLessThanOrEqual(2);
expect(result.getEntries().length).toBe(2);
expect(result.totalCandidates).toBe(5);
});

Expand Down Expand Up @@ -414,10 +414,91 @@ describe("RecallMemoryUseCase.recall", () => {
weights: RelevanceWeights.defaults(),
});

// Token counter is len/4 → ~200 tokens per entry; with 250 budget we
// should fit at most 1.
expect(result.getEntries().length).toBeLessThanOrEqual(1);
expect(result.totalTokens.toNumber()).toBeLessThanOrEqual(250);
// Each entry is ~200 tokens (title 400 + "\n" + preview 400 = 801
// chars / 4). Budget is 250. Top-ranked hit is always included
// (B-MCP-8 guarantee), and the next would put cumulative over 250
// (200 + 200 = 400 > 250) so the rest are skipped via `continue`.
expect(result.getEntries().length).toBe(1);
expect(result.getEntries()[0]?.id).toBe(
"01952f3b-7d8c-7000-8000-000000000001",
);
});

it("always includes the top-ranked hit even when it solo exceeds maxTokens (B-MCP-8)", async () => {
  // Mirrors the dogfood failure observed on `@netzi/recall@0.1.2-beta.4`:
  // a query found total_candidates > 0 yet returned zero hits because
  // the #1-ranked candidate's preview alone outweighed the default
  // token budget. The stub counter here prices len/4, so rendering
  // the 4000-char strings below as title + "\n" + preview costs about
  // 2000 tokens — double the 1000-token budget used in this test.
  const oversized = "Z".repeat(4000);
  const small = "ok";
  projections.projections = [
    projection({
      kind: "decision",
      id: ID_A,
      title: oversized,
      preview: oversized,
    }),
    projection({ kind: "decision", id: ID_B, title: small, preview: small }),
  ];
  lexical.hits = [
    { kind: "decision", id: ID_A, score: BM25Score.of(2.0) },
    { kind: "decision", id: ID_B, score: BM25Score.of(0.5) },
  ];

  const result = await useCase.recall({
    workspaceId: makeWorkspaceId(),
    query: makeQuery(),
    filters: makeFilters(5),
    maxTokens: TokenBudget.withMax(1000),
    weights: RelevanceWeights.defaults(),
  });

  expect(result.totalCandidates).toBe(2);
  const surfaced = result.getEntries();
  // The top-ranked hit must surface even though it alone busts the
  // budget — zero hits with candidates present was precisely the
  // B-MCP-8 regression. No further hit is expected here: the
  // oversized top hit already drove runningTokens past the budget.
  // The continue-vs-break case (top hit fits, a mid-ranked hit is the
  // one that overflows) is covered by the "skips a mid-ranking
  // oversized hit ..." test.
  expect(surfaced.length).toBe(1);
  expect(surfaced[0]?.id).toBe(ID_A);
});

it("skips a mid-ranking oversized hit and still includes smaller ones behind it (B-MCP-8)", async () => {
  // Three candidates ranked by BM25:
  //   1. ID_A — small (fits)
  //   2. ID_B — huge (does NOT fit once ID_A is admitted)
  //   3. ID_C — small (would still fit if we don't `break` on ID_B)
  // The pre-fix `break` semantics returned only [ID_A]; the fix uses
  // `continue` so ID_C surfaces too.
  const ID_C = "01952f3b-7d8c-7000-8000-aaaaaaaaaa03";
  // Rendered as title + "\n" + preview this entry is 16001 chars,
  // which the len/4 stub counter prices at ~4000 tokens — far over
  // the 500-token budget below. (NOTE: an earlier comment claimed
  // "8001 chars → ~2001 tokens", which only counted one of the two
  // fields; either way the entry overflows the budget on its own.)
  const huge = "Z".repeat(8000);
  const tiny = "ok";
  projections.projections = [
    projection({ kind: "decision", id: ID_A, title: tiny, preview: tiny }),
    projection({ kind: "decision", id: ID_B, title: huge, preview: huge }),
    projection({ kind: "decision", id: ID_C, title: tiny, preview: tiny }),
  ];
  lexical.hits = [
    { kind: "decision", id: ID_A, score: BM25Score.of(3.0) },
    { kind: "decision", id: ID_B, score: BM25Score.of(2.0) },
    { kind: "decision", id: ID_C, score: BM25Score.of(1.0) },
  ];

  const result = await useCase.recall({
    workspaceId: makeWorkspaceId(),
    query: makeQuery(),
    filters: makeFilters(5),
    maxTokens: TokenBudget.withMax(500),
    weights: RelevanceWeights.defaults(),
  });

  const entries = result.getEntries();
  const ids = entries.map((e) => e.id);
  expect(ids).toContain(ID_A);
  expect(ids).toContain(ID_C);
  expect(ids).not.toContain(ID_B);
  expect(entries.length).toBe(2);
});

it("calls bumpUsage after a successful recall", async () => {
Expand Down
Loading