From c8a16f421a558ad6dbfb14de36abf00f8455b53b Mon Sep 17 00:00:00 2001 From: enitrat Date: Sun, 5 Oct 2025 10:49:49 +0200 Subject: [PATCH 1/2] =?UTF-8?q?-=20RecursiveMarkdownSplitter.ts=20=20=20-?= =?UTF-8?q?=20Added=20parsing=20of=20special=20Sources=20blocks=20delimite?= =?UTF-8?q?d=20by=20`---`=20=E2=80=A6=20`Sources:`=20=E2=80=A6=20`---`.=20?= =?UTF-8?q?=20=20-=20Computes=20active=20source=20ranges;=20assigns=20`chu?= =?UTF-8?q?nk.meta.sourceLink`=20to=20the=20first=20URL=20listed=20in=20th?= =?UTF-8?q?e=20most=20recent=20Sources=20block=20for=20all=20subsequent=20?= =?UTF-8?q?chunks=20until=20the=20next=20block.=20=20=20-=20Extended=20`Ch?= =?UTF-8?q?unkMeta`=20with=20optional=20`sourceLink`=20and=20tokenizer=20`?= =?UTF-8?q?Tokens`=20with=20`sourceRanges`.=20=20=20-=20Wired=20sourceRang?= =?UTF-8?q?es=20into=20metadata=20attachment=20without=20altering=20chunki?= =?UTF-8?q?ng=20behavior.=20-=20Tests=20=20=20-=20New=20test:=20`ingesters?= =?UTF-8?q?/src/utils/=5F=5Ftests=5F=5F/RecursiveMarkdownSplitter.sources.?= =?UTF-8?q?test.ts`=20validates=20activeSource=20mapping=20across=20multip?= =?UTF-8?q?le=20Sources=20blocks.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Robust fenced code detection - Accept up to 3 leading spaces before fences. - Track fence char and exact length; closing fence must match char and have length ≥ open, fence-only (no info string). - Fallback-close a malformed open block when a new opening fence appears while still in a block (marks previous as breakable). - Mark unclosed-at-EOF blocks as `breakable: true`. - Breakable blocks and size threshold - New `SplitOptions`: - `codeBlockMaxChars` (default: 2× `maxChars`) — closed blocks larger than this are treated as breakable (splittable). - `fallbackCloseOnNestedOpen` (default: true). - Splitting rules updated - Paragraph/line splitting skips split points only inside non-breakable blocks; allows splits inside breakable blocks. - Final boundary pass adjusts segment starts/ends only for non-breakable blocks; ensures monotonic, non-overlapping segments. - Overlap start is pushed past block end only if the enclosing block is non-breakable. - Tests - New: `ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts` enforces that no chunk starts inside a non-breakable code block; allows starts inside breakable (malformed/oversized) blocks. --- .../src/utils/RecursiveMarkdownSplitter.ts | 360 +++++++++++++----- ...ownSplitter.noStartInsideCodeBlock.test.ts | 60 +++ ...siveMarkdownSplitter.sourceOverlap.test.ts | 33 ++ .../RecursiveMarkdownSplitter.sources.test.ts | 58 +++ .../RecursiveMarkdownSplitter.test.ts | 20 + 5 files changed, 437 insertions(+), 94 deletions(-) create mode 100644 ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts create mode 100644 ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts create mode 100644 ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts diff --git a/ingesters/src/utils/RecursiveMarkdownSplitter.ts b/ingesters/src/utils/RecursiveMarkdownSplitter.ts index 20b3da3..0541cb3 100644 --- a/ingesters/src/utils/RecursiveMarkdownSplitter.ts +++ b/ingesters/src/utils/RecursiveMarkdownSplitter.ts @@ -9,9 +9,13 @@ export interface SplitOptions { /** Characters of backward overlap between consecutive chunks. Default: 256 */ overlap?: number; /** Which header levels are allowed as primary split points. Default: [1, 2] */ - headerLevels?: (1 | 2)[]; + headerLevels?: (1 | 2 | 3)[]; /** If true, do not split inside fenced code blocks. Default: true */ preserveCodeBlocks?: boolean; + /** If a fenced code block exceeds this size, allow splitting inside it (fallback). Default: 2x maxChars */ + codeBlockMaxChars?: number; + /** If a new opening fence appears while one is open, fallback-close the previous block. Default: true */ + fallbackCloseOnNestedOpen?: boolean; /** Optional prefix for generated unique IDs */ idPrefix?: string; /** Whether to trim whitespace around chunks. Default: true */ @@ -30,6 +34,8 @@ export interface ChunkMeta { endChar: number; /** Full header path stack (e.g., ["Intro", "Goals"]) */ headerPath: string[]; + /** Optional source URL inferred from special "Sources" blocks */ + sourceLink?: string; } export interface Chunk { @@ -48,7 +54,10 @@ interface HeaderToken { interface CodeBlockToken { start: number; end: number; - fence: '```' | '~~~'; + fenceChar: '`' | '~'; + fenceLen: number; + closed: boolean; + breakable: boolean; infoString?: string; // e.g. "ts", "python" } @@ -60,6 +69,7 @@ interface Segment { interface Tokens { headers: HeaderToken[]; codeBlocks: CodeBlockToken[]; + sourceRanges: Array<{ start: number; end: number; url: string }>; } export class RecursiveMarkdownSplitter { @@ -72,6 +82,9 @@ export class RecursiveMarkdownSplitter { overlap: options.overlap ?? 256, headerLevels: options.headerLevels ?? [1, 2], preserveCodeBlocks: options.preserveCodeBlocks ?? true, + codeBlockMaxChars: + options.codeBlockMaxChars ?? (options.maxChars ?? 2048) * 2, + fallbackCloseOnNestedOpen: options.fallbackCloseOnNestedOpen ?? true, idPrefix: options.idPrefix ?? '', trim: options.trim ?? true, }; @@ -157,6 +170,7 @@ export class RecursiveMarkdownSplitter { nonEmptyChunks, normalizedMarkdown, tokens.headers, + tokens.sourceRanges, ); } @@ -166,6 +180,7 @@ export class RecursiveMarkdownSplitter { private tokenize(markdown: string): Tokens { const headers: HeaderToken[] = []; const codeBlocks: CodeBlockToken[] = []; + const sourceRanges = this.parseSourceRanges(markdown); // Find all headers const headerRegex = /^(#{1,6})\s+(.+?)(?:\s*#*)?$/gm; @@ -190,7 +205,90 @@ export class RecursiveMarkdownSplitter { ); }); - return { headers: filteredHeaders, codeBlocks }; + return { headers: filteredHeaders, codeBlocks, sourceRanges }; + } + + /** + * Parse special formatted Sources blocks and compute active source ranges + * A block looks like: + * ---\n + * Sources:\n + * - https://example.com/a\n + * - https://example.com/b\n + * --- + * Active source becomes the first URL and applies from the end of the block + * until the start of the next Sources block (or end of document). + */ + private parseSourceRanges(markdown: string): Array<{ start: number; end: number; url: string }> { + const lines = markdown.split('\n'); + const ranges: Array<{ start: number; end: number; url: string }> = []; + + // Build cumulative char index per line start + const lineStartIdx: number[] = new Array(lines.length); + let acc = 0; + for (let i = 0; i < lines.length; i++) { + lineStartIdx[i] = acc; + acc += lines[i]!.length + 1; // +1 for \n + } + + const isDashLine = (s: string) => /^\s*---\s*$/.test(s); + const isSourcesHeader = (s: string) => /^\s*Sources:\s*$/i.test(s); + const firstUrlInList = (startLine: number): string | undefined => { + for (let j = startLine; j < lines.length; j++) { + const l = lines[j]!; + if (isDashLine(l)) break; // stop at closing --- + const m = l.match(/^\s*[-*]\s+(\S+)/); + if (m) { + const url = m[1]!; + if (/^https?:\/\//i.test(url)) return url; + } + } + return undefined; + }; + + // Locate all source blocks (start/end + first URL) + const blocks: Array<{ blockStartLine: number; blockEndLine: number; firstUrl?: string }> = []; + for (let i = 0; i < lines.length; i++) { + if (!isDashLine(lines[i]!)) continue; + // Scan ahead for Sources: header within the dashed block + let j = i + 1; + // Skip blank lines + while (j < lines.length && /^\s*$/.test(lines[j]!)) j++; + if (j < lines.length && isSourcesHeader(lines[j]!)) { + // Find closing --- + let k = j + 1; + while (k < lines.length && !isDashLine(lines[k]!)) k++; + if (k < lines.length && isDashLine(lines[k]!)) { + const firstUrl = firstUrlInList(j + 1); + blocks.push({ blockStartLine: i, blockEndLine: k, firstUrl }); + i = k; // advance to end of block + } + } + } + + // Build ranges from blocks + if (blocks.length === 0) return ranges; + const docLen = markdown.length; + for (let b = 0; b < blocks.length; b++) { + const block = blocks[b]!; + const nextBlock = blocks[b + 1]; + + const blockEndLineIdx = block.blockEndLine; + const blockEndAbs = lineStartIdx[blockEndLineIdx] ?? 0; + const blockEndLineStr = lines[blockEndLineIdx] ?? ''; + const start = blockEndAbs + blockEndLineStr.length + 1; // after closing --- newline + + const nextStart = nextBlock + ? (lineStartIdx[nextBlock.blockStartLine] ?? docLen) + : docLen; + + const url = block.firstUrl || ''; + if (url && start < nextStart) { + ranges.push({ start, end: nextStart, url }); + } + } + + return ranges; } /** @@ -199,41 +297,104 @@ export class RecursiveMarkdownSplitter { private findCodeBlocks(markdown: string, codeBlocks: CodeBlockToken[]): void { const lines = markdown.split('\n'); let inCodeBlock = false; - let currentBlock: Partial | null = null; + let currentBlock: + | (Partial & { fenceChar: '`' | '~'; fenceLen: number }) + | null = null; let charIndex = 0; - for (let i = 0; i < lines.length; i++) { - const line = lines[i]; - const fenceMatch = line!.match(/^(```+|~~~+)(.*)$/); - - if (fenceMatch) { - const fence = fenceMatch[1]!.substring(0, 3) as '```' | '~~~'; + const openRe = /^\s{0,3}([`~]{3,})(.*)$/; // allow up to 3 leading spaces + const makeCloseRe = (ch: '`' | '~', n: number) => + new RegExp(`^\\s{0,3}(${ch === '`' ? '\\`' : '~'}{${n},})\\s*$`); - if (!inCodeBlock) { - // Starting a code block + for (let i = 0; i < lines.length; i++) { + const line = lines[i] as string; + const open = line.match(openRe); + + if (!inCodeBlock) { + if (open) { + const fenceStr = open[1] as string; + const ch = (fenceStr[0] === '`' ? '`' : '~') as '`' | '~'; + const len = fenceStr.length; + const info = (open[2] || '').trim() || undefined; inCodeBlock = true; currentBlock = { start: charIndex, - fence, - infoString: fenceMatch[2]!.trim() || undefined, + fenceChar: ch, + fenceLen: len, + infoString: info, + }; + } + } else { + const ch = currentBlock!.fenceChar; + const n = currentBlock!.fenceLen; + const closeRe = makeCloseRe(ch, n); + if (closeRe.test(line)) { + const end = charIndex + line.length; + const token: CodeBlockToken = { + start: currentBlock!.start!, + end, + fenceChar: ch, + fenceLen: n, + closed: true, + breakable: false, + infoString: currentBlock!.infoString, }; - } else if (currentBlock && line!.startsWith(currentBlock.fence!)) { - // Ending a code block - currentBlock.end = charIndex + line!.length; - codeBlocks.push(currentBlock as CodeBlockToken); + codeBlocks.push(token); inCodeBlock = false; currentBlock = null; + } else if (this.options.fallbackCloseOnNestedOpen && open) { + // Nested opening while open: fallback-close previous as malformed + const end = Math.max(0, charIndex - 1); + const malformed: CodeBlockToken = { + start: currentBlock!.start!, + end, + fenceChar: currentBlock!.fenceChar, + fenceLen: currentBlock!.fenceLen, + closed: false, + breakable: true, + infoString: currentBlock!.infoString, + }; + codeBlocks.push(malformed); + + // Start new block at current line + const fenceStr = open[1] as string; + const ch2 = (fenceStr[0] === '`' ? '`' : '~') as '`' | '~'; + const len2 = fenceStr.length; + const info2 = (open[2] || '').trim() || undefined; + currentBlock = { + start: charIndex, + fenceChar: ch2, + fenceLen: len2, + infoString: info2, + }; + inCodeBlock = true; } } - charIndex += line!.length + 1; // +1 for newline + charIndex += line.length + 1; } - // Handle unclosed code block if (currentBlock && inCodeBlock) { - logger.warn( - 'Unclosed code block detected, treating remaining content as plain text', - ); + logger.warn('Unclosed code block detected (EOF). Marking as breakable'); + const token: CodeBlockToken = { + start: currentBlock.start!, + end: markdown.length, + fenceChar: currentBlock.fenceChar, + fenceLen: currentBlock.fenceLen, + closed: false, + breakable: true, + infoString: currentBlock.infoString, + }; + codeBlocks.push(token); + } + + // Set breakable on large closed blocks + const maxSize = this.options.codeBlockMaxChars ?? this.options.maxChars * 2; + for (const b of codeBlocks) { + if (b.closed) { + const size = b.end - b.start; + if (size > maxSize) b.breakable = true; + } } } @@ -313,7 +474,7 @@ export class RecursiveMarkdownSplitter { (h) => h.start >= segment.start && h.end <= segment.end && - this.options.headerLevels.includes(h.level as 1 | 2), + this.options.headerLevels.includes(h.level as 1 | 2 | 3), ); if (segmentHeaders.length === 0) { @@ -391,9 +552,9 @@ export class RecursiveMarkdownSplitter { // Collect all valid split points while ((match = paragraphRegex.exec(segmentText)) !== null) { - const splitPoint = segment.start + match.index + match[0].length; - // Check if split point is inside a code block - if (!this.isInsideCodeBlock(splitPoint, codeBlocks)) { + const splitPointAbs = segment.start + match.index + match[0].length; + const enclosing = this.getEnclosingCodeBlock(splitPointAbs, codeBlocks); + if (!enclosing || enclosing.breakable) { splitPoints.push(match.index + match[0].length); } } @@ -442,7 +603,8 @@ export class RecursiveMarkdownSplitter { currentLength > 0 ) { // Check if we can split here - if (!this.isInsideCodeBlock(lineStart, codeBlocks)) { + const enclosing = this.getEnclosingCodeBlock(lineStart, codeBlocks); + if (!enclosing || enclosing.breakable) { segments.push({ start: currentStart, end: lineStart, @@ -477,9 +639,17 @@ export class RecursiveMarkdownSplitter { position: number, codeBlocks: CodeBlockToken[], ): boolean { - return codeBlocks.some( - (block) => position >= block.start && position < block.end, - ); + return this.getEnclosingCodeBlock(position, codeBlocks) !== null; + } + + private getEnclosingCodeBlock( + position: number, + codeBlocks: CodeBlockToken[], + ): CodeBlockToken | null { + for (const block of codeBlocks) { + if (position > block.start && position < block.end) return block; + } + return null; } /** @@ -550,24 +720,40 @@ export class RecursiveMarkdownSplitter { } } - // Final pass: ensure no segment ends in the middle of a code block + // Final pass: adjust boundaries so that no segment starts or ends inside a non-breakable code block, + // and ensure segments are non-overlapping and ordered. const finalSegments: Segment[] = []; + let prevEnd: number | null = null; + for (const segment of mergedSegments) { - let adjustedEnd = segment.end; + let start = segment.start; + let end = segment.end; - // Check if segment end is inside a code block + // If end falls inside a non-breakable code block, advance it to the block end for (const block of codeBlocks) { - if (segment.end > block.start && segment.end < block.end) { - // Extend to include the entire code block - adjustedEnd = block.end; + if (end > block.start && end < block.end && !block.breakable) { + end = block.end; break; } } - finalSegments.push({ - start: segment.start, - end: adjustedEnd, - }); + // If start falls inside a non-breakable code block, move start to the block end + for (const block of codeBlocks) { + if (start > block.start && start < block.end && !block.breakable) { + start = block.end; + break; + } + } + + // Ensure monotonic, non-overlapping segments + if (prevEnd !== null && start < prevEnd) { + start = prevEnd; + } + + if (start < end) { + finalSegments.push({ start, end }); + prevEnd = end; + } } return finalSegments; @@ -597,56 +783,31 @@ export class RecursiveMarkdownSplitter { for (let i = 0; i < segments.length; i++) { const segment = segments[i]!; - let content = markdown.slice(segment.start, segment.end); - let chunkStart = segment.start; - // For chunks after the first, prepend overlap from previous segment + // Compute absolute start with overlap, but never start inside a non-breakable code block + let chunkStartAbs = segment.start; if (i > 0 && this.options.overlap > 0) { const prevSegment = segments[i - 1]!; - const prevContent = markdown.slice(prevSegment.start, prevSegment.end); - - // Calculate how much overlap to take from the previous segment - const overlapLength = Math.min( - this.options.overlap, - prevContent.length, + const desired = Math.max( + prevSegment.end - Math.min(this.options.overlap, prevSegment.end - prevSegment.start), + prevSegment.start, ); - let overlapStart = prevContent.length - overlapLength; - - // Check if the overlap would start in the middle of a code block - const overlapAbsoluteStart = prevSegment.start + overlapStart; - for (const block of codeBlocks) { - if ( - overlapAbsoluteStart > block.start && - overlapAbsoluteStart < block.end - ) { - // Overlap would start inside a code block - if (block.end <= prevSegment.end) { - // The code block ends within the previous segment - // Start overlap after the code block to avoid duplication - const blockEndInSegment = block.end - prevSegment.start; - if (blockEndInSegment < prevContent.length) { - overlapStart = blockEndInSegment; - } - } - break; - } + chunkStartAbs = desired; + const enclosing = this.getEnclosingCodeBlock(chunkStartAbs, codeBlocks); + if (enclosing && !enclosing.breakable) { + // Move start to end of the enclosing non-breakable block + chunkStartAbs = enclosing.end; } - - // Extract overlap text from the adjusted position - const overlapText = prevContent.slice(overlapStart); - - // Prepend overlap to current content - content = overlapText + content; - - // Track where the actual content starts (including overlap) - chunkStart = prevSegment.start + overlapStart; } + // Extract content from chunkStartAbs to segment.end + let content = markdown.slice(chunkStartAbs, segment.end); + chunks.push({ content: this.options.trim ? content.trim() : content, - start: chunkStart, // Now reflects the actual start including overlap + start: chunkStartAbs, end: segment.end, - overlapStart: i > 0 ? segment.start : undefined, // Original segment start for reference + overlapStart: i > 0 ? segment.start : undefined, }); } @@ -660,6 +821,7 @@ export class RecursiveMarkdownSplitter { rawChunks: Array<{ content: string; start: number; end: number }>, markdown: string, headers: HeaderToken[], + sourceRanges?: Array<{ start: number; end: number; url: string }>, ): Chunk[] { const chunks: Chunk[] = []; const titleCounts = new Map(); @@ -686,22 +848,16 @@ export class RecursiveMarkdownSplitter { headerPath = headerStack.map((h) => h.text); - // Find title from configured levels - check headers within the chunk first - const headersInChunk = headers.filter( - (h) => - h.start >= rawChunk.start && - h.start < rawChunk.end && - this.options.headerLevels.includes(h.level as 1 | 2), - ); - - if (headersInChunk.length > 0) { - // Use the first configured header within the chunk - title = headersInChunk[0]!.text; + // Prefer the deepest header in the path (e.g., H3) for specificity + if (headerPath.length > 0) { + title = headerPath[headerPath.length - 1]!; } else { - // Otherwise, use the last configured header before the chunk + // Fallback: use last configured header before the chunk if any for (let i = headerStack.length - 1; i >= 0; i--) { if ( - this.options.headerLevels.includes(headerStack[i]!.level as 1 | 2) + this.options.headerLevels.includes( + headerStack[i]!.level as 1 | 2 | 3, + ) ) { title = headerStack[i]!.text; break; @@ -719,6 +875,21 @@ export class RecursiveMarkdownSplitter { ? `${this.options.idPrefix}-${slug}-${count}` : `${slug}-${count}`; + // Determine sourceLink based on active source ranges: prefer segment start (no overlap) + let sourceLink: string | undefined = undefined; + const anchorPos = (rawChunk as any).overlapStart ?? rawChunk.start; + if (sourceRanges && sourceRanges.length > 0) { + const s = anchorPos as number; + for (const r of sourceRanges) { + if (s >= r.start && s < r.end) { + sourceLink = r.url; + break; + } + } + } + + console.log(`Chunk Title: ${title}, Source link: ${sourceLink}`); + chunks.push({ content: rawChunk.content, meta: { @@ -728,6 +899,7 @@ export class RecursiveMarkdownSplitter { startChar: rawChunk.start, endChar: rawChunk.end, headerPath, + sourceLink, }, }); } diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts new file mode 100644 index 0000000..d3e815b --- /dev/null +++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts @@ -0,0 +1,60 @@ +import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter'; + +function getCodeBlockRanges(text: string): Array<{ start: number; end: number }> { + const ranges: Array<{ start: number; end: number }> = []; + const re = /```[\s\S]*?```/g; + let m: RegExpExecArray | null; + while ((m = re.exec(text)) !== null) { + ranges.push({ start: m.index, end: m.index + m[0]!.length }); + } + return ranges; +} + +function isInside(pos: number, ranges: Array<{ start: number; end: number }>): boolean { + return ranges.some((r) => pos > r.start && pos < r.end); +} + +describe('RecursiveMarkdownSplitter - No chunk starts inside code block', () => { + it('ensures chunk starts are never within fenced code blocks even with overlap', () => { + const longCode = Array.from({ length: 60 }, (_, i) => `line ${i} of code`).join('\n'); + const md = `# Section One\n\nIntro paragraph text that will be part of the first section.\n\n\n## Subsection\n\nSome text before a large code block.\n\n +\`\`\`cairo +fn initializer(ref self: ContractState, owner: ContractAddress) { + // example + let x = 0;\n${longCode} +} +\`\`\` + +After the code block there is trailing text to encourage multiple segments and overlap across chunk boundaries. This text continues for a while to ensure we have a next chunk that might try to overlap into the previous code block.`; + + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 200, + minChars: 0, + overlap: 50, + headerLevels: [1, 2], + preserveCodeBlocks: true, + trim: false, + }); + + const chunks = splitter.splitMarkdownToChunks(md); + const ranges = getCodeBlockRanges(md); + + // Assert: No chunk start lies strictly inside any NON-breakable fenced code block + // Breakable threshold mirrors splitter default: 2x maxChars = 400 + const codeBlockMaxChars = 400; + for (const c of chunks) { + const pos = c.meta.startChar; + const insideRanges = ranges.filter((r) => pos > r.start && pos < r.end); + if (insideRanges.length === 0) continue; + // If inside a code block, only allow if that block is oversized (breakable) + const smallest = insideRanges.reduce((acc, r) => { + if (!acc) return r; + const accLen = acc.end - acc.start; + const rLen = r.end - r.start; + return rLen < accLen ? r : acc; + }, insideRanges[0]!); + const len = (smallest?.end ?? 0) - (smallest?.start ?? 0); + expect(len).toBeGreaterThan(codeBlockMaxChars); + } + }); +}); diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts new file mode 100644 index 0000000..5de377b --- /dev/null +++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts @@ -0,0 +1,33 @@ +import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter - sourceLink mapping with overlap', () => { + it('should use segment start (no-overlap) to resolve sourceLink so it is never undefined', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 20, + headerLevels: [1, 2, 3], + }); + + const md = `--- + +Sources: + +- https://example.com/a + +--- + +# Title + +Paragraph one is long enough to cause splitting when combined with overlap. This ensures chunk starts may fall before the source range while the segment starts after it.`; + + const chunks = splitter.splitMarkdownToChunks(md); + expect(chunks.length).toBeGreaterThan(1); + // All non-ROOT chunks (after first header) should have a sourceLink + for (const c of chunks) { + if (c.meta.title !== 'ROOT') { + expect(c.meta.sourceLink).toBe('https://example.com/a'); + } + } + }); +}); diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts new file mode 100644 index 0000000..01f1b4b --- /dev/null +++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts @@ -0,0 +1,58 @@ +import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter - Sources block activeSource mapping', () => { + it('assigns sourceLink from first URL in Sources block to subsequent chunks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 120, + minChars: 0, + overlap: 0, + headerLevels: [1, 2], + }); + + const text = ` + --- + +Sources: + +- https://www.starknet.io/cairo-book/ch00-00-introduction.html +- https://www.starknet.io/cairo-book/ch00-01-foreword.html + +--- + +# The Cairo Book: Introduction and Learning Resources + +Some introduction text. + +--- + +Sources: + +- https://www.starknet.io/cairo-book/ + +--- + +## About The Cairo Book + +More details here.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Find chunk under the first H1 + const introChunk = chunks.find((c) => + c.content.includes('# The Cairo Book: Introduction and Learning Resources'), + ); + expect(introChunk).toBeDefined(); + expect(introChunk!.meta.sourceLink).toBe( + 'https://www.starknet.io/cairo-book/ch00-00-introduction.html', + ); + + // Find chunk under the second header (H2), after second Sources block + const aboutChunk = chunks.find((c) => + c.content.includes('## About The Cairo Book'), + ); + expect(aboutChunk).toBeDefined(); + expect(aboutChunk!.meta.sourceLink).toBe( + 'https://www.starknet.io/cairo-book/', + ); + }); +}); diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts index a7ac549..ca8ba7f 100644 --- a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts +++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts @@ -123,6 +123,26 @@ More content.`; expect(chunks[0]!.meta.title).toBe('Header with trailing hashes'); }); + + it('should prefer deepest header (e.g., H3) for title', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 80, + minChars: 0, + overlap: 0, + headerLevels: [1, 2], // split only on H1/H2, but titles should use deepest header in path + }); + + const text = `# Chapter +Intro + +### Specific Topic +Detailed text that should belong to the H3.`; + + const chunks = splitter.splitMarkdownToChunks(text); + expect(chunks.length).toBeGreaterThan(0); + // Title should be the deepest header in headerPath -> H3 + expect(chunks[0]!.meta.title).toBe('Specific Topic'); + }); }); describe('Code block handling', () => { From 224d45029d4dfb2910cef74b71776ea718ecc5aa Mon Sep 17 00:00:00 2001 From: enitrat Date: Sun, 5 Oct 2025 10:55:20 +0200 Subject: [PATCH 2/2] fmt --- ingesters/src/utils/RecursiveMarkdownSplitter.ts | 13 ++++++++++--- ...MarkdownSplitter.noStartInsideCodeBlock.test.ts | 14 +++++++++++--- .../RecursiveMarkdownSplitter.sources.test.ts | 4 +++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/ingesters/src/utils/RecursiveMarkdownSplitter.ts b/ingesters/src/utils/RecursiveMarkdownSplitter.ts index 0541cb3..fcbc8a0 100644 --- a/ingesters/src/utils/RecursiveMarkdownSplitter.ts +++ b/ingesters/src/utils/RecursiveMarkdownSplitter.ts @@ -219,7 +219,9 @@ export class RecursiveMarkdownSplitter { * Active source becomes the first URL and applies from the end of the block * until the start of the next Sources block (or end of document). */ - private parseSourceRanges(markdown: string): Array<{ start: number; end: number; url: string }> { + private parseSourceRanges( + markdown: string, + ): Array<{ start: number; end: number; url: string }> { const lines = markdown.split('\n'); const ranges: Array<{ start: number; end: number; url: string }> = []; @@ -247,7 +249,11 @@ export class RecursiveMarkdownSplitter { }; // Locate all source blocks (start/end + first URL) - const blocks: Array<{ blockStartLine: number; blockEndLine: number; firstUrl?: string }> = []; + const blocks: Array<{ + blockStartLine: number; + blockEndLine: number; + firstUrl?: string; + }> = []; for (let i = 0; i < lines.length; i++) { if (!isDashLine(lines[i]!)) continue; // Scan ahead for Sources: header within the dashed block @@ -789,7 +795,8 @@ export class RecursiveMarkdownSplitter { if (i > 0 && this.options.overlap > 0) { const prevSegment = segments[i - 1]!; const desired = Math.max( - prevSegment.end - Math.min(this.options.overlap, prevSegment.end - prevSegment.start), + prevSegment.end - + Math.min(this.options.overlap, prevSegment.end - prevSegment.start), prevSegment.start, ); chunkStartAbs = desired; diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts index d3e815b..08eeef2 100644 --- a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts +++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts @@ -1,6 +1,8 @@ import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter'; -function getCodeBlockRanges(text: string): Array<{ start: number; end: number }> { +function getCodeBlockRanges( + text: string, +): Array<{ start: number; end: number }> { const ranges: Array<{ start: number; end: number }> = []; const re = /```[\s\S]*?```/g; let m: RegExpExecArray | null; @@ -10,13 +12,19 @@ function getCodeBlockRanges(text: string): Array<{ start: number; end: number }> return ranges; } -function isInside(pos: number, ranges: Array<{ start: number; end: number }>): boolean { +function isInside( + pos: number, + ranges: Array<{ start: number; end: number }>, +): boolean { return ranges.some((r) => pos > r.start && pos < r.end); } describe('RecursiveMarkdownSplitter - No chunk starts inside code block', () => { it('ensures chunk starts are never within fenced code blocks even with overlap', () => { - const longCode = Array.from({ length: 60 }, (_, i) => `line ${i} of code`).join('\n'); + const longCode = Array.from( + { length: 60 }, + (_, i) => `line ${i} of code`, + ).join('\n'); const md = `# Section One\n\nIntro paragraph text that will be part of the first section.\n\n\n## Subsection\n\nSome text before a large code block.\n\n \`\`\`cairo fn initializer(ref self: ContractState, owner: ContractAddress) { diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts index 01f1b4b..fd67e6c 100644 --- a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts +++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts @@ -39,7 +39,9 @@ More details here.`; // Find chunk under the first H1 const introChunk = chunks.find((c) => - c.content.includes('# The Cairo Book: Introduction and Learning Resources'), + c.content.includes( + '# The Cairo Book: Introduction and Learning Resources', + ), ); expect(introChunk).toBeDefined(); expect(introChunk!.meta.sourceLink).toBe(