From c8a16f421a558ad6dbfb14de36abf00f8455b53b Mon Sep 17 00:00:00 2001
From: enitrat <msaug@protonmail.com>
Date: Sun, 5 Oct 2025 10:49:49 +0200
Subject: [PATCH 1/2] =?UTF-8?q?-=20RecursiveMarkdownSplitter.ts=20=20=20-?=
 =?UTF-8?q?=20Added=20parsing=20of=20special=20Sources=20blocks=20delimite?=
 =?UTF-8?q?d=20by=20`---`=20=E2=80=A6=20`Sources:`=20=E2=80=A6=20`---`.=20?=
 =?UTF-8?q?=20=20-=20Computes=20active=20source=20ranges;=20assigns=20`chu?=
 =?UTF-8?q?nk.meta.sourceLink`=20to=20the=20first=20URL=20listed=20in=20th?=
 =?UTF-8?q?e=20most=20recent=20Sources=20block=20for=20all=20subsequent=20?=
 =?UTF-8?q?chunks=20until=20the=20next=20block.=20=20=20-=20Extended=20`Ch?=
 =?UTF-8?q?unkMeta`=20with=20optional=20`sourceLink`=20and=20tokenizer=20`?=
 =?UTF-8?q?Tokens`=20with=20`sourceRanges`.=20=20=20-=20Wired=20sourceRang?=
 =?UTF-8?q?es=20into=20metadata=20attachment=20without=20altering=20chunki?=
 =?UTF-8?q?ng=20behavior.=20-=20Tests=20=20=20-=20New=20test:=20`ingesters?=
 =?UTF-8?q?/src/utils/=5F=5Ftests=5F=5F/RecursiveMarkdownSplitter.sources.?=
 =?UTF-8?q?test.ts`=20validates=20activeSource=20mapping=20across=20multip?=
 =?UTF-8?q?le=20Sources=20blocks.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Robust fenced code detection
  - Accept up to 3 leading spaces before fences.
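  - Track the opening fence's character and length; a closing fence must use the same character, be at least as long as the opening fence, and contain nothing else on the line apart from trailing whitespace (no info string).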
  - Fallback-close a malformed open block when a new opening fence appears while still in a block (marks previous as breakable).
  - Mark unclosed-at-EOF blocks as `breakable: true`.
- Breakable blocks and size threshold
  - New `SplitOptions`:
    - `codeBlockMaxChars` (default: 2× `maxChars`) — closed blocks larger than this are treated as breakable (splittable).
    - `fallbackCloseOnNestedOpen` (default: true).
- Splitting rules updated
  - Paragraph/line splitting skips split points only inside non-breakable blocks; allows splits inside breakable blocks.
  - Final boundary pass adjusts segment starts/ends only for non-breakable blocks; ensures monotonic, non-overlapping segments.
  - Overlap start is pushed past block end only if the enclosing block is non-breakable.
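- Header levels and titles
  - `headerLevels` now also accepts level 3.
  - Chunk titles now use the deepest header in the chunk's header path (e.g., an H3) rather than the first configured header found inside the chunk.
- Tests
  - New: `ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts` enforces that no chunk starts inside a non-breakable code block; allows starts inside breakable (malformed/oversized) blocks.
  - New: `ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts` checks that `sourceLink` is resolved from the segment start (ignoring overlap), so chunks whose overlap reaches back before a source range still get a link.
  - Added a case to `ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts` asserting that the chunk title is the deepest header in the path.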
---
 .../src/utils/RecursiveMarkdownSplitter.ts    | 360 +++++++++++++-----
 ...ownSplitter.noStartInsideCodeBlock.test.ts |  60 +++
 ...siveMarkdownSplitter.sourceOverlap.test.ts |  33 ++
 .../RecursiveMarkdownSplitter.sources.test.ts |  58 +++
 .../RecursiveMarkdownSplitter.test.ts         |  20 +
 5 files changed, 437 insertions(+), 94 deletions(-)
 create mode 100644 ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts
 create mode 100644 ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts
 create mode 100644 ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts

diff --git a/ingesters/src/utils/RecursiveMarkdownSplitter.ts b/ingesters/src/utils/RecursiveMarkdownSplitter.ts
index 20b3da3..0541cb3 100644
--- a/ingesters/src/utils/RecursiveMarkdownSplitter.ts
+++ b/ingesters/src/utils/RecursiveMarkdownSplitter.ts
@@ -9,9 +9,13 @@ export interface SplitOptions {
   /** Characters of backward overlap between consecutive chunks. Default: 256 */
   overlap?: number;
   /** Which header levels are allowed as primary split points. Default: [1, 2] */
-  headerLevels?: (1 | 2)[];
+  headerLevels?: (1 | 2 | 3)[];
   /** If true, do not split inside fenced code blocks. Default: true */
   preserveCodeBlocks?: boolean;
+  /** If a fenced code block exceeds this size, allow splitting inside it (fallback). Default: 2x maxChars */
+  codeBlockMaxChars?: number;
+  /** If a new opening fence appears while one is open, fallback-close the previous block. Default: true */
+  fallbackCloseOnNestedOpen?: boolean;
   /** Optional prefix for generated unique IDs */
   idPrefix?: string;
   /** Whether to trim whitespace around chunks. Default: true */
@@ -30,6 +34,8 @@ export interface ChunkMeta {
   endChar: number;
   /** Full header path stack (e.g., ["Intro", "Goals"]) */
   headerPath: string[];
+  /** Optional source URL inferred from special "Sources" blocks */
+  sourceLink?: string;
 }
 
 export interface Chunk {
@@ -48,7 +54,10 @@ interface HeaderToken {
 interface CodeBlockToken {
   start: number;
   end: number;
-  fence: '```' | '~~~';
+  fenceChar: '`' | '~';
+  fenceLen: number;
+  closed: boolean;
+  breakable: boolean;
   infoString?: string; // e.g. "ts", "python"
 }
 
@@ -60,6 +69,7 @@ interface Segment {
 interface Tokens {
   headers: HeaderToken[];
   codeBlocks: CodeBlockToken[];
+  sourceRanges: Array<{ start: number; end: number; url: string }>;
 }
 
 export class RecursiveMarkdownSplitter {
@@ -72,6 +82,9 @@ export class RecursiveMarkdownSplitter {
       overlap: options.overlap ?? 256,
       headerLevels: options.headerLevels ?? [1, 2],
       preserveCodeBlocks: options.preserveCodeBlocks ?? true,
+      codeBlockMaxChars:
+        options.codeBlockMaxChars ?? (options.maxChars ?? 2048) * 2,
+      fallbackCloseOnNestedOpen: options.fallbackCloseOnNestedOpen ?? true,
       idPrefix: options.idPrefix ?? '',
       trim: options.trim ?? true,
     };
@@ -157,6 +170,7 @@ export class RecursiveMarkdownSplitter {
       nonEmptyChunks,
       normalizedMarkdown,
       tokens.headers,
+      tokens.sourceRanges,
     );
   }
 
@@ -166,6 +180,7 @@ export class RecursiveMarkdownSplitter {
   private tokenize(markdown: string): Tokens {
     const headers: HeaderToken[] = [];
     const codeBlocks: CodeBlockToken[] = [];
+    const sourceRanges = this.parseSourceRanges(markdown);
 
     // Find all headers
     const headerRegex = /^(#{1,6})\s+(.+?)(?:\s*#*)?$/gm;
@@ -190,7 +205,90 @@ export class RecursiveMarkdownSplitter {
       );
     });
 
-    return { headers: filteredHeaders, codeBlocks };
+    return { headers: filteredHeaders, codeBlocks, sourceRanges };
+  }
+
+  /**
+   * Parse special formatted Sources blocks and compute active source ranges
+   * A block looks like:
+   * ---\n
+   * Sources:\n
+   * - https://example.com/a\n
+   * - https://example.com/b\n
+   * ---
+   * Active source becomes the first URL and applies from the end of the block
+   * until the start of the next Sources block (or end of document).
+   */
+  private parseSourceRanges(markdown: string): Array<{ start: number; end: number; url: string }> {
+    const lines = markdown.split('\n');
+    const ranges: Array<{ start: number; end: number; url: string }> = [];
+
+    // Build cumulative char index per line start
+    const lineStartIdx: number[] = new Array(lines.length);
+    let acc = 0;
+    for (let i = 0; i < lines.length; i++) {
+      lineStartIdx[i] = acc;
+      acc += lines[i]!.length + 1; // +1 for \n
+    }
+
+    const isDashLine = (s: string) => /^\s*---\s*$/.test(s);
+    const isSourcesHeader = (s: string) => /^\s*Sources:\s*$/i.test(s);
+    const firstUrlInList = (startLine: number): string | undefined => {
+      for (let j = startLine; j < lines.length; j++) {
+        const l = lines[j]!;
+        if (isDashLine(l)) break; // stop at closing ---
+        const m = l.match(/^\s*[-*]\s+(\S+)/);
+        if (m) {
+          const url = m[1]!;
+          if (/^https?:\/\//i.test(url)) return url;
+        }
+      }
+      return undefined;
+    };
+
+    // Locate all source blocks (start/end + first URL)
+    const blocks: Array<{ blockStartLine: number; blockEndLine: number; firstUrl?: string }> = [];
+    for (let i = 0; i < lines.length; i++) {
+      if (!isDashLine(lines[i]!)) continue;
+      // Scan ahead for Sources: header within the dashed block
+      let j = i + 1;
+      // Skip blank lines
+      while (j < lines.length && /^\s*$/.test(lines[j]!)) j++;
+      if (j < lines.length && isSourcesHeader(lines[j]!)) {
+        // Find closing ---
+        let k = j + 1;
+        while (k < lines.length && !isDashLine(lines[k]!)) k++;
+        if (k < lines.length && isDashLine(lines[k]!)) {
+          const firstUrl = firstUrlInList(j + 1);
+          blocks.push({ blockStartLine: i, blockEndLine: k, firstUrl });
+          i = k; // advance to end of block
+        }
+      }
+    }
+
+    // Build ranges from blocks
+    if (blocks.length === 0) return ranges;
+    const docLen = markdown.length;
+    for (let b = 0; b < blocks.length; b++) {
+      const block = blocks[b]!;
+      const nextBlock = blocks[b + 1];
+
+      const blockEndLineIdx = block.blockEndLine;
+      const blockEndAbs = lineStartIdx[blockEndLineIdx] ?? 0;
+      const blockEndLineStr = lines[blockEndLineIdx] ?? '';
+      const start = blockEndAbs + blockEndLineStr.length + 1; // after closing --- newline
+
+      const nextStart = nextBlock
+        ? (lineStartIdx[nextBlock.blockStartLine] ?? docLen)
+        : docLen;
+
+      const url = block.firstUrl || '';
+      if (url && start < nextStart) {
+        ranges.push({ start, end: nextStart, url });
+      }
+    }
+
+    return ranges;
   }
 
   /**
@@ -199,41 +297,104 @@ export class RecursiveMarkdownSplitter {
   private findCodeBlocks(markdown: string, codeBlocks: CodeBlockToken[]): void {
     const lines = markdown.split('\n');
     let inCodeBlock = false;
-    let currentBlock: Partial<CodeBlockToken> | null = null;
+    let currentBlock:
+      | (Partial<CodeBlockToken> & { fenceChar: '`' | '~'; fenceLen: number })
+      | null = null;
     let charIndex = 0;
 
-    for (let i = 0; i < lines.length; i++) {
-      const line = lines[i];
-      const fenceMatch = line!.match(/^(```+|~~~+)(.*)$/);
-
-      if (fenceMatch) {
-        const fence = fenceMatch[1]!.substring(0, 3) as '```' | '~~~';
+    const openRe = /^\s{0,3}([`~]{3,})(.*)$/; // allow up to 3 leading spaces
+    const makeCloseRe = (ch: '`' | '~', n: number) =>
+      new RegExp(`^\\s{0,3}(${ch === '`' ? '\\`' : '~'}{${n},})\\s*$`);
 
-        if (!inCodeBlock) {
-          // Starting a code block
+    for (let i = 0; i < lines.length; i++) {
+      const line = lines[i] as string;
+      const open = line.match(openRe);
+
+      if (!inCodeBlock) {
+        if (open) {
+          const fenceStr = open[1] as string;
+          const ch = (fenceStr[0] === '`' ? '`' : '~') as '`' | '~';
+          const len = fenceStr.length;
+          const info = (open[2] || '').trim() || undefined;
           inCodeBlock = true;
           currentBlock = {
             start: charIndex,
-            fence,
-            infoString: fenceMatch[2]!.trim() || undefined,
+            fenceChar: ch,
+            fenceLen: len,
+            infoString: info,
+          };
+        }
+      } else {
+        const ch = currentBlock!.fenceChar;
+        const n = currentBlock!.fenceLen;
+        const closeRe = makeCloseRe(ch, n);
+        if (closeRe.test(line)) {
+          const end = charIndex + line.length;
+          const token: CodeBlockToken = {
+            start: currentBlock!.start!,
+            end,
+            fenceChar: ch,
+            fenceLen: n,
+            closed: true,
+            breakable: false,
+            infoString: currentBlock!.infoString,
           };
-        } else if (currentBlock && line!.startsWith(currentBlock.fence!)) {
-          // Ending a code block
-          currentBlock.end = charIndex + line!.length;
-          codeBlocks.push(currentBlock as CodeBlockToken);
+          codeBlocks.push(token);
           inCodeBlock = false;
           currentBlock = null;
+        } else if (this.options.fallbackCloseOnNestedOpen && open) {
+          // Nested opening while open: fallback-close previous as malformed
+          const end = Math.max(0, charIndex - 1);
+          const malformed: CodeBlockToken = {
+            start: currentBlock!.start!,
+            end,
+            fenceChar: currentBlock!.fenceChar,
+            fenceLen: currentBlock!.fenceLen,
+            closed: false,
+            breakable: true,
+            infoString: currentBlock!.infoString,
+          };
+          codeBlocks.push(malformed);
+
+          // Start new block at current line
+          const fenceStr = open[1] as string;
+          const ch2 = (fenceStr[0] === '`' ? '`' : '~') as '`' | '~';
+          const len2 = fenceStr.length;
+          const info2 = (open[2] || '').trim() || undefined;
+          currentBlock = {
+            start: charIndex,
+            fenceChar: ch2,
+            fenceLen: len2,
+            infoString: info2,
+          };
+          inCodeBlock = true;
         }
       }
 
-      charIndex += line!.length + 1; // +1 for newline
+      charIndex += line.length + 1;
     }
 
-    // Handle unclosed code block
     if (currentBlock && inCodeBlock) {
-      logger.warn(
-        'Unclosed code block detected, treating remaining content as plain text',
-      );
+      logger.warn('Unclosed code block detected (EOF). Marking as breakable');
+      const token: CodeBlockToken = {
+        start: currentBlock.start!,
+        end: markdown.length,
+        fenceChar: currentBlock.fenceChar,
+        fenceLen: currentBlock.fenceLen,
+        closed: false,
+        breakable: true,
+        infoString: currentBlock.infoString,
+      };
+      codeBlocks.push(token);
+    }
+
+    // Set breakable on large closed blocks
+    const maxSize = this.options.codeBlockMaxChars ?? this.options.maxChars * 2;
+    for (const b of codeBlocks) {
+      if (b.closed) {
+        const size = b.end - b.start;
+        if (size > maxSize) b.breakable = true;
+      }
     }
   }
 
@@ -313,7 +474,7 @@ export class RecursiveMarkdownSplitter {
       (h) =>
         h.start >= segment.start &&
         h.end <= segment.end &&
-        this.options.headerLevels.includes(h.level as 1 | 2),
+        this.options.headerLevels.includes(h.level as 1 | 2 | 3),
     );
 
     if (segmentHeaders.length === 0) {
@@ -391,9 +552,9 @@ export class RecursiveMarkdownSplitter {
 
     // Collect all valid split points
     while ((match = paragraphRegex.exec(segmentText)) !== null) {
-      const splitPoint = segment.start + match.index + match[0].length;
-      // Check if split point is inside a code block
-      if (!this.isInsideCodeBlock(splitPoint, codeBlocks)) {
+      const splitPointAbs = segment.start + match.index + match[0].length;
+      const enclosing = this.getEnclosingCodeBlock(splitPointAbs, codeBlocks);
+      if (!enclosing || enclosing.breakable) {
         splitPoints.push(match.index + match[0].length);
       }
     }
@@ -442,7 +603,8 @@ export class RecursiveMarkdownSplitter {
         currentLength > 0
       ) {
         // Check if we can split here
-        if (!this.isInsideCodeBlock(lineStart, codeBlocks)) {
+        const enclosing = this.getEnclosingCodeBlock(lineStart, codeBlocks);
+        if (!enclosing || enclosing.breakable) {
           segments.push({
             start: currentStart,
             end: lineStart,
@@ -477,9 +639,17 @@ export class RecursiveMarkdownSplitter {
     position: number,
     codeBlocks: CodeBlockToken[],
   ): boolean {
-    return codeBlocks.some(
-      (block) => position >= block.start && position < block.end,
-    );
+    return this.getEnclosingCodeBlock(position, codeBlocks) !== null;
+  }
+
+  private getEnclosingCodeBlock(
+    position: number,
+    codeBlocks: CodeBlockToken[],
+  ): CodeBlockToken | null {
+    for (const block of codeBlocks) {
+      if (position > block.start && position < block.end) return block;
+    }
+    return null;
   }
 
   /**
@@ -550,24 +720,40 @@ export class RecursiveMarkdownSplitter {
       }
     }
 
-    // Final pass: ensure no segment ends in the middle of a code block
+    // Final pass: adjust boundaries so that no segment starts or ends inside a non-breakable code block,
+    // and ensure segments are non-overlapping and ordered.
     const finalSegments: Segment[] = [];
+    let prevEnd: number | null = null;
+
     for (const segment of mergedSegments) {
-      let adjustedEnd = segment.end;
+      let start = segment.start;
+      let end = segment.end;
 
-      // Check if segment end is inside a code block
+      // If end falls inside a non-breakable code block, advance it to the block end
       for (const block of codeBlocks) {
-        if (segment.end > block.start && segment.end < block.end) {
-          // Extend to include the entire code block
-          adjustedEnd = block.end;
+        if (end > block.start && end < block.end && !block.breakable) {
+          end = block.end;
           break;
         }
       }
 
-      finalSegments.push({
-        start: segment.start,
-        end: adjustedEnd,
-      });
+      // If start falls inside a non-breakable code block, move start to the block end
+      for (const block of codeBlocks) {
+        if (start > block.start && start < block.end && !block.breakable) {
+          start = block.end;
+          break;
+        }
+      }
+
+      // Ensure monotonic, non-overlapping segments
+      if (prevEnd !== null && start < prevEnd) {
+        start = prevEnd;
+      }
+
+      if (start < end) {
+        finalSegments.push({ start, end });
+        prevEnd = end;
+      }
     }
 
     return finalSegments;
@@ -597,56 +783,31 @@ export class RecursiveMarkdownSplitter {
 
     for (let i = 0; i < segments.length; i++) {
       const segment = segments[i]!;
-      let content = markdown.slice(segment.start, segment.end);
-      let chunkStart = segment.start;
 
-      // For chunks after the first, prepend overlap from previous segment
+      // Compute absolute start with overlap, but never start inside a non-breakable code block
+      let chunkStartAbs = segment.start;
       if (i > 0 && this.options.overlap > 0) {
         const prevSegment = segments[i - 1]!;
-        const prevContent = markdown.slice(prevSegment.start, prevSegment.end);
-
-        // Calculate how much overlap to take from the previous segment
-        const overlapLength = Math.min(
-          this.options.overlap,
-          prevContent.length,
+        const desired = Math.max(
+          prevSegment.end - Math.min(this.options.overlap, prevSegment.end - prevSegment.start),
+          prevSegment.start,
         );
-        let overlapStart = prevContent.length - overlapLength;
-
-        // Check if the overlap would start in the middle of a code block
-        const overlapAbsoluteStart = prevSegment.start + overlapStart;
-        for (const block of codeBlocks) {
-          if (
-            overlapAbsoluteStart > block.start &&
-            overlapAbsoluteStart < block.end
-          ) {
-            // Overlap would start inside a code block
-            if (block.end <= prevSegment.end) {
-              // The code block ends within the previous segment
-              // Start overlap after the code block to avoid duplication
-              const blockEndInSegment = block.end - prevSegment.start;
-              if (blockEndInSegment < prevContent.length) {
-                overlapStart = blockEndInSegment;
-              }
-            }
-            break;
-          }
+        chunkStartAbs = desired;
+        const enclosing = this.getEnclosingCodeBlock(chunkStartAbs, codeBlocks);
+        if (enclosing && !enclosing.breakable) {
+          // Move start to end of the enclosing non-breakable block
+          chunkStartAbs = enclosing.end;
         }
-
-        // Extract overlap text from the adjusted position
-        const overlapText = prevContent.slice(overlapStart);
-
-        // Prepend overlap to current content
-        content = overlapText + content;
-
-        // Track where the actual content starts (including overlap)
-        chunkStart = prevSegment.start + overlapStart;
       }
 
+      // Extract content from chunkStartAbs to segment.end
+      let content = markdown.slice(chunkStartAbs, segment.end);
+
       chunks.push({
         content: this.options.trim ? content.trim() : content,
-        start: chunkStart, // Now reflects the actual start including overlap
+        start: chunkStartAbs,
         end: segment.end,
-        overlapStart: i > 0 ? segment.start : undefined, // Original segment start for reference
+        overlapStart: i > 0 ? segment.start : undefined,
       });
     }
 
@@ -660,6 +821,7 @@ export class RecursiveMarkdownSplitter {
     rawChunks: Array<{ content: string; start: number; end: number }>,
     markdown: string,
     headers: HeaderToken[],
+    sourceRanges?: Array<{ start: number; end: number; url: string }>,
   ): Chunk[] {
     const chunks: Chunk[] = [];
     const titleCounts = new Map<string, number>();
@@ -686,22 +848,16 @@ export class RecursiveMarkdownSplitter {
 
       headerPath = headerStack.map((h) => h.text);
 
-      // Find title from configured levels - check headers within the chunk first
-      const headersInChunk = headers.filter(
-        (h) =>
-          h.start >= rawChunk.start &&
-          h.start < rawChunk.end &&
-          this.options.headerLevels.includes(h.level as 1 | 2),
-      );
-
-      if (headersInChunk.length > 0) {
-        // Use the first configured header within the chunk
-        title = headersInChunk[0]!.text;
+      // Prefer the deepest header in the path (e.g., H3) for specificity
+      if (headerPath.length > 0) {
+        title = headerPath[headerPath.length - 1]!;
       } else {
-        // Otherwise, use the last configured header before the chunk
+        // Fallback: use last configured header before the chunk if any
         for (let i = headerStack.length - 1; i >= 0; i--) {
           if (
-            this.options.headerLevels.includes(headerStack[i]!.level as 1 | 2)
+            this.options.headerLevels.includes(
+              headerStack[i]!.level as 1 | 2 | 3,
+            )
           ) {
             title = headerStack[i]!.text;
             break;
@@ -719,6 +875,21 @@ export class RecursiveMarkdownSplitter {
         ? `${this.options.idPrefix}-${slug}-${count}`
         : `${slug}-${count}`;
 
+      // Determine sourceLink based on active source ranges: prefer segment start (no overlap)
+      let sourceLink: string | undefined = undefined;
+      const anchorPos = (rawChunk as any).overlapStart ?? rawChunk.start;
+      if (sourceRanges && sourceRanges.length > 0) {
+        const s = anchorPos as number;
+        for (const r of sourceRanges) {
+          if (s >= r.start && s < r.end) {
+            sourceLink = r.url;
+            break;
+          }
+        }
+      }
+
+      console.log(`Chunk Title: ${title}, Source link: ${sourceLink}`);
+
       chunks.push({
         content: rawChunk.content,
         meta: {
@@ -728,6 +899,7 @@ export class RecursiveMarkdownSplitter {
           startChar: rawChunk.start,
           endChar: rawChunk.end,
           headerPath,
+          sourceLink,
         },
       });
     }
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts
new file mode 100644
index 0000000..d3e815b
--- /dev/null
+++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts
@@ -0,0 +1,60 @@
+import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter';
+
+function getCodeBlockRanges(text: string): Array<{ start: number; end: number }> {
+  const ranges: Array<{ start: number; end: number }> = [];
+  const re = /```[\s\S]*?```/g;
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(text)) !== null) {
+    ranges.push({ start: m.index, end: m.index + m[0]!.length });
+  }
+  return ranges;
+}
+
+function isInside(pos: number, ranges: Array<{ start: number; end: number }>): boolean {
+  return ranges.some((r) => pos > r.start && pos < r.end);
+}
+
+describe('RecursiveMarkdownSplitter - No chunk starts inside code block', () => {
+  it('ensures chunk starts are never within fenced code blocks even with overlap', () => {
+    const longCode = Array.from({ length: 60 }, (_, i) => `line ${i} of code`).join('\n');
+    const md = `# Section One\n\nIntro paragraph text that will be part of the first section.\n\n\n## Subsection\n\nSome text before a large code block.\n\n
+\`\`\`cairo
+fn initializer(ref self: ContractState, owner: ContractAddress) {
+    // example
+    let x = 0;\n${longCode}
+}
+\`\`\`
+
+After the code block there is trailing text to encourage multiple segments and overlap across chunk boundaries. This text continues for a while to ensure we have a next chunk that might try to overlap into the previous code block.`;
+
+    const splitter = new RecursiveMarkdownSplitter({
+      maxChars: 200,
+      minChars: 0,
+      overlap: 50,
+      headerLevels: [1, 2],
+      preserveCodeBlocks: true,
+      trim: false,
+    });
+
+    const chunks = splitter.splitMarkdownToChunks(md);
+    const ranges = getCodeBlockRanges(md);
+
+    // Assert: No chunk start lies strictly inside any NON-breakable fenced code block
+    // Breakable threshold mirrors splitter default: 2x maxChars = 400
+    const codeBlockMaxChars = 400;
+    for (const c of chunks) {
+      const pos = c.meta.startChar;
+      const insideRanges = ranges.filter((r) => pos > r.start && pos < r.end);
+      if (insideRanges.length === 0) continue;
+      // If inside a code block, only allow if that block is oversized (breakable)
+      const smallest = insideRanges.reduce((acc, r) => {
+        if (!acc) return r;
+        const accLen = acc.end - acc.start;
+        const rLen = r.end - r.start;
+        return rLen < accLen ? r : acc;
+      }, insideRanges[0]!);
+      const len = (smallest?.end ?? 0) - (smallest?.start ?? 0);
+      expect(len).toBeGreaterThan(codeBlockMaxChars);
+    }
+  });
+});
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts
new file mode 100644
index 0000000..5de377b
--- /dev/null
+++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sourceOverlap.test.ts
@@ -0,0 +1,33 @@
+import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter';
+
+describe('RecursiveMarkdownSplitter - sourceLink mapping with overlap', () => {
+  it('should use segment start (no-overlap) to resolve sourceLink so it is never undefined', () => {
+    const splitter = new RecursiveMarkdownSplitter({
+      maxChars: 60,
+      minChars: 0,
+      overlap: 20,
+      headerLevels: [1, 2, 3],
+    });
+
+    const md = `---
+
+Sources:
+
+- https://example.com/a
+
+---
+
+# Title
+
+Paragraph one is long enough to cause splitting when combined with overlap. This ensures chunk starts may fall before the source range while the segment starts after it.`;
+
+    const chunks = splitter.splitMarkdownToChunks(md);
+    expect(chunks.length).toBeGreaterThan(1);
+    // All non-ROOT chunks (after first header) should have a sourceLink
+    for (const c of chunks) {
+      if (c.meta.title !== 'ROOT') {
+        expect(c.meta.sourceLink).toBe('https://example.com/a');
+      }
+    }
+  });
+});
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts
new file mode 100644
index 0000000..01f1b4b
--- /dev/null
+++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts
@@ -0,0 +1,58 @@
+import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter';
+
+describe('RecursiveMarkdownSplitter - Sources block activeSource mapping', () => {
+  it('assigns sourceLink from first URL in Sources block to subsequent chunks', () => {
+    const splitter = new RecursiveMarkdownSplitter({
+      maxChars: 120,
+      minChars: 0,
+      overlap: 0,
+      headerLevels: [1, 2],
+    });
+
+    const text = `
+  ---
+
+Sources:
+
+- https://www.starknet.io/cairo-book/ch00-00-introduction.html
+- https://www.starknet.io/cairo-book/ch00-01-foreword.html
+
+---
+
+# The Cairo Book: Introduction and Learning Resources
+
+Some introduction text.
+
+---
+
+Sources:
+
+- https://www.starknet.io/cairo-book/
+
+---
+
+## About The Cairo Book
+
+More details here.`;
+
+    const chunks = splitter.splitMarkdownToChunks(text);
+
+    // Find chunk under the first H1
+    const introChunk = chunks.find((c) =>
+      c.content.includes('# The Cairo Book: Introduction and Learning Resources'),
+    );
+    expect(introChunk).toBeDefined();
+    expect(introChunk!.meta.sourceLink).toBe(
+      'https://www.starknet.io/cairo-book/ch00-00-introduction.html',
+    );
+
+    // Find chunk under the second header (H2), after second Sources block
+    const aboutChunk = chunks.find((c) =>
+      c.content.includes('## About The Cairo Book'),
+    );
+    expect(aboutChunk).toBeDefined();
+    expect(aboutChunk!.meta.sourceLink).toBe(
+      'https://www.starknet.io/cairo-book/',
+    );
+  });
+});
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts
index a7ac549..ca8ba7f 100644
--- a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts
+++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts
@@ -123,6 +123,26 @@ More content.`;
 
       expect(chunks[0]!.meta.title).toBe('Header with trailing hashes');
     });
+
+    it('should prefer deepest header (e.g., H3) for title', () => {
+      const splitter = new RecursiveMarkdownSplitter({
+        maxChars: 80,
+        minChars: 0,
+        overlap: 0,
+        headerLevels: [1, 2], // split only on H1/H2, but titles should use deepest header in path
+      });
+
+      const text = `# Chapter
+Intro
+
+### Specific Topic
+Detailed text that should belong to the H3.`;
+
+      const chunks = splitter.splitMarkdownToChunks(text);
+      expect(chunks.length).toBeGreaterThan(0);
+      // Title should be the deepest header in headerPath -> H3
+      expect(chunks[0]!.meta.title).toBe('Specific Topic');
+    });
   });
 
   describe('Code block handling', () => {

From 224d45029d4dfb2910cef74b71776ea718ecc5aa Mon Sep 17 00:00:00 2001
From: enitrat <msaug@protonmail.com>
Date: Sun, 5 Oct 2025 10:55:20 +0200
Subject: [PATCH 2/2] fmt

---
 ingesters/src/utils/RecursiveMarkdownSplitter.ts   | 13 ++++++++++---
 ...MarkdownSplitter.noStartInsideCodeBlock.test.ts | 14 +++++++++++---
 .../RecursiveMarkdownSplitter.sources.test.ts      |  4 +++-
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/ingesters/src/utils/RecursiveMarkdownSplitter.ts b/ingesters/src/utils/RecursiveMarkdownSplitter.ts
index 0541cb3..fcbc8a0 100644
--- a/ingesters/src/utils/RecursiveMarkdownSplitter.ts
+++ b/ingesters/src/utils/RecursiveMarkdownSplitter.ts
@@ -219,7 +219,9 @@ export class RecursiveMarkdownSplitter {
    * Active source becomes the first URL and applies from the end of the block
    * until the start of the next Sources block (or end of document).
    */
-  private parseSourceRanges(markdown: string): Array<{ start: number; end: number; url: string }> {
+  private parseSourceRanges(
+    markdown: string,
+  ): Array<{ start: number; end: number; url: string }> {
     const lines = markdown.split('\n');
     const ranges: Array<{ start: number; end: number; url: string }> = [];
 
@@ -247,7 +249,11 @@ export class RecursiveMarkdownSplitter {
     };
 
     // Locate all source blocks (start/end + first URL)
-    const blocks: Array<{ blockStartLine: number; blockEndLine: number; firstUrl?: string }> = [];
+    const blocks: Array<{
+      blockStartLine: number;
+      blockEndLine: number;
+      firstUrl?: string;
+    }> = [];
     for (let i = 0; i < lines.length; i++) {
       if (!isDashLine(lines[i]!)) continue;
       // Scan ahead for Sources: header within the dashed block
@@ -789,7 +795,8 @@ export class RecursiveMarkdownSplitter {
       if (i > 0 && this.options.overlap > 0) {
         const prevSegment = segments[i - 1]!;
         const desired = Math.max(
-          prevSegment.end - Math.min(this.options.overlap, prevSegment.end - prevSegment.start),
+          prevSegment.end -
+            Math.min(this.options.overlap, prevSegment.end - prevSegment.start),
           prevSegment.start,
         );
         chunkStartAbs = desired;
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts
index d3e815b..08eeef2 100644
--- a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts
+++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.noStartInsideCodeBlock.test.ts
@@ -1,6 +1,8 @@
 import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter';
 
-function getCodeBlockRanges(text: string): Array<{ start: number; end: number }> {
+function getCodeBlockRanges(
+  text: string,
+): Array<{ start: number; end: number }> {
   const ranges: Array<{ start: number; end: number }> = [];
   const re = /```[\s\S]*?```/g;
   let m: RegExpExecArray | null;
@@ -10,13 +12,19 @@ function getCodeBlockRanges(text: string): Array<{ start: number; end: number }>
   return ranges;
 }
 
-function isInside(pos: number, ranges: Array<{ start: number; end: number }>): boolean {
+function isInside(
+  pos: number,
+  ranges: Array<{ start: number; end: number }>,
+): boolean {
   return ranges.some((r) => pos > r.start && pos < r.end);
 }
 
 describe('RecursiveMarkdownSplitter - No chunk starts inside code block', () => {
   it('ensures chunk starts are never within fenced code blocks even with overlap', () => {
-    const longCode = Array.from({ length: 60 }, (_, i) => `line ${i} of code`).join('\n');
+    const longCode = Array.from(
+      { length: 60 },
+      (_, i) => `line ${i} of code`,
+    ).join('\n');
     const md = `# Section One\n\nIntro paragraph text that will be part of the first section.\n\n\n## Subsection\n\nSome text before a large code block.\n\n
 \`\`\`cairo
 fn initializer(ref self: ContractState, owner: ContractAddress) {
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts
index 01f1b4b..fd67e6c 100644
--- a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts
+++ b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.sources.test.ts
@@ -39,7 +39,9 @@ More details here.`;
 
     // Find chunk under the first H1
     const introChunk = chunks.find((c) =>
-      c.content.includes('# The Cairo Book: Introduction and Learning Resources'),
+      c.content.includes(
+        '# The Cairo Book: Introduction and Learning Resources',
+      ),
     );
     expect(introChunk).toBeDefined();
     expect(introChunk!.meta.sourceLink).toBe(