/**
 * Chunk the Cairo Book summary file using RecursiveMarkdownSplitter.
 *
 * The markdown is split recursively (H1/H2 headers first, then paragraphs,
 * then lines) without cutting fenced code blocks, and consecutive chunks
 * share a backward overlap. Each resulting chunk is wrapped in a LangChain
 * `Document` whose metadata carries the chunk title, its per-title index,
 * a content hash and the splitter-generated unique ID.
 *
 * @param text - The markdown content to chunk
 * @returns Array of document chunks, in document order
 */
async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
  // Splitter settings: ~2k-character chunks, small pieces merged, 256 chars of overlap.
  const splitOptions: SplitOptions = {
    maxChars: 2048,
    minChars: 500,
    overlap: 256,
    headerLevels: [1, 2], // Split on H1 and H2 headers
    preserveCodeBlocks: true,
    idPrefix: 'cairo-book',
    trim: true,
  };

  const pieces = new RecursiveMarkdownSplitter(
    splitOptions,
  ).splitMarkdownToChunks(text);

  logger.info(
    `Created ${pieces.length} chunks using RecursiveMarkdownSplitter`,
  );

  // Convert splitter chunks to the Document shape used by the ingester.
  return pieces.map(
    ({ content, meta }) =>
      new Document<BookChunk>({
        pageContent: content,
        metadata: {
          name: meta.title,
          title: meta.title,
          chunkNumber: meta.chunkNumber, // Already 0-based
          contentHash: calculateHash(content),
          uniqueId: meta.uniqueId,
          sourceLink: '',
          source: this.source,
        },
      }),
  );
}
/**
 * Chunk the core library summary file using RecursiveMarkdownSplitter.
 *
 * The markdown is split recursively (H1/H2 headers first, then paragraphs,
 * then lines) without cutting fenced code blocks, and consecutive chunks
 * share a backward overlap. Each resulting chunk is wrapped in a LangChain
 * `Document` whose metadata carries the chunk title, its per-title index,
 * a content hash and the splitter-generated unique ID.
 *
 * @param text - The markdown content to chunk
 * @returns Array of document chunks, in document order
 */
async chunkCorelibSummaryFile(text: string): Promise<Document<BookChunk>[]> {
  logger.info(
    'Using RecursiveMarkdownSplitter to chunk Core Library documentation',
  );

  // Splitter settings: ~2k-character chunks, small pieces merged, 256 chars of overlap.
  const splitOptions: SplitOptions = {
    maxChars: 2048,
    minChars: 500,
    overlap: 256,
    headerLevels: [1, 2], // Split on H1 and H2 headers
    preserveCodeBlocks: true,
    idPrefix: 'corelib',
    trim: true,
  };

  const pieces = new RecursiveMarkdownSplitter(
    splitOptions,
  ).splitMarkdownToChunks(text);

  logger.info(
    `Created ${pieces.length} chunks using RecursiveMarkdownSplitter`,
  );

  // Convert splitter chunks to the Document shape used by the ingester.
  return pieces.map(
    ({ content, meta }) =>
      new Document<BookChunk>({
        pageContent: content,
        metadata: {
          name: meta.title,
          title: meta.title,
          chunkNumber: meta.chunkNumber, // Already 0-based
          contentHash: calculateHash(content),
          uniqueId: meta.uniqueId,
          sourceLink: '',
          source: this.source,
        },
      }),
  );
}
// Public API interfaces
export interface SplitOptions {
  /** Maximum characters per chunk (UTF-16 .length), not counting overlap. Default: 2048 */
  maxChars?: number;
  /** Minimum characters per chunk. Chunks smaller than this will be merged with adjacent chunks. Default: 500 */
  minChars?: number;
  /** Characters of backward overlap between consecutive chunks. Default: 256 */
  overlap?: number;
  /** Which header levels (1-6) are allowed as primary split points. Default: [1, 2] */
  headerLevels?: (1 | 2 | 3 | 4 | 5 | 6)[];
  /** If true, do not split inside fenced code blocks. Default: true */
  preserveCodeBlocks?: boolean;
  /** Optional prefix for generated unique IDs */
  idPrefix?: string;
  /** Whether to trim whitespace around chunks. Default: true */
  trim?: boolean;
}

export interface ChunkMeta {
  /** Title derived from the last seen header among the configured levels ('ROOT' if none) */
  title: string;
  /** Index of this chunk for the given title (0-based) */
  chunkNumber: number;
  /**
   * Globally unique ID: `${slug(title)}-${n}` (plus idPrefix if provided).
   * `n` equals chunkNumber unless two different titles share a slug, in which
   * case it keeps counting per slug so IDs never collide.
   */
  uniqueId: string;
  /**
   * Inclusive start & exclusive end character offsets, measured in the input
   * after CRLF line endings have been normalized to LF. startChar includes
   * the prepended overlap.
   */
  startChar: number;
  endChar: number;
  /** Full header path stack at the end of the chunk (e.g., ["Intro", "Goals"]) */
  headerPath: string[];
}

export interface Chunk {
  content: string;
  meta: ChunkMeta;
}

// Internal data structures
interface HeaderToken {
  level: number; // 1..6
  text: string;
  start: number; // index in the normalized string
  end: number; // exclusive, end of the header line (newline not included)
}

interface CodeBlockToken {
  start: number; // start of the opening fence line
  end: number; // exclusive, end of the closing fence line (newline not included)
  fence: '```' | '~~~';
  infoString?: string; // e.g. "ts", "python"
}

interface Segment {
  start: number;
  end: number;
}

interface Tokens {
  headers: HeaderToken[];
  codeBlocks: CodeBlockToken[];
}

/** A segment with its assembled text, before title/ID metadata is attached. */
interface RawChunk {
  content: string;
  /** Start offset including any prepended overlap */
  start: number;
  end: number;
  /** Where the chunk's own segment begins, i.e. where the overlap ends (unset for the first chunk) */
  overlapStart?: number;
}

/**
 * Splits markdown into size-bounded chunks.
 *
 * Strategy: split on configured header levels, then paragraph breaks, then
 * line breaks, recursing until every piece fits `maxChars` (or cannot be
 * split further); merge pieces smaller than `minChars`; prepend `overlap`
 * characters of the previous piece; finally attach title/ID metadata.
 */
export class RecursiveMarkdownSplitter {
  /**
   * ATX header on a single line. `[ \t]` (not `\s`) keeps the match from
   * running onto following lines, and a closing `#` run is only stripped
   * when preceded by whitespace so titles such as "C#" stay intact.
   */
  private static readonly HEADER_LINE =
    /^(#{1,6})[ \t]+(.+?)(?:[ \t]+#+)?[ \t]*$/gm;
  /** Fence line: optional indentation, 3+ backticks or tildes, then the rest of the line. */
  private static readonly FENCE_LINE = /^[ \t]*(`{3,}|~{3,})(.*)$/;
  /** Paragraph break: two or more consecutive newlines. */
  private static readonly PARAGRAPH_BREAK = /\n\n+/g;
  /** The trailing tiny chunk may push its predecessor up to maxChars * this factor. */
  private static readonly FINAL_MERGE_SLACK = 1.5;

  private readonly options: Required<SplitOptions>;

  /**
   * @param options - Splitting options; every field is optional
   * @throws Error if the options are out of range or mutually inconsistent
   */
  constructor(options: SplitOptions = {}) {
    this.options = {
      maxChars: options.maxChars ?? 2048,
      minChars: options.minChars ?? 500,
      overlap: options.overlap ?? 256,
      headerLevels: options.headerLevels ?? [1, 2],
      preserveCodeBlocks: options.preserveCodeBlocks ?? true,
      idPrefix: options.idPrefix ?? '',
      trim: options.trim ?? true,
    };

    // Validate options
    if (this.options.maxChars <= 0) {
      throw new Error(
        `maxChars must be positive, got ${this.options.maxChars}`,
      );
    }
    if (this.options.minChars < 0) {
      throw new Error(
        `minChars must be non-negative, got ${this.options.minChars}`,
      );
    }
    if (this.options.overlap < 0) {
      throw new Error(
        `overlap must be non-negative, got ${this.options.overlap}`,
      );
    }
    if (this.options.overlap >= this.options.maxChars) {
      throw new Error(
        `Overlap (${this.options.overlap}) must be less than maxChars (${this.options.maxChars})`,
      );
    }
    if (this.options.minChars >= this.options.maxChars) {
      throw new Error(
        `minChars (${this.options.minChars}) must be less than maxChars (${this.options.maxChars})`,
      );
    }
    if (this.options.headerLevels.length === 0) {
      throw new Error('headerLevels must contain at least one level');
    }
    if (this.options.headerLevels.some((level) => level < 1 || level > 6)) {
      throw new Error('headerLevels must contain values between 1 and 6');
    }
  }

  /**
   * Main entry point to split markdown into chunks.
   *
   * @param markdown - Markdown source; CRLF line endings are normalized to LF
   * @returns Chunks in document order; empty for empty/whitespace-only input
   */
  public splitMarkdownToChunks(markdown: string): Chunk[] {
    // Handle empty input
    if (!markdown || markdown.trim().length === 0) {
      return [];
    }

    // Normalize line endings
    const normalizedMarkdown = markdown.replace(/\r\n/g, '\n');

    const tokens = this.tokenize(normalizedMarkdown);

    // Code blocks only act as "do not cut here" regions when the option asks
    // for it. Headers found inside code blocks are ignored either way.
    const protectedBlocks = this.options.preserveCodeBlocks
      ? tokens.codeBlocks
      : [];
    const splitTokens: Tokens = {
      headers: tokens.headers,
      codeBlocks: protectedBlocks,
    };

    const segments = this.recursivelySplit(
      { start: 0, end: normalizedMarkdown.length },
      normalizedMarkdown,
      splitTokens,
    );

    // Merge small segments to avoid tiny chunks, then drop pieces that hold
    // nothing but whitespace so no empty chunk is ever emitted.
    const mergedSegments = this.mergeSmallSegments(
      segments,
      normalizedMarkdown,
      protectedBlocks,
    ).filter(
      (s) => normalizedMarkdown.slice(s.start, s.end).trim().length > 0,
    );

    const rawChunks = this.assembleChunksWithOverlap(
      mergedSegments,
      normalizedMarkdown,
      protectedBlocks,
    );

    return this.attachMetadata(rawChunks, normalizedMarkdown, tokens.headers);
  }

  /**
   * Tokenize markdown to extract headers and fenced code blocks.
   * Header-looking lines that lie inside a code block are discarded.
   */
  private tokenize(markdown: string): Tokens {
    const codeBlocks: CodeBlockToken[] = [];
    this.findCodeBlocks(markdown, codeBlocks);

    const headers: HeaderToken[] = [];
    for (const match of markdown.matchAll(
      RecursiveMarkdownSplitter.HEADER_LINE,
    )) {
      const start = match.index ?? 0;
      const end = start + match[0].length;
      const insideCode = codeBlocks.some(
        (block) => start >= block.start && end <= block.end,
      );
      if (!insideCode) {
        headers.push({
          level: match[1].length,
          text: match[2].trim(),
          start,
          end,
        });
      }
    }

    return { headers, codeBlocks };
  }

  /**
   * Find all fenced code blocks in the markdown and append them to `codeBlocks`.
   *
   * A block is closed only by a fence of the same character that is at least
   * as long as the opening fence and has nothing but whitespace after it, so
   * shorter fences nested inside a longer one do not end the block. An
   * unclosed block is reported and treated as plain text.
   */
  private findCodeBlocks(markdown: string, codeBlocks: CodeBlockToken[]): void {
    let open: { start: number; marker: string; infoString?: string } | null =
      null;
    let charIndex = 0;

    for (const line of markdown.split('\n')) {
      const fenceMatch = RecursiveMarkdownSplitter.FENCE_LINE.exec(line);

      if (fenceMatch) {
        const marker = fenceMatch[1];
        const rest = fenceMatch[2];

        if (open === null) {
          // A backtick fence cannot carry backticks in its info string; such
          // a line is an inline code span (```x```), not an opening fence.
          const isInlineSpan = marker.startsWith('`') && rest.includes('`');
          if (!isInlineSpan) {
            open = {
              start: charIndex,
              marker,
              infoString: rest.trim() || undefined,
            };
          }
        } else if (
          marker[0] === open.marker[0] &&
          marker.length >= open.marker.length &&
          rest.trim() === ''
        ) {
          codeBlocks.push({
            start: open.start,
            end: charIndex + line.length,
            fence: open.marker.startsWith('`') ? '```' : '~~~',
            infoString: open.infoString,
          });
          open = null;
        }
      }

      charIndex += line.length + 1; // +1 for newline
    }

    // Handle unclosed code block
    if (open !== null) {
      logger.warn(
        'Unclosed code block detected, treating remaining content as plain text',
      );
    }
  }

  /**
   * Recursively split a segment until every piece fits maxChars.
   * Tries headers, then paragraphs, then lines; a piece that none of them
   * can divide is returned as is (and may exceed maxChars).
   */
  private recursivelySplit(
    segment: Segment,
    markdown: string,
    tokens: Tokens,
  ): Segment[] {
    const length = segment.end - segment.start;

    // Base case: segment is within size limit
    if (length <= this.options.maxChars) {
      return [segment];
    }

    // Strategies from coarsest to finest; the first one that actually
    // divides the segment wins and its pieces are split further.
    const strategies: Array<() => Segment[]> = [
      () => this.splitByHeaders(segment, markdown, tokens),
      () => this.splitByParagraphs(segment, markdown, tokens.codeBlocks),
      () => this.splitByLines(segment, markdown, tokens.codeBlocks),
    ];
    for (const split of strategies) {
      const parts = split();
      if (parts.length > 1) {
        return parts.flatMap((part) =>
          this.recursivelySplit(part, markdown, tokens),
        );
      }
    }

    // Cannot split further - return as is (exceeds maxChars)
    const isCodeBlock = tokens.codeBlocks.some(
      (block) => block.start <= segment.start && block.end >= segment.end,
    );
    if (isCodeBlock) {
      logger.warn(
        `Code block exceeds maxChars (${length} > ${this.options.maxChars})`,
      );
    } else {
      logger.warn(
        `Segment exceeds maxChars and cannot be split further (${length} > ${this.options.maxChars})`,
      );
    }

    return [segment];
  }

  /**
   * Try to split a segment at every header of a configured level.
   * Content before the first header becomes its own piece. The pieces tile
   * the segment exactly because each one ends where the next begins.
   */
  private splitByHeaders(
    segment: Segment,
    markdown: string,
    tokens: Tokens,
  ): Segment[] {
    // tokens.headers is in document order, so the starts are already sorted.
    const starts = tokens.headers
      .filter(
        (h) =>
          h.start >= segment.start &&
          h.end <= segment.end &&
          this.isSplitLevel(h.level),
      )
      .map((h) => h.start);

    if (starts.length === 0) {
      return [segment];
    }

    const boundaries = starts.filter((start) => start > segment.start);
    return this.cutAt(segment, boundaries);
  }

  /**
   * Try to split a segment at paragraph breaks (runs of blank lines).
   * The break itself stays attached to the preceding piece.
   */
  private splitByParagraphs(
    segment: Segment,
    markdown: string,
    codeBlocks: CodeBlockToken[],
  ): Segment[] {
    const segmentText = markdown.slice(segment.start, segment.end);
    const boundaries: number[] = [];

    for (const match of segmentText.matchAll(
      RecursiveMarkdownSplitter.PARAGRAPH_BREAK,
    )) {
      const boundary = segment.start + (match.index ?? 0) + match[0].length;
      // A break at the very end divides nothing; one inside a code block is off limits.
      if (
        boundary < segment.end &&
        !this.isInsideCodeBlock(boundary, codeBlocks)
      ) {
        boundaries.push(boundary);
      }
    }

    return this.cutAt(segment, boundaries);
  }

  /**
   * Try to split a segment at line starts, greedily packing lines until the
   * next one would push the piece past maxChars. Lines inside a protected
   * code block are never used as cut points, so such a piece may overflow.
   */
  private splitByLines(
    segment: Segment,
    markdown: string,
    codeBlocks: CodeBlockToken[],
  ): Segment[] {
    const lines = markdown.slice(segment.start, segment.end).split('\n');
    const boundaries: number[] = [];

    let packedLength = 0;
    let lineStart = segment.start;

    for (const line of lines) {
      const lineLength = line.length + 1; // +1 for newline
      const wouldOverflow =
        packedLength > 0 && packedLength + lineLength > this.options.maxChars;

      if (
        wouldOverflow &&
        lineStart < segment.end &&
        !this.isInsideCodeBlock(lineStart, codeBlocks)
      ) {
        boundaries.push(lineStart);
        packedLength = lineLength;
      } else {
        packedLength += lineLength;
      }

      lineStart += lineLength;
    }

    return this.cutAt(segment, boundaries);
  }

  /**
   * Cut a segment at the given ascending offsets, each strictly between the
   * segment's start and end. Returns the segment itself when there are none.
   */
  private cutAt(segment: Segment, boundaries: number[]): Segment[] {
    if (boundaries.length === 0) {
      return [segment];
    }
    const edges = [segment.start, ...boundaries, segment.end];
    return edges.slice(1).map((end, i) => ({ start: edges[i], end }));
  }

  /** Whether header `level` is one of the configured split levels. */
  private isSplitLevel(level: number): boolean {
    const levels: readonly number[] = this.options.headerLevels;
    return levels.includes(level);
  }

  /**
   * Check whether cutting at `position` would land inside a code block.
   * A cut exactly at a block's first character leaves the block whole, so
   * only positions strictly between start and end count as inside.
   */
  private isInsideCodeBlock(
    position: number,
    codeBlocks: CodeBlockToken[],
  ): boolean {
    return codeBlocks.some(
      (block) => position > block.start && position < block.end,
    );
  }

  /**
   * Merge segments that are too small with adjacent segments.
   *
   * Walking left to right, a segment is folded into the running one when
   * either of them is below minChars and the result still fits maxChars.
   * A too-small final segment is always folded in; a too-small leftover at
   * the end is folded into its predecessor if the result stays within
   * maxChars * FINAL_MERGE_SLACK.
   */
  private mergeSmallSegments(
    segments: Segment[],
    markdown: string,
    codeBlocks: CodeBlockToken[],
  ): Segment[] {
    if (segments.length <= 1) return segments;

    const { minChars, maxChars } = this.options;
    const size = (s: Segment): number => s.end - s.start;

    const mergedSegments: Segment[] = [];
    let current: Segment = { ...segments[0] };

    for (let i = 1; i < segments.length; i++) {
      const next = segments[i];
      const isLastSegment = i === segments.length - 1;
      const eitherTooSmall = size(next) < minChars || size(current) < minChars;
      const fitsTogether = size(current) + size(next) <= maxChars;

      const shouldMerge =
        (eitherTooSmall && fitsTogether) ||
        (isLastSegment && size(next) < minChars);

      if (shouldMerge) {
        current.end = next.end;
      } else {
        mergedSegments.push(current);
        current = { ...next };
      }
    }

    // The running segment is still pending; if it is tiny, try its predecessor.
    const previous = mergedSegments[mergedSegments.length - 1];
    const slackLimit = maxChars * RecursiveMarkdownSplitter.FINAL_MERGE_SLACK;
    if (
      previous !== undefined &&
      size(current) < minChars &&
      size(previous) + size(current) <= slackLimit
    ) {
      previous.end = current.end;
    } else {
      mergedSegments.push(current);
    }

    return this.keepCodeBlocksWhole(mergedSegments, codeBlocks);
  }

  /**
   * Ensure no segment ends in the middle of a code block: such a segment is
   * extended to the block's end, and the following segment(s) are shortened
   * (or dropped) so the extended text is not emitted twice.
   */
  private keepCodeBlocksWhole(
    segments: Segment[],
    codeBlocks: CodeBlockToken[],
  ): Segment[] {
    const result: Segment[] = [];

    for (const segment of segments) {
      const previous = result[result.length - 1];
      const start =
        previous !== undefined
          ? Math.max(segment.start, previous.end)
          : segment.start;

      const cutBlock = codeBlocks.find(
        (block) => segment.end > block.start && segment.end < block.end,
      );
      const end = cutBlock ? cutBlock.end : segment.end;

      // Entirely swallowed by the previous segment's extension.
      if (start >= end) continue;

      result.push({ start, end });
    }

    return result;
  }

  /**
   * Assemble chunk text, prepending up to `overlap` characters taken from
   * the end of the previous segment. The overlap never starts inside a
   * protected code block: it is moved to just after that block instead,
   * which may leave it shorter or empty.
   */
  private assembleChunksWithOverlap(
    segments: Segment[],
    markdown: string,
    codeBlocks: CodeBlockToken[],
  ): RawChunk[] {
    return segments.map((segment, i) => {
      let chunkStart = segment.start;
      let content = markdown.slice(segment.start, segment.end);

      if (i > 0 && this.options.overlap > 0) {
        const prevSegment = segments[i - 1];
        let overlapStart = Math.max(
          prevSegment.start,
          prevSegment.end - this.options.overlap,
        );

        const cutBlock = codeBlocks.find(
          (block) => overlapStart > block.start && overlapStart < block.end,
        );
        if (cutBlock) {
          overlapStart = Math.min(cutBlock.end, prevSegment.end);
        }

        content = markdown.slice(overlapStart, prevSegment.end) + content;
        chunkStart = overlapStart;
      }

      return {
        content: this.options.trim ? content.trim() : content,
        start: chunkStart, // includes the overlap
        end: segment.end,
        overlapStart: i > 0 ? segment.start : undefined,
      };
    });
  }

  /**
   * Attach title, numbering, ID and header path to each raw chunk.
   *
   * The title is the first configured-level header inside the chunk's own
   * segment (headers that only appear in the prepended overlap belong to
   * the previous chunk), falling back to the innermost configured-level
   * header still open at that point, and to 'ROOT' if there is none.
   */
  private attachMetadata(
    rawChunks: RawChunk[],
    markdown: string,
    headers: HeaderToken[],
  ): Chunk[] {
    const titleCounts = new Map<string, number>();
    const slugCounts = new Map<string, number>();

    return rawChunks.map((rawChunk) => {
      // Header stack as it stands at the end of this chunk. `headers` is in
      // document order, so we can stop at the first one past the chunk.
      const headerStack: HeaderToken[] = [];
      for (const header of headers) {
        if (header.start >= rawChunk.end) break;
        // Pop headers from stack that are same or deeper level
        while (
          headerStack.length > 0 &&
          headerStack[headerStack.length - 1].level >= header.level
        ) {
          headerStack.pop();
        }
        headerStack.push(header);
      }
      const headerPath = headerStack.map((h) => h.text);

      const ownStart = rawChunk.overlapStart ?? rawChunk.start;
      const firstOwnHeader = headers.find(
        (h) =>
          h.start >= ownStart &&
          h.start < rawChunk.end &&
          this.isSplitLevel(h.level),
      );
      const inheritedHeader = [...headerStack]
        .reverse()
        .find((h) => this.isSplitLevel(h.level));
      const title = (firstOwnHeader ?? inheritedHeader)?.text ?? 'ROOT';

      // Track chunk numbers per title (0-based)
      const chunkNumber = titleCounts.get(title) ?? 0;
      titleCounts.set(title, chunkNumber + 1);

      // IDs are numbered per slug: identical to chunkNumber in the usual
      // case, but still unique when distinct titles slugify to the same text.
      const slug = this.slugify(title);
      const slugIndex = slugCounts.get(slug) ?? 0;
      slugCounts.set(slug, slugIndex + 1);

      const uniqueId = this.options.idPrefix
        ? `${this.options.idPrefix}-${slug}-${slugIndex}`
        : `${slug}-${slugIndex}`;

      return {
        content: rawChunk.content,
        meta: {
          title,
          chunkNumber,
          uniqueId,
          startChar: rawChunk.start,
          endChar: rawChunk.end,
          headerPath,
        },
      };
    });
  }

  /**
   * Convert a string to a slug
   */
  private slugify(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s-]/g, '') // Remove non-word characters
      .replace(/\s+/g, '-') // Replace spaces with hyphens
      .replace(/-+/g, '-') // Replace multiple hyphens with single
      .replace(/^-+|-+$/g, ''); // Remove leading/trailing hyphens
  }
}

/**
 * Convenience wrapper: build a splitter with `opts` and split `markdown`.
 *
 * @throws Error if `opts` fails validation (see RecursiveMarkdownSplitter)
 */
export function splitMarkdownToChunks(
  markdown: string,
  opts?: SplitOptions,
): Chunk[] {
  const splitter = new RecursiveMarkdownSplitter(opts);
  return splitter.splitMarkdownToChunks(markdown);
}
// Final-chunk behaviour: a trailing piece smaller than minChars must be merged
// into its neighbours rather than emitted as a tiny chunk of its own.
describe('RecursiveMarkdownSplitter - Final chunk handling', () => {
  it('should deterministically handle final tiny chunks', () => {
    const splitter = new RecursiveMarkdownSplitter({
      maxChars: 100,
      minChars: 50,
      overlap: 10,
      headerLevels: [1, 2],
      trim: true,
    });

    // Text that will create a tiny final chunk
    const text = `# Section One
This is the first section with enough content to meet the minimum character requirement.

# Section Two
This is the second section with enough content to meet the minimum character requirement.

# Section Three
Tiny bit.`;

    const chunks = splitter.splitMarkdownToChunks(text);

    // Verify the tiny content was handled appropriately
    const hasTinyContent = chunks.some((c) => c.content.includes('Tiny bit'));
    expect(hasTinyContent).toBe(true);

    // The tiny section should not be on its own
    const tinyChunk = chunks.find((c) => c.meta.title === 'Section Three');
    if (tinyChunk) {
      expect(tinyChunk.content.length).toBeGreaterThanOrEqual(50); // Should meet minChars
    }
  });

  it('should handle multiple tiny segments at the end', () => {
    const splitter = new RecursiveMarkdownSplitter({
      maxChars: 100,
      minChars: 40,
      overlap: 0,
      headerLevels: [1],
      trim: true,
    });

    const text = `# Main Section
This is the main section with sufficient content to be a proper chunk.

# Tiny 1
Small.

# Tiny 2
Also small.

# Tiny 3
Very small.`;

    const chunks = splitter.splitMarkdownToChunks(text);

    // All tiny sections should be merged together
    expect(chunks.length).toBe(2);

    const lastChunk = chunks[chunks.length - 1];
    expect(lastChunk.content).toContain('Tiny 1');
    expect(lastChunk.content).toContain('Tiny 2');
    expect(lastChunk.content).toContain('Tiny 3');
  });

  it('should not exceed maxChars significantly when merging final chunk', () => {
    const splitter = new RecursiveMarkdownSplitter({
      maxChars: 50,
      minChars: 30,
      overlap: 0,
      headerLevels: [1],
      trim: true,
    });

    const text = `# Section One
This section has exactly the right amount of content.

# Section Two
This section also has exactly the right amount of content.

# Tiny
End.`;

    const chunks = splitter.splitMarkdownToChunks(text);

    // Check that tiny chunks are handled appropriately
    const lastChunk = chunks[chunks.length - 1];

    // If there's a tiny chunk, it should either be merged or meet minChars
    if (lastChunk.meta.title === 'Tiny') {
      expect(lastChunk.content.length).toBeGreaterThanOrEqual(30);
    }

    // No chunk should be excessively large
    chunks.forEach((chunk) => {
      expect(chunk.content.length).toBeLessThanOrEqual(75); // 1.5x maxChars
    });
  });

  it('should handle edge case where all segments are tiny', () => {
    const splitter = new RecursiveMarkdownSplitter({
      maxChars: 100,
      minChars: 50,
      overlap: 0,
      headerLevels: [1],
      trim: true,
    });

    const text = `# A
Short.

# B
Brief.

# C
Tiny.`;

    const chunks = splitter.splitMarkdownToChunks(text);

    // All should be merged into one chunk
    expect(chunks.length).toBe(1);
    expect(chunks[0].content).toContain('# A');
    expect(chunks[0].content).toContain('# B');
    expect(chunks[0].content).toContain('# C');
  });

  it('should preserve code blocks when merging final chunks', () => {
    const splitter = new RecursiveMarkdownSplitter({
      maxChars: 100,
      minChars: 50,
      overlap: 0,
      preserveCodeBlocks: true,
      trim: true,
    });

    const text = `# Section One
Content before code block.

\`\`\`python
def hello():
    print("Hello")
\`\`\`

# Tiny Section
End.`;

    const chunks = splitter.splitMarkdownToChunks(text);

    // Code block should be preserved intact
    const codeChunk = chunks.find((c) => c.content.includes('def hello()'));
    expect(codeChunk).toBeDefined();
    expect(codeChunk!.content).toMatch(/```python[\s\S]*?```/);
  });
});
+ +# Section 2 +This section is also substantial with a good amount of content that would exceed limits.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Should not merge if combined length would exceed maxChars significantly + // With the 1.5x flexibility for final chunks, they might merge if total < 150 chars + // Let's verify chunks are reasonably sized + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeLessThanOrEqual(150); // 1.5x maxChars + }); + + // If chunks are merged, ensure it's within reasonable bounds + if (chunks.length === 1) { + expect(chunks[0].content.length).toBeLessThanOrEqual(150); + } + }); + + it('should handle the problematic formatting example', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 500, + minChars: 200, + overlap: 0, + headerLevels: [1, 2], + preserveCodeBlocks: true, + }); + + const text = `## Formatting and Debugging + +The \`core::fmt\` module provides functionality for formatting values. + +### Debug Trait + +The \`Debug\` trait is used for debug formatting. + +\`\`\`cairo +pub trait Debug +\`\`\` + +#### \`fmt\` Function + +The \`fmt\` function within the \`Debug\` trait is responsible for formatting. + +### Display Trait + +The \`Display\` trait is used for standard formatting.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Should create fewer, more substantial chunks + expect(chunks.length).toBeLessThanOrEqual(2); + + // Each chunk should be meaningful in size + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeGreaterThan(100); + }); + }); + + it('should respect code block boundaries when merging', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 300, + minChars: 150, + overlap: 0, + headerLevels: [1, 2], + preserveCodeBlocks: true, + }); + + const text = `# Section 1 +Short intro. 
+ +\`\`\`cairo +// This is a long code block +fn example() -> felt252 { + let x = 42; + let y = x * 2; + return y; +} +\`\`\` + +# Section 2 +Another short section.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Verify code blocks are not split + chunks.forEach((chunk) => { + const codeBlockMatches = chunk.content.match(/```/g) || []; + expect(codeBlockMatches.length % 2).toBe(0); + }); + }); +}); diff --git a/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts new file mode 100644 index 00000000..b8be51f8 --- /dev/null +++ b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts @@ -0,0 +1,433 @@ +import { + RecursiveMarkdownSplitter, + SplitOptions, +} from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter - Reconstruction Tests', () => { + /** + * These tests verify that when we split a document and then concatenate + * the chunks (excluding overlaps), we get back the original content. + * This ensures our splitting logic doesn't lose or duplicate content. 
/**
 * Rebuild a document from the character spans of its chunks.
 *
 * Only the offsets are used; each piece is sliced out of `original`, and the
 * `content` field of a chunk is never read.
 *
 * - The first chunk contributes `original[start, end)`.
 * - A later chunk that defines `overlapStart` contributes
 *   `original[overlapStart, end)`, i.e. only its non-overlapped portion.
 * - A later chunk without `overlapStart` contributes the text from the
 *   previous chunk's `end` up to its own `end`.
 *
 * @param chunks - Chunk spans, in document order.
 * @param original - The text the spans index into.
 * @returns The concatenated pieces, or `''` when `chunks` is empty.
 */
function reconstructFromChunks(
  chunks: Array<{
    content: string;
    start: number;
    end: number;
    overlapStart?: number;
  }>,
  original: string,
): string {
  if (chunks.length === 0) {
    return '';
  }

  const pieces: string[] = [];
  let previousEnd = 0;

  for (const [position, span] of chunks.entries()) {
    let from: number;
    if (position === 0) {
      // The leading chunk is always taken in full.
      from = span.start;
    } else if (span.overlapStart !== undefined) {
      // Skip the part that repeats the tail of the preceding chunk.
      from = span.overlapStart;
    } else {
      // No overlap information: continue where the previous chunk stopped.
      from = previousEnd;
    }

    pieces.push(original.substring(from, span.end));
    previousEnd = span.end;
  }

  return pieces.join('');
}
+ +More content in the first section.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + // Extract the raw chunks before metadata attachment + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with multiple headers at same level', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 15, + headerLevels: [1], + trim: false, + }); + + const original = `# First Section +Content for the first section goes here. + +# Second Section +Content for the second section goes here. + +# Third Section +Content for the third section goes here.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + // Extract raw chunks + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with nested headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 80, + minChars: 0, + overlap: 20, + headerLevels: [1, 2], + trim: false, + }); + + const original = `# Main Section +Introduction to the main section. + +## Subsection 1 +Details about subsection 1. 
+ +## Subsection 2 +Details about subsection 2. + +# Another Main Section +Content for another main section.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with headers at start', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 40, + minChars: 0, + overlap: 10, + headerLevels: [1], + trim: false, + }); + + const original = `# Header at Start +Content immediately after header. + +More content here.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with content before first header', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + headerLevels: [1], + trim: false, + }); + + const original = `Some preamble text before any headers. + +# First Header +Content under first header. 
+ +# Second Header +Content under second header.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with consecutive headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 10, + headerLevels: [1, 2], + trim: false, + }); + + const original = `# Main Header +## Subheader 1 +## Subheader 2 +Content after headers. + +## Subheader 3 +More content.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + }); + + describe('Code block reconstruction', () => { + it('should reconstruct document with code blocks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 15, + preserveCodeBlocks: true, + trim: false, + }); + + const original = `# Section with Code +Some text before code. 
+ +\`\`\`python +def hello(): + print("Hello, World!") +\`\`\` + +Text after code block.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with large code block', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + preserveCodeBlocks: true, + trim: false, + }); + + const original = `# Code Example +Here's a large code block: + +\`\`\`javascript +// This is a large code block that exceeds maxChars +function complexFunction() { + const result = performCalculation(); + return result; +} +\`\`\` + +Text after the code.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + }); + + describe('Complex document reconstruction', () => { + it('should reconstruct a complex markdown document', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 20, + overlap: 25, + headerLevels: [1, 2], + preserveCodeBlocks: true, + trim: false, + }); + + const original = `# Cairo 
Programming Guide + +Welcome to the Cairo programming guide. This document covers the basics. + +## Getting Started + +To get started with Cairo, you need to understand the fundamentals. + +### Installation + +First, install the Cairo compiler: + +\`\`\`bash +curl -L https://github.com/starkware-libs/cairo/releases/download/v2.0.0/cairo-lang-2.0.0.tar.gz | tar xz +cd cairo-lang-2.0.0 +./install.sh +\`\`\` + +### Your First Program + +Here's a simple Cairo program: + +\`\`\`cairo +fn main() { + let x = 1; + let y = 2; + assert(x + y == 3, 'Math is broken!'); +} +\`\`\` + +## Advanced Topics + +Once you understand the basics, you can explore advanced features. + +### Memory Management + +Cairo uses a unique memory model based on field elements. + +### Smart Contracts + +You can write smart contracts in Cairo for StarkNet. + +## Conclusion + +Cairo is a powerful language for writing provable programs.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + }); +}); diff --git a/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts new file mode 100644 index 00000000..67d93e68 --- /dev/null +++ b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts @@ -0,0 +1,544 @@ +import { + RecursiveMarkdownSplitter, + SplitOptions, + Chunk, +} from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter', () => { + describe('Basic functionality', () => { + 
it('should handle empty input', () => { + const splitter = new RecursiveMarkdownSplitter(); + expect(splitter.splitMarkdownToChunks('')).toEqual([]); + expect(splitter.splitMarkdownToChunks(' ')).toEqual([]); + }); + + it('should handle single small chunk', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + const text = 'This is a small chunk of text.'; + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].content).toBe(text); + expect(chunks[0].meta.title).toBe('ROOT'); + expect(chunks[0].meta.chunkNumber).toBe(0); + }); + + it('should throw error when overlap >= maxChars', () => { + expect(() => { + new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 100, + }); + }).toThrow('Overlap (100) must be less than maxChars (100)'); + }); + }); + + describe('Header detection and splitting', () => { + it('should split on H1 headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + headerLevels: [1], + }); + + const text = `# First Section +This is the first section content. + +# Second Section +This is the second section content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Headers split the content, so we should have chunks for each section + const firstSectionChunk = chunks.find( + (c) => c.meta.title === 'First Section', + ); + const secondSectionChunk = chunks.find( + (c) => c.meta.title === 'Second Section', + ); + + expect(firstSectionChunk).toBeDefined(); + expect(secondSectionChunk).toBeDefined(); + }); + + it('should split on both H1 and H2 headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + headerLevels: [1, 2], + }); + + const text = `# Main Section +Some intro text. + +## Subsection 1 +First subsection. 
+ +## Subsection 2 +Second subsection.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThanOrEqual(3); + expect(chunks[0].meta.title).toBe('Main Section'); + expect(chunks.find((c) => c.meta.title === 'Subsection 1')).toBeDefined(); + expect(chunks.find((c) => c.meta.title === 'Subsection 2')).toBeDefined(); + }); + + it('should ignore headers inside code blocks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 200, + minChars: 0, + overlap: 0, + }); + + const text = `# Real Header +Some content. + +\`\`\`markdown +# This is not a real header +It's inside a code block +\`\`\` + +More content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].meta.title).toBe('Real Header'); + expect(chunks[0].content).toContain('# This is not a real header'); + }); + + it('should handle headers with trailing hashes', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + const text = '## Header with trailing hashes ##\nContent here.'; + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks[0].meta.title).toBe('Header with trailing hashes'); + }); + }); + + describe('Code block handling', () => { + it('should not split inside code blocks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + preserveCodeBlocks: true, + }); + + const text = `Some text before. 
+ +\`\`\`python +def long_function(): + # This is a long code block that exceeds maxChars + print("This should not be split") + return "Even though it's longer than 50 chars" +\`\`\` + +Some text after.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Verify code block is kept intact + const codeBlockChunk = chunks.find((c) => + c.content.includes('def long_function()'), + ); + expect(codeBlockChunk).toBeDefined(); + expect(codeBlockChunk!.content).toContain('```python'); + expect(codeBlockChunk!.content).toContain('```'); + }); + + it('should handle tilde code fences', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 200, + minChars: 0, + overlap: 20, + }); + + const text = `Text before. + +~~~javascript +const code = "This uses tilde fences"; +~~~ + +Text after.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].content).toContain('~~~javascript'); + expect(chunks[0].content).toContain( + 'const code = "This uses tilde fences"', + ); + }); + + it('should handle nested code fences correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 300, + minChars: 0, + overlap: 30, + }); + + const text = `\`\`\`markdown +Example with nested fences: +\`\`\`python +print("nested") +\`\`\` +End of example +\`\`\``; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].content).toContain('Example with nested fences'); + }); + }); + + describe('Overlap handling', () => { + it('should apply backward overlap correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + headerLevels: [1], + }); + + const text = `# Section 1 +This is the first section with some content. 
+ +# Section 2 +This is the second section with more content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThanOrEqual(2); + + // Check that second chunk contains overlap from first + if (chunks.length >= 2) { + // The overlap should be at the beginning of the second chunk + const overlap = 10; // We set overlap to 10 + + // Calculate expected overlap position + const firstChunkEndIndex = chunks[0].meta.endChar; + const secondChunkStartIndex = chunks[1].meta.startChar; + + // The start of second chunk should be overlap chars before the end of first chunk + expect(firstChunkEndIndex - secondChunkStartIndex).toBeLessThanOrEqual( + overlap, + ); + } + }); + + it('should extend overlap to include entire code block', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 20, + preserveCodeBlocks: true, + }); + + const text = `First part of content here. + +\`\`\` +code block content +\`\`\` + +Second part starts here and continues with more text.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // If there are multiple chunks, verify code block handling + if (chunks.length > 1) { + const codeBlockInFirst = chunks[0].content.includes('```'); + const codeBlockInSecond = chunks[1].content.includes('```'); + + // Code block should be complete in whichever chunk it appears + if (codeBlockInFirst) { + expect(chunks[0].content).toMatch(/```[\s\S]*?```/); + } + if (codeBlockInSecond) { + expect(chunks[1].content).toMatch(/```[\s\S]*?```/); + } + } + }); + }); + + describe('Metadata generation', () => { + it('should generate correct unique IDs', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 5, + idPrefix: 'test', + }); + + const text = `# My Section +This is content for the first section + +# My Section +This is content for the second section with the same title`; + + const chunks = 
splitter.splitMarkdownToChunks(text); + + // Find all chunks with title "My Section" + const mySectionChunks = chunks.filter( + (c) => c.meta.title === 'My Section', + ); + + // Should have at least 2 chunks with this title + expect(mySectionChunks.length).toBeGreaterThanOrEqual(2); + + // Check that they have different unique IDs with incrementing numbers + const uniqueIds = mySectionChunks.map((c) => c.meta.uniqueId); + expect(uniqueIds).toContain('test-my-section-0'); + expect(uniqueIds).toContain('test-my-section-1'); + }); + + it('should track header paths correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + }); + + const text = `# Chapter 1 +Intro to chapter one with some text + +## Section 1.1 +Content in section one point one + +### Subsection 1.1.1 +More content in the subsection + +## Section 1.2 +Other content in section one point two`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // This should create multiple chunks due to the smaller maxChars + expect(chunks.length).toBeGreaterThan(1); + + // Find chunks based on their unique content + const section11Chunk = chunks.find((c) => + c.content.includes('section one point one'), + ); + const subsectionChunk = chunks.find((c) => + c.content.includes('More content in the subsection'), + ); + const section12Chunk = chunks.find((c) => + c.content.includes('section one point two'), + ); + + // Check that chunks have appropriate header paths + if (section11Chunk) { + expect(section11Chunk.meta.headerPath).toContain('Chapter 1'); + // Title should be Section 1.1 since that's the header for this content + expect(section11Chunk.meta.title).toBe('Section 1.1'); + } + + if (subsectionChunk) { + expect(subsectionChunk.meta.headerPath).toContain('Chapter 1'); + // The subsection content should have appropriate headers in path + expect( + subsectionChunk.meta.headerPath.some( + (h) => h === 'Section 1.1' || h === 'Subsection 1.1.1', + ), + 
).toBe(true); + } + + if (section12Chunk) { + expect(section12Chunk.meta.headerPath).toContain('Chapter 1'); + expect(section12Chunk.meta.title).toBe('Section 1.2'); + } + }); + + it('should handle chunk numbering per title', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 30, + minChars: 0, + overlap: 0, + }); + + const text = `# Long Section +This is a very long section that will definitely need to be split into multiple chunks because it exceeds our maximum character limit.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + const longSectionChunks = chunks.filter( + (c) => c.meta.title === 'Long Section', + ); + expect(longSectionChunks.length).toBeGreaterThan(1); + + // Check sequential numbering + longSectionChunks.forEach((chunk, index) => { + expect(chunk.meta.chunkNumber).toBe(index); + }); + }); + + it('should slugify titles correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 2048, + minChars: 0, + overlap: 256, + }); + + const text = `# Title with Special@#$ Characters!!! +Content`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks[0].meta.uniqueId).toBe('title-with-special-characters-0'); + }); + }); + + describe('Splitting strategies', () => { + it('should fall back to paragraph splitting', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + }); + + const text = `First paragraph with some content here. + +Second paragraph with more content here. 
+ +Third paragraph with even more content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThanOrEqual(3); + }); + + it('should fall back to line splitting for very long lines', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + }); + + // Create multiple lines that are each long but don't have paragraph breaks + const longLine = + 'Line one that is quite long and exceeds our limit\n' + + 'Line two that is also very long and exceeds limit\n' + + 'Line three with even more text to ensure splitting'; + + const chunks = splitter.splitMarkdownToChunks(longLine); + + expect(chunks.length).toBeGreaterThan(1); + }); + }); + + describe('Edge cases', () => { + it('should handle documents with no headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = + 'Just plain text without any headers. ' + + 'This should still be chunked properly.'; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.every((c) => c.meta.title === 'ROOT')).toBe(true); + }); + + it('should handle consecutive headers with no content', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = `# Header 1 +# Header 2 +# Header 3 +Some content here.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Should produce valid chunks even with empty sections + expect(chunks.length).toBeGreaterThan(0); + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeGreaterThan(0); + }); + }); + + it('should handle Windows line endings', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = '# Header\r\nContent with\r\nWindows line endings.'; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + 
expect(chunks[0].meta.title).toBe('Header'); + expect(chunks[0].content).not.toContain('\r'); + }); + + it('should handle unclosed code blocks gracefully', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = `# Section +Some content. + +\`\`\`python +This code block is never closed +and continues to the end`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThan(0); + // Should still produce valid output + }); + }); + + describe('Character offset tracking', () => { + it('should track start and end character positions correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + }); + + const text = `# Section 1 +Short content. + +# Section 2 +More content here.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + chunks.forEach((chunk) => { + expect(chunk.meta.startChar).toBeGreaterThanOrEqual(0); + expect(chunk.meta.endChar).toBeGreaterThan(chunk.meta.startChar); + expect( + chunk.meta.endChar - chunk.meta.startChar, + ).toBeGreaterThanOrEqual(chunk.content.length); + }); + }); + }); +}); diff --git a/python/src/cairo_coder/optimizers/mcp_optimizer.py b/python/src/cairo_coder/optimizers/mcp_optimizer.py index bb4ed581..28e9dc57 100644 --- a/python/src/cairo_coder/optimizers/mcp_optimizer.py +++ b/python/src/cairo_coder/optimizers/mcp_optimizer.py @@ -158,11 +158,12 @@ def forward(self, example, pred, trace=None): result = parallel(batches) resources_notes = [pred.resource_note for pred in result] - [pred.reasoning for pred in result] + reasonings = [pred.reasoning for pred in result] score = sum(resources_notes) / len(resources_notes) if len(resources_notes) != 0 else 0 - # for (note, reason) in zip(resources_notes, reasonings, strict=False): - # print(f"Note: {note}, reason: {reason}") + print(example.query) + for (note, reason) in zip(resources_notes, reasonings, strict=False): 
+ print(f"Note: {note}, reason: {reason}") return score if trace is None else score >= self.threshold return (RetrievalF1,)