@@ -834,15 +834,18 @@ export class RecursiveMarkdownSplitter {
834834 const titleCounts = new Map < string , number > ( ) ;
835835
836836 for ( const rawChunk of rawChunks ) {
837- // Find the last header before or within this chunk that's in our configured levels
837+ // Determine title from the deepest configured header level that applies
838838 let title = 'ROOT' ;
839839 let headerPath : string [ ] = [ ] ;
840840
841- // Build full header path from all headers up to the end of this chunk
842- const allHeadersBeforeEnd = headers . filter ( ( h ) => h . start < rawChunk . end ) ;
841+ // Build full header path from all headers strictly before the end of this chunk
842+ // Do not include a header that starts exactly at the end boundary; it belongs to the next segment.
843+ const allHeadersBeforeOrAtEnd = headers . filter (
844+ ( h ) => h . start < rawChunk . end ,
845+ ) ;
843846 const headerStack : { level : number ; text : string } [ ] = [ ] ;
844847
845- for ( const header of allHeadersBeforeEnd ) {
848+ for ( const header of allHeadersBeforeOrAtEnd ) {
846849 // Pop headers from stack that are same or lower level
847850 while (
848851 headerStack . length > 0 &&
@@ -855,23 +858,23 @@ export class RecursiveMarkdownSplitter {
855858
856859 headerPath = headerStack . map ( ( h ) => h . text ) ;
857860
858- // Prefer the deepest header in the path (e.g., H3) for specificity
859- if ( headerPath . length > 0 ) {
860- title = headerPath [ headerPath . length - 1 ] ! ;
861- } else {
862- // Fallback: use last configured header before the chunk if any
863- for ( let i = headerStack . length - 1 ; i >= 0 ; i -- ) {
864- if (
865- this . options . headerLevels . includes (
866- headerStack [ i ] ! . level as 1 | 2 | 3 ,
867- )
868- ) {
869- title = headerStack [ i ] ! . text ;
870- break ;
871- }
861+ // Prefer the deepest header among the configured levels (e.g., H2 if [1,2])
862+ let preferredTitle : string | undefined ;
863+ for ( let i = headerStack . length - 1 ; i >= 0 ; i -- ) {
864+ const lvl = headerStack [ i ] ! . level as 1 | 2 | 3 ;
865+ if ( this . options . headerLevels . includes ( lvl ) ) {
866+ preferredTitle = headerStack [ i ] ! . text ;
867+ break ;
872868 }
873869 }
874870
871+ if ( preferredTitle ) {
872+ title = preferredTitle ;
873+ } else if ( headerStack . length > 0 ) {
874+ // Fallback to the deepest header regardless of level if none match configured levels
875+ title = headerStack [ headerStack . length - 1 ] ! . text ;
876+ }
877+
875878 // Track chunk numbers per title (0-based)
876879 const count = titleCounts . get ( title ) || 0 ;
877880 titleCounts . set ( title , count + 1 ) ;
@@ -882,16 +885,49 @@ export class RecursiveMarkdownSplitter {
882885 ? `${ this . options . idPrefix } -${ slug } -${ count } `
883886 : `${ slug } -${ count } ` ;
884887
885- // Determine sourceLink based on active source ranges: prefer segment start (no overlap)
888+ // Determine sourceLink based on active source ranges.
889+ // Strategy:
890+ // 1) Prefer a range that contains the anchor position (segment start if available, else chunk start)
891+ // 2) Otherwise, if any range starts within this chunk, select the last one (closest to chunk end)
892+ // 3) Otherwise, if any range overlaps this chunk at all, select the one with the latest start
886893 let sourceLink : string | undefined = undefined ;
887- const anchorPos = ( rawChunk as any ) . overlapStart ?? rawChunk . start ;
888894 if ( sourceRanges && sourceRanges . length > 0 ) {
889- const s = anchorPos as number ;
890- for ( const r of sourceRanges ) {
891- if ( s >= r . start && s < r . end ) {
892- sourceLink = r . url ;
893- break ;
895+ const anchorPos = ( rawChunk as any ) . overlapStart ?? rawChunk . start ;
896+
897+ // Step 1: range that contains anchor
898+ let active = sourceRanges . find (
899+ ( r ) => anchorPos >= r . start && anchorPos < r . end ,
900+ ) ;
901+
902+ // Step 2: range that starts within the chunk [start, end)
903+ if ( ! active ) {
904+ let candidate :
905+ | { start : number ; end : number ; url : string }
906+ | undefined ;
907+ for ( const r of sourceRanges ) {
908+ if ( r . start >= rawChunk . start && r . start < rawChunk . end ) {
909+ if ( ! candidate || r . start > candidate . start ) candidate = r ;
910+ }
894911 }
912+ if ( candidate ) active = candidate ;
913+ }
914+
915+ // Step 3: any overlapping range; choose the one with the latest start
916+ if ( ! active ) {
917+ let candidate :
918+ | { start : number ; end : number ; url : string }
919+ | undefined ;
920+ for ( const r of sourceRanges ) {
921+ const overlaps = r . start < rawChunk . end && r . end > rawChunk . start ;
922+ if ( overlaps ) {
923+ if ( ! candidate || r . start > candidate . start ) candidate = r ;
924+ }
925+ }
926+ if ( candidate ) active = candidate ;
927+ }
928+
929+ if ( active ) {
930+ sourceLink = active . url ;
895931 }
896932 }
897933
0 commit comments