docs: refill corelib docs

enitrat · enitrat · commit 2c44080e4191 · 2025-10-05T17:57:03.000+02:00
diff --git a/ingesters/src/ingesters/CoreLibDocsIngester.ts b/ingesters/src/ingesters/CoreLibDocsIngester.ts
@@ -33,7 +33,7 @@ export class CoreLibDocsIngester extends MarkdownIngester {
       chunkOverlap: 512,
       baseUrl: 'https://docs.starknet.io/build/corelib/intro',
       urlSuffix: '',
-      useUrlMapping: false,
+      useUrlMapping: true,
     };
 
     super(config, DocumentSource.CORELIB_DOCS);
diff --git a/ingesters/src/utils/RecursiveMarkdownSplitter.ts b/ingesters/src/utils/RecursiveMarkdownSplitter.ts
@@ -834,15 +834,18 @@ export class RecursiveMarkdownSplitter {
     const titleCounts = new Map<string, number>();
 
     for (const rawChunk of rawChunks) {
-      // Find the last header before or within this chunk that's in our configured levels
+      // Determine title from the deepest configured header level that applies
       let title = 'ROOT';
       let headerPath: string[] = [];
 
-      // Build full header path from all headers up to the end of this chunk
-      const allHeadersBeforeEnd = headers.filter((h) => h.start < rawChunk.end);
+      // Build full header path from all headers strictly before the end of this chunk
+      // Do not include a header that starts exactly at the end boundary; it belongs to the next segment.
+      const allHeadersBeforeOrAtEnd = headers.filter(
+        (h) => h.start < rawChunk.end,
+      );
       const headerStack: { level: number; text: string }[] = [];
 
-      for (const header of allHeadersBeforeEnd) {
+      for (const header of allHeadersBeforeOrAtEnd) {
         // Pop headers from stack that are same or lower level
         while (
           headerStack.length > 0 &&
@@ -855,23 +858,23 @@ export class RecursiveMarkdownSplitter {
 
       headerPath = headerStack.map((h) => h.text);
 
-      // Prefer the deepest header in the path (e.g., H3) for specificity
-      if (headerPath.length > 0) {
-        title = headerPath[headerPath.length - 1]!;
-      } else {
-        // Fallback: use last configured header before the chunk if any
-        for (let i = headerStack.length - 1; i >= 0; i--) {
-          if (
-            this.options.headerLevels.includes(
-              headerStack[i]!.level as 1 | 2 | 3,
-            )
-          ) {
-            title = headerStack[i]!.text;
-            break;
-          }
+      // Prefer the deepest header among the configured levels (e.g., H2 if [1,2])
+      let preferredTitle: string | undefined;
+      for (let i = headerStack.length - 1; i >= 0; i--) {
+        const lvl = headerStack[i]!.level as 1 | 2 | 3;
+        if (this.options.headerLevels.includes(lvl)) {
+          preferredTitle = headerStack[i]!.text;
+          break;
         }
       }
 
+      if (preferredTitle) {
+        title = preferredTitle;
+      } else if (headerStack.length > 0) {
+        // Fall back to the deepest header regardless of level if none match the configured levels
+        title = headerStack[headerStack.length - 1]!.text;
+      }
+
       // Track chunk numbers per title (0-based)
       const count = titleCounts.get(title) || 0;
       titleCounts.set(title, count + 1);
@@ -882,16 +885,49 @@ export class RecursiveMarkdownSplitter {
         ? `${this.options.idPrefix}-${slug}-${count}`
         : `${slug}-${count}`;
 
-      // Determine sourceLink based on active source ranges: prefer segment start (no overlap)
+      // Determine sourceLink based on active source ranges.
+      // Strategy:
+      // 1) Prefer a range that contains the anchor position (segment start if available, else chunk start)
+      // 2) Otherwise, if any range starts within this chunk, select the last one (closest to chunk end)
+      // 3) Otherwise, if any range overlaps this chunk at all, select the one with the latest start
       let sourceLink: string | undefined = undefined;
-      const anchorPos = (rawChunk as any).overlapStart ?? rawChunk.start;
       if (sourceRanges && sourceRanges.length > 0) {
-        const s = anchorPos as number;
-        for (const r of sourceRanges) {
-          if (s >= r.start && s < r.end) {
-            sourceLink = r.url;
-            break;
+        const anchorPos = (rawChunk as any).overlapStart ?? rawChunk.start;
+
+        // Step 1: range that contains anchor
+        let active = sourceRanges.find(
+          (r) => anchorPos >= r.start && anchorPos < r.end,
+        );
+
+        // Step 2: range that starts within the chunk [start, end)
+        if (!active) {
+          let candidate:
+            | { start: number; end: number; url: string }
+            | undefined;
+          for (const r of sourceRanges) {
+            if (r.start >= rawChunk.start && r.start < rawChunk.end) {
+              if (!candidate || r.start > candidate.start) candidate = r;
+            }
           }
+          if (candidate) active = candidate;
+        }
+
+        // Step 3: any overlapping range; choose the one with the latest start
+        if (!active) {
+          let candidate:
+            | { start: number; end: number; url: string }
+            | undefined;
+          for (const r of sourceRanges) {
+            const overlaps = r.start < rawChunk.end && r.end > rawChunk.start;
+            if (overlaps) {
+              if (!candidate || r.start > candidate.start) candidate = r;
+            }
+          }
+          if (candidate) active = candidate;
+        }
+
+        if (active) {
+          sourceLink = active.url;
         }
       }
 
diff --git a/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts b/ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts
@@ -124,7 +124,7 @@ More content.`;
       expect(chunks[0]!.meta.title).toBe('Header with trailing hashes');
     });
 
-    it('should prefer deepest header (e.g., H3) for title', () => {
+    it('should prefer deepest header of configured levels (e.g., H2) for title', () => {
       const splitter = new RecursiveMarkdownSplitter({
         maxChars: 80,
         minChars: 0,
@@ -134,14 +134,23 @@ More content.`;
 
       const text = `# Chapter
 Intro
-
+## Some H2 Title
+Some text in the H2
 ### Specific Topic
 Detailed text that should belong to the H3.`;
 
       const chunks = splitter.splitMarkdownToChunks(text);
       expect(chunks.length).toBeGreaterThan(0);
-      // Title should be the deepest header in headerPath -> H3
-      expect(chunks[0]!.meta.title).toBe('Specific Topic');
+      // Find a chunk within the H2 section (its own text or the nested H3 heading/body)
+      const h2Chunk = chunks.find(
+        (c) =>
+          c.content.includes('Some text in the H2') ||
+          c.content.includes('Specific Topic') ||
+          c.content.includes('Detailed text'),
+      );
+      expect(h2Chunk).toBeDefined();
+      // Title should be the deepest header among configured levels -> H2
+      expect(h2Chunk!.meta.title).toBe('Some H2 Title');
     });
   });
 
diff --git a/python/src/scripts/docs_crawler.py b/python/src/scripts/docs_crawler.py
@@ -266,11 +266,26 @@ def extract_content(self, html: str, url: str) -> tuple[str, str]:
         title_tag = soup.find('title')
         title = title_tag.get_text(strip=True) if title_tag else urlparse(url).path
 
-        # Remove boilerplate elements
+        # Remove boilerplate elements by tag name
         for tag in soup.find_all(['script', 'style', 'noscript', 'nav',
                                  'header', 'footer', 'aside', 'img', 'svg', 'iframe']):
             tag.decompose()
 
+        # Remove elements with IDs or classes containing boilerplate keywords
+        boilerplate_keywords = ['navbar', 'sidebar', 'nav-bar', 'side-bar', 'menu', 'toc', 'breadcrumb']
+        # Collect tags to remove first, then decompose them
+        tags_to_remove = []
+        for tag in soup.find_all(True):  # Find all tags
+            tag_id = tag.get('id', '').lower()
+            tag_classes = ' '.join(tag.get('class', [])).lower()
+
+            if any(keyword in tag_id or keyword in tag_classes for keyword in boilerplate_keywords):
+                tags_to_remove.append(tag)
+
+        # Now decompose all collected tags
+        for tag in tags_to_remove:
+            tag.decompose()
+
         # Try to find main content
         main_content = None
 
@@ -332,7 +347,6 @@ def compile_markdown(self) -> str:
         lines = [
             f"# {self.domain} — Snapshot ({date_str})",
             "",
-            "Clean documentation content extracted from sitemap.",
             "",
             "---",
             ""
diff --git a/python/src/scripts/summarizer/generated/corelib_summary.md b/python/src/scripts/summarizer/generated/corelib_summary.md