Skip to content

Commit 2c44080

Browse files
committed
docs: refill corelib docs
1 parent f7383c6 commit 2c44080

File tree

5 files changed

+7014
-4625
lines changed

5 files changed

+7014
-4625
lines changed

ingesters/src/ingesters/CoreLibDocsIngester.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export class CoreLibDocsIngester extends MarkdownIngester {
3333
chunkOverlap: 512,
3434
baseUrl: 'https://docs.starknet.io/build/corelib/intro',
3535
urlSuffix: '',
36-
useUrlMapping: false,
36+
useUrlMapping: true,
3737
};
3838

3939
super(config, DocumentSource.CORELIB_DOCS);

ingesters/src/utils/RecursiveMarkdownSplitter.ts

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -834,15 +834,18 @@ export class RecursiveMarkdownSplitter {
834834
const titleCounts = new Map<string, number>();
835835

836836
for (const rawChunk of rawChunks) {
837-
// Find the last header before or within this chunk that's in our configured levels
837+
// Determine title from the deepest configured header level that applies
838838
let title = 'ROOT';
839839
let headerPath: string[] = [];
840840

841-
// Build full header path from all headers up to the end of this chunk
842-
const allHeadersBeforeEnd = headers.filter((h) => h.start < rawChunk.end);
841+
// Build full header path from all headers strictly before the end of this chunk
842+
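// A header that starts exactly at the end boundary is excluded (the filter is the strict `h.start < rawChunk.end`); it belongs to the next segment. Note: despite the `...BeforeOrAtEnd` name of the variable below, "at end" headers are NOT included.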
843+
const allHeadersBeforeOrAtEnd = headers.filter(
844+
(h) => h.start < rawChunk.end,
845+
);
843846
const headerStack: { level: number; text: string }[] = [];
844847

845-
for (const header of allHeadersBeforeEnd) {
848+
for (const header of allHeadersBeforeOrAtEnd) {
846849
// Pop headers from stack that are same or lower level
847850
while (
848851
headerStack.length > 0 &&
@@ -855,23 +858,23 @@ export class RecursiveMarkdownSplitter {
855858

856859
headerPath = headerStack.map((h) => h.text);
857860

858-
// Prefer the deepest header in the path (e.g., H3) for specificity
859-
if (headerPath.length > 0) {
860-
title = headerPath[headerPath.length - 1]!;
861-
} else {
862-
// Fallback: use last configured header before the chunk if any
863-
for (let i = headerStack.length - 1; i >= 0; i--) {
864-
if (
865-
this.options.headerLevels.includes(
866-
headerStack[i]!.level as 1 | 2 | 3,
867-
)
868-
) {
869-
title = headerStack[i]!.text;
870-
break;
871-
}
861+
// Prefer the deepest header among the configured levels (e.g., H2 if [1,2])
862+
let preferredTitle: string | undefined;
863+
for (let i = headerStack.length - 1; i >= 0; i--) {
864+
const lvl = headerStack[i]!.level as 1 | 2 | 3;
865+
if (this.options.headerLevels.includes(lvl)) {
866+
preferredTitle = headerStack[i]!.text;
867+
break;
872868
}
873869
}
874870

871+
if (preferredTitle) {
872+
title = preferredTitle;
873+
} else if (headerStack.length > 0) {
874+
// Fallback to the deepest header regardless of level if none match configured levels
875+
title = headerStack[headerStack.length - 1]!.text;
876+
}
877+
875878
// Track chunk numbers per title (0-based)
876879
const count = titleCounts.get(title) || 0;
877880
titleCounts.set(title, count + 1);
@@ -882,16 +885,49 @@ export class RecursiveMarkdownSplitter {
882885
? `${this.options.idPrefix}-${slug}-${count}`
883886
: `${slug}-${count}`;
884887

885-
// Determine sourceLink based on active source ranges: prefer segment start (no overlap)
888+
// Determine sourceLink based on active source ranges.
889+
// Strategy:
890+
// 1) Prefer a range that contains the anchor position (segment start if available, else chunk start)
891+
// 2) Otherwise, if any range starts within this chunk, select the last one (closest to chunk end)
892+
// 3) Otherwise, if any range overlaps this chunk at all, select the one with the latest start
886893
let sourceLink: string | undefined = undefined;
887-
const anchorPos = (rawChunk as any).overlapStart ?? rawChunk.start;
888894
if (sourceRanges && sourceRanges.length > 0) {
889-
const s = anchorPos as number;
890-
for (const r of sourceRanges) {
891-
if (s >= r.start && s < r.end) {
892-
sourceLink = r.url;
893-
break;
895+
const anchorPos = (rawChunk as any).overlapStart ?? rawChunk.start;
896+
897+
// Step 1: range that contains anchor
898+
let active = sourceRanges.find(
899+
(r) => anchorPos >= r.start && anchorPos < r.end,
900+
);
901+
902+
// Step 2: range that starts within the chunk [start, end)
903+
if (!active) {
904+
let candidate:
905+
| { start: number; end: number; url: string }
906+
| undefined;
907+
for (const r of sourceRanges) {
908+
if (r.start >= rawChunk.start && r.start < rawChunk.end) {
909+
if (!candidate || r.start > candidate.start) candidate = r;
910+
}
894911
}
912+
if (candidate) active = candidate;
913+
}
914+
915+
// Step 3: any overlapping range; choose the one with the latest start
916+
if (!active) {
917+
let candidate:
918+
| { start: number; end: number; url: string }
919+
| undefined;
920+
for (const r of sourceRanges) {
921+
const overlaps = r.start < rawChunk.end && r.end > rawChunk.start;
922+
if (overlaps) {
923+
if (!candidate || r.start > candidate.start) candidate = r;
924+
}
925+
}
926+
if (candidate) active = candidate;
927+
}
928+
929+
if (active) {
930+
sourceLink = active.url;
895931
}
896932
}
897933

ingesters/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ More content.`;
124124
expect(chunks[0]!.meta.title).toBe('Header with trailing hashes');
125125
});
126126

127-
it('should prefer deepest header (e.g., H3) for title', () => {
127+
it('should prefer deepest header of configured levels (e.g., H2) for title', () => {
128128
const splitter = new RecursiveMarkdownSplitter({
129129
maxChars: 80,
130130
minChars: 0,
@@ -134,14 +134,23 @@ More content.`;
134134

135135
const text = `# Chapter
136136
Intro
137-
137+
## Some H2 Title
138+
Some text in the H2
138139
### Specific Topic
139140
Detailed text that should belong to the H3.`;
140141

141142
const chunks = splitter.splitMarkdownToChunks(text);
142143
expect(chunks.length).toBeGreaterThan(0);
143-
// Title should be the deepest header in headerPath -> H3
144-
expect(chunks[0]!.meta.title).toBe('Specific Topic');
144+
// Find the first chunk that falls under the H2 section: either the H2's own text or the nested H3 heading/body
145+
const h2Chunk = chunks.find(
146+
(c) =>
147+
c.content.includes('Some text in the H2') ||
148+
c.content.includes('Specific Topic') ||
149+
c.content.includes('Detailed text'),
150+
);
151+
expect(h2Chunk).toBeDefined();
152+
// Title should be the deepest header among configured levels -> H2
153+
expect(h2Chunk!.meta.title).toBe('Some H2 Title');
145154
});
146155
});
147156

python/src/scripts/docs_crawler.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,11 +266,26 @@ def extract_content(self, html: str, url: str) -> tuple[str, str]:
266266
title_tag = soup.find('title')
267267
title = title_tag.get_text(strip=True) if title_tag else urlparse(url).path
268268

269-
# Remove boilerplate elements
269+
# Remove boilerplate elements by tag name
270270
for tag in soup.find_all(['script', 'style', 'noscript', 'nav',
271271
'header', 'footer', 'aside', 'img', 'svg', 'iframe']):
272272
tag.decompose()
273273

274+
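# Remove elements whose id or class list contains a boilerplate keyword (case-insensitive substring match, so short keywords such as 'toc' or 'menu' also match e.g. 'autocomplete' or 'menu-item')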
275+
boilerplate_keywords = ['navbar', 'sidebar', 'nav-bar', 'side-bar', 'menu', 'toc', 'breadcrumb']
276+
# Collect tags to remove first, then decompose them
277+
tags_to_remove = []
278+
for tag in soup.find_all(True): # Find all tags
279+
tag_id = tag.get('id', '').lower()
280+
tag_classes = ' '.join(tag.get('class', [])).lower()
281+
282+
if any(keyword in tag_id or keyword in tag_classes for keyword in boilerplate_keywords):
283+
tags_to_remove.append(tag)
284+
285+
# Now decompose all collected tags
286+
for tag in tags_to_remove:
287+
tag.decompose()
288+
274289
# Try to find main content
275290
main_content = None
276291

@@ -332,7 +347,6 @@ def compile_markdown(self) -> str:
332347
lines = [
333348
f"# {self.domain} — Snapshot ({date_str})",
334349
"",
335-
"Clean documentation content extracted from sitemap.",
336350
"",
337351
"---",
338352
""

0 commit comments

Comments
 (0)