From b8d4b2475995456dc332a185cd6e568a42449a09 Mon Sep 17 00:00:00 2001 From: enitrat Date: Wed, 21 May 2025 13:29:55 +0200 Subject: [PATCH] update cairobook ingester to use llms-full file --- .../__tests__/AsciiDocIngester.test.ts | 2 - .../__tests__/MarkdownIngester.test.ts | 2 - .../src/ingesters/AsciiDocIngester.ts | 4 +- .../src/ingesters/CairoBookIngester.ts | 42 +++++++++++++++++-- .../src/ingesters/CairoByExampleIngester.ts | 2 - .../src/ingesters/MarkdownIngester.ts | 32 ++++++++------ .../src/ingesters/OpenZeppelinDocsIngester.ts | 2 - .../src/ingesters/StarknetDocsIngester.ts | 2 - .../src/ingesters/StarknetFoundryIngester.ts | 2 - packages/ingester/src/shared.ts | 1 - packages/ingester/src/utils/types.ts | 6 --- 11 files changed, 58 insertions(+), 39 deletions(-) diff --git a/packages/ingester/__tests__/AsciiDocIngester.test.ts b/packages/ingester/__tests__/AsciiDocIngester.test.ts index cd1cb2eb..fed22935 100644 --- a/packages/ingester/__tests__/AsciiDocIngester.test.ts +++ b/packages/ingester/__tests__/AsciiDocIngester.test.ts @@ -73,8 +73,6 @@ describe('AsciiDocIngester', () => { fileExtension: '.adoc', chunkSize: 1000, chunkOverlap: 200, - baseUrl: 'https://example.com', - urlSuffix: '', }, playbookPath: 'test-playbook.yml', outputDir: '/tmp/output', diff --git a/packages/ingester/__tests__/MarkdownIngester.test.ts b/packages/ingester/__tests__/MarkdownIngester.test.ts index 1b8ed1a0..538e0378 100644 --- a/packages/ingester/__tests__/MarkdownIngester.test.ts +++ b/packages/ingester/__tests__/MarkdownIngester.test.ts @@ -15,9 +15,7 @@ const markdownIngester = new TestMarkdownIngester( { repoOwner: 'test', repoName: 'test', - baseUrl: 'https://test.com', fileExtension: 'md', - urlSuffix: '.html', chunkSize: 1000, chunkOverlap: 100, }, diff --git a/packages/ingester/src/ingesters/AsciiDocIngester.ts b/packages/ingester/src/ingesters/AsciiDocIngester.ts index 2c0a6a7f..d3e4b4b7 100644 --- a/packages/ingester/src/ingesters/AsciiDocIngester.ts +++ b/packages/ingester/src/ingesters/AsciiDocIngester.ts @@ -255,9 +255,7 @@ export abstract class AsciiDocIngester extends BaseIngester { chunkNumber: index, contentHash: hash, uniqueId: `${page.name}-${index}`, - sourceLink: `${this.config.baseUrl}/${page.name}${this.config.urlSuffix}${ - section.anchor ? '#' + section.anchor : '' - }`, + sourceLink: ``, source: this.source, }, }), diff --git a/packages/ingester/src/ingesters/CairoBookIngester.ts b/packages/ingester/src/ingesters/CairoBookIngester.ts index c1183eda..26764542 100644 --- a/packages/ingester/src/ingesters/CairoBookIngester.ts +++ b/packages/ingester/src/ingesters/CairoBookIngester.ts @@ -1,7 +1,9 @@ import * as path from 'path'; -import { BookConfig } from '../utils/types'; +import { BookConfig, BookPageDto } from '../utils/types'; import { MarkdownIngester } from './MarkdownIngester'; -import { DocumentSource } from '@cairo-coder/agents/types/index'; +import { BookChunk, DocumentSource } from '@cairo-coder/agents/types/index'; +import { Document } from '@langchain/core/documents'; +import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore'; /** * Ingester for the Cairo Book documentation @@ -21,13 +23,45 @@ export class CairoBookIngester extends MarkdownIngester { fileExtension: '.md', chunkSize: 4096, chunkOverlap: 512, - baseUrl: 'https://book.cairo-lang.org', - urlSuffix: '.html', }; super(config, DocumentSource.CAIRO_BOOK); } + async downloadLLMSFullFile(): Promise { + const url = 'https://book.cairo-lang.org/llms-full.txt'; + const response = await fetch(url); + const text = await response.text(); + return text; + } + + async chunkLLMSFullFile(text: string): Promise[]> { + return super.createChunkFromPage("cairo-book", text); + } + + /** + * Cairo-Book specific processing based on the LLMS full file - which is a sanitized version of + * the book for LLMs consumption, reducing the amount of noise in the corpus. + * @param vectorStore + */ + public async process(vectorStore: VectorStore): Promise { + try { + // 1. Download and extract documentation + const text = await this.downloadLLMSFullFile(); + + // 2. Create chunks from the documentation + const chunks = await this.chunkLLMSFullFile(text); + + // 3. Update the vector store with the chunks + await this.updateVectorStore(vectorStore, chunks); + + // 4. Clean up any temporary files + await this.cleanupDownloadedFiles(); + } catch (error) { + this.handleError(error); + } + } + /** * Get the directory path for extracting files * diff --git a/packages/ingester/src/ingesters/CairoByExampleIngester.ts b/packages/ingester/src/ingesters/CairoByExampleIngester.ts index 865c46cd..ba75cf23 100644 --- a/packages/ingester/src/ingesters/CairoByExampleIngester.ts +++ b/packages/ingester/src/ingesters/CairoByExampleIngester.ts @@ -21,8 +21,6 @@ export class CairoByExampleIngester extends MarkdownIngester { fileExtension: '.md', chunkSize: 4096, chunkOverlap: 512, - baseUrl: 'https://enitrat.github.io/cairo-by-example', - urlSuffix: '.html', }; super(config, DocumentSource.CAIRO_BY_EXAMPLE); diff --git a/packages/ingester/src/ingesters/MarkdownIngester.ts b/packages/ingester/src/ingesters/MarkdownIngester.ts index 4416914d..56377b82 100644 --- a/packages/ingester/src/ingesters/MarkdownIngester.ts +++ b/packages/ingester/src/ingesters/MarkdownIngester.ts @@ -103,8 +103,19 @@ export abstract class MarkdownIngester extends BaseIngester { const chunks: Document[] = []; for (const page of pages) { + const localChunks = this.createChunkFromPage(page.name, page.content); + chunks.push(...localChunks); + } + return chunks; + } + + /** + * Create a chunk from a single page + */ + protected createChunkFromPage(page_name: string, page_content: string): Document[] { // Sanitize code blocks to avoid parsing issues - const sanitizedContent = this.sanitizeCodeBlocks(page.content); + const localChunks = [] + const sanitizedContent = this.sanitizeCodeBlocks(page_content); // Parse the page into sections const sections = this.parsePage(sanitizedContent, true); @@ -112,28 +123,23 @@ export abstract class MarkdownIngester extends BaseIngester { // Create a document for each section sections.forEach((section: ParsedSection, index: number) => { const hash: string = calculateHash(section.content); - chunks.push( - new Document({ - pageContent: section.content, - metadata: { - name: page.name, + localChunks.push(new Document({ + pageContent: section.content, + metadata: { + name: page_name, title: section.title, chunkNumber: index, contentHash: hash, - uniqueId: `${page.name}-${index}`, - sourceLink: `${this.config.baseUrl}/${page.name}${this.config.urlSuffix}${ - section.anchor ? '#' + section.anchor : '' - }`, + uniqueId: `${page_name}-${index}`, + sourceLink: ``, source: this.source, }, }), ); }); + return localChunks; } - return chunks; - } - /** * Clean up downloaded files */ diff --git a/packages/ingester/src/ingesters/OpenZeppelinDocsIngester.ts b/packages/ingester/src/ingesters/OpenZeppelinDocsIngester.ts index a27b1289..75ada29f 100644 --- a/packages/ingester/src/ingesters/OpenZeppelinDocsIngester.ts +++ b/packages/ingester/src/ingesters/OpenZeppelinDocsIngester.ts @@ -29,8 +29,6 @@ export class OpenZeppelinDocsIngester extends AsciiDocIngester { fileExtension: '.adoc', chunkSize: 4096, chunkOverlap: 512, - baseUrl: 'https://docs.openzeppelin.com', - urlSuffix: '', }; // Find the package root by looking for package.json diff --git a/packages/ingester/src/ingesters/StarknetDocsIngester.ts b/packages/ingester/src/ingesters/StarknetDocsIngester.ts index 990e82c5..e24c5ca3 100644 --- a/packages/ingester/src/ingesters/StarknetDocsIngester.ts +++ b/packages/ingester/src/ingesters/StarknetDocsIngester.ts @@ -24,8 +24,6 @@ export class StarknetDocsIngester extends AsciiDocIngester { fileExtension: '.adoc', chunkSize: 4096, chunkOverlap: 512, - baseUrl: 'https://docs.starknet.io', - urlSuffix: '', }; // Find the package root by looking for package.json diff --git a/packages/ingester/src/ingesters/StarknetFoundryIngester.ts b/packages/ingester/src/ingesters/StarknetFoundryIngester.ts index f977c8de..634f705a 100644 --- a/packages/ingester/src/ingesters/StarknetFoundryIngester.ts +++ b/packages/ingester/src/ingesters/StarknetFoundryIngester.ts @@ -34,8 +34,6 @@ export class StarknetFoundryIngester extends MarkdownIngester { fileExtension: '.md', chunkSize: 4096, chunkOverlap: 512, - baseUrl: 'https://foundry-rs.github.io/starknet-foundry', - urlSuffix: '.html', }; super(config, DocumentSource.STARKNET_FOUNDRY); diff --git a/packages/ingester/src/shared.ts b/packages/ingester/src/shared.ts index c31227c4..86899a65 100644 --- a/packages/ingester/src/shared.ts +++ b/packages/ingester/src/shared.ts @@ -19,7 +19,6 @@ export type BookConfig = { fileExtension: string; chunkSize: number; chunkOverlap: number; - baseUrl: string; }; /** diff --git a/packages/ingester/src/utils/types.ts b/packages/ingester/src/utils/types.ts index f4fb261b..f92000b7 100644 --- a/packages/ingester/src/utils/types.ts +++ b/packages/ingester/src/utils/types.ts @@ -38,12 +38,6 @@ export type BookConfig = { /** The overlap between chunks in characters */ chunkOverlap: number; - - /** The base URL for the documentation */ - baseUrl: string; - - /** The suffix for the documentation files */ - urlSuffix: string; }; /**