Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions packages/ingester/__tests__/AsciiDocIngester.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ describe('AsciiDocIngester', () => {
fileExtension: '.adoc',
chunkSize: 1000,
chunkOverlap: 200,
baseUrl: 'https://example.com',
urlSuffix: '',
},
playbookPath: 'test-playbook.yml',
outputDir: '/tmp/output',
Expand Down
2 changes: 0 additions & 2 deletions packages/ingester/__tests__/MarkdownIngester.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ const markdownIngester = new TestMarkdownIngester(
{
repoOwner: 'test',
repoName: 'test',
baseUrl: 'https://test.com',
fileExtension: 'md',
urlSuffix: '.html',
chunkSize: 1000,
chunkOverlap: 100,
},
Expand Down
4 changes: 1 addition & 3 deletions packages/ingester/src/ingesters/AsciiDocIngester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,7 @@ export abstract class AsciiDocIngester extends BaseIngester {
chunkNumber: index,
contentHash: hash,
uniqueId: `${page.name}-${index}`,
sourceLink: `${this.config.baseUrl}/${page.name}${this.config.urlSuffix}${
section.anchor ? '#' + section.anchor : ''
}`,
sourceLink: ``,
source: this.source,
},
}),
Expand Down
42 changes: 38 additions & 4 deletions packages/ingester/src/ingesters/CairoBookIngester.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import * as path from 'path';
import { BookConfig } from '../utils/types';
import { BookConfig, BookPageDto } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { DocumentSource } from '@cairo-coder/agents/types/index';
import { BookChunk, DocumentSource } from '@cairo-coder/agents/types/index';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore';

/**
* Ingester for the Cairo Book documentation
Expand All @@ -21,13 +23,45 @@ export class CairoBookIngester extends MarkdownIngester {
fileExtension: '.md',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://book.cairo-lang.org',
urlSuffix: '.html',
};

super(config, DocumentSource.CAIRO_BOOK);
}

/**
 * Download the full-text `llms-full.txt` export of the Cairo Book.
 *
 * @returns The raw response body as text.
 * @throws Error if the server answers with a non-2xx status. `fetch` only
 * rejects on network failures, so the status must be checked explicitly;
 * otherwise an HTTP error page would be returned as if it were the book.
 */
async downloadLLMSFullFile(): Promise<string> {
  const url = 'https://book.cairo-lang.org/llms-full.txt';
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `Failed to download ${url}: ${response.status} ${response.statusText}`,
    );
  }
  return response.text();
}

/**
 * Turn the full-text export into chunks by handing the whole text to the
 * inherited page chunker as a single page named "cairo-book".
 *
 * @param text The complete text to chunk.
 * @returns The documents produced by `createChunkFromPage`.
 */
async chunkLLMSFullFile(text: string): Promise<Document<BookChunk>[]> {
  const pageName = 'cairo-book';
  const bookChunks = super.createChunkFromPage(pageName, text);
  return bookChunks;
}

/**
 * Cairo Book ingestion pipeline driven by the `llms-full.txt` export, a
 * sanitized rendition of the book meant for LLM consumption.
 *
 * Steps run in order: download the text, chunk it, push the chunks to the
 * vector store, then run the downloaded-files cleanup. Any error raised by
 * a step is passed to `handleError`.
 *
 * @param vectorStore The vector store that receives the chunks.
 */
public async process(vectorStore: VectorStore): Promise<void> {
  try {
    // Fetch the full-text export.
    const llmsFullText = await this.downloadLLMSFullFile();

    // Break the text into per-section documents.
    const bookChunks = await this.chunkLLMSFullFile(llmsFullText);

    // Persist the documents.
    await this.updateVectorStore(vectorStore, bookChunks);

    // Remove any temporary files.
    await this.cleanupDownloadedFiles();
  } catch (error) {
    this.handleError(error);
  }
}

/**
* Get the directory path for extracting files
*
Expand Down
2 changes: 0 additions & 2 deletions packages/ingester/src/ingesters/CairoByExampleIngester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ export class CairoByExampleIngester extends MarkdownIngester {
fileExtension: '.md',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://enitrat.github.io/cairo-by-example',
urlSuffix: '.html',
};

super(config, DocumentSource.CAIRO_BY_EXAMPLE);
Expand Down
32 changes: 19 additions & 13 deletions packages/ingester/src/ingesters/MarkdownIngester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,37 +103,43 @@ export abstract class MarkdownIngester extends BaseIngester {
const chunks: Document<BookChunk>[] = [];

for (const page of pages) {
const localChunks = this.createChunkFromPage(page.name, page.content);
chunks.push(...localChunks);
}
return chunks;
}

/**
* Create the chunks for a single page, one per parsed section
*/
protected createChunkFromPage(page_name: string, page_content: string): Document<BookChunk>[] {
// Sanitize code blocks to avoid parsing issues
const sanitizedContent = this.sanitizeCodeBlocks(page.content);
const localChunks = []
const sanitizedContent = this.sanitizeCodeBlocks(page_content);

// Parse the page into sections
const sections = this.parsePage(sanitizedContent, true);

// Create a document for each section
sections.forEach((section: ParsedSection, index: number) => {
const hash: string = calculateHash(section.content);
chunks.push(
new Document<BookChunk>({
pageContent: section.content,
metadata: {
name: page.name,
localChunks.push(new Document<BookChunk>({
pageContent: section.content,
metadata: {
name: page_name,
title: section.title,
chunkNumber: index,
contentHash: hash,
uniqueId: `${page.name}-${index}`,
sourceLink: `${this.config.baseUrl}/${page.name}${this.config.urlSuffix}${
section.anchor ? '#' + section.anchor : ''
}`,
uniqueId: `${page_name}-${index}`,
sourceLink: ``,
source: this.source,
},
}),
);
});
return localChunks;
}

return chunks;
}

/**
* Clean up downloaded files
*/
Expand Down
2 changes: 0 additions & 2 deletions packages/ingester/src/ingesters/OpenZeppelinDocsIngester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ export class OpenZeppelinDocsIngester extends AsciiDocIngester {
fileExtension: '.adoc',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://docs.openzeppelin.com',
urlSuffix: '',
};

// Find the package root by looking for package.json
Expand Down
2 changes: 0 additions & 2 deletions packages/ingester/src/ingesters/StarknetDocsIngester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ export class StarknetDocsIngester extends AsciiDocIngester {
fileExtension: '.adoc',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://docs.starknet.io',
urlSuffix: '',
};

// Find the package root by looking for package.json
Expand Down
2 changes: 0 additions & 2 deletions packages/ingester/src/ingesters/StarknetFoundryIngester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ export class StarknetFoundryIngester extends MarkdownIngester {
fileExtension: '.md',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://foundry-rs.github.io/starknet-foundry',
urlSuffix: '.html',
};

super(config, DocumentSource.STARKNET_FOUNDRY);
Expand Down
1 change: 0 additions & 1 deletion packages/ingester/src/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ export type BookConfig = {
fileExtension: string;
chunkSize: number;
chunkOverlap: number;
baseUrl: string;
};

/**
Expand Down
6 changes: 0 additions & 6 deletions packages/ingester/src/utils/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ export type BookConfig = {

/** The overlap between chunks in characters */
chunkOverlap: number;

/** The base URL for the documentation */
baseUrl: string;

/** The suffix for the documentation files */
urlSuffix: string;
};

/**
Expand Down