diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..085d16d2
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,111 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Cairo Coder is an open-source Cairo language code generation service using Retrieval-Augmented Generation (RAG) to transform natural language requests into functional Cairo smart contracts and programs. It was adapted from the Starknet Agent project.
+
+## Essential Commands
+
+### Development
+
+- `pnpm install` - Install dependencies (requires Node.js 20+ and pnpm 9+)
+- `pnpm dev` - Start all services in development mode with hot reload
+- `pnpm build` - Build all packages for production
+- `pnpm clean` - Clean package build files
+- `pnpm clean:all` - Clean all build files and node_modules
+
+### Testing
+
+- `pnpm test` - Run all tests across packages
+- `pnpm --filter @cairo-coder/agents test` - Run tests for specific package
+- `pnpm --filter @cairo-coder/agents test -- -t "test name"` - Run specific test
+- `pnpm --filter @cairo-coder/backend check-types` - Type check specific package
+
+### Documentation Ingestion
+
+- `pnpm generate-embeddings` - Interactive ingestion of documentation sources
+- `pnpm generate-embeddings:yes` - Non-interactive ingestion (for CI/CD)
+
+### Docker Operations
+
+- `docker compose up postgres backend` - Start main services
+- `docker compose up ingester` - Run documentation ingestion
+
+## High-Level Architecture
+
+### Monorepo Structure
+
+- **packages/agents**: Core RAG pipeline orchestrating query processing, document retrieval, and code generation
+- **packages/backend**: Express API server providing OpenAI-compatible endpoints
+- **packages/ingester**: Documentation processing system using template method pattern
+- **packages/typescript-config**: Shared TypeScript configuration
+
+### Key Design Patterns
+
+1. **RAG Pipeline** (packages/agents/src/core/pipeline/):
+
+   - `QueryProcessor`: Reformulates user queries for better retrieval
+   - `DocumentRetriever`: Searches pgvector database using similarity measures
+   - `AnswerGenerator`: Generates Cairo code from retrieved documents
+   - `McpPipeline`: Special mode returning raw documents without generation
+
+2. **Ingester System** (packages/ingester/src/ingesters/):
+
+   - `BaseIngester`: Abstract class implementing template method pattern
+   - Source-specific ingesters extend base class for each documentation source
+   - Factory pattern (`IngesterFactory`) creates appropriate ingester instances
+
+3. **Multi-Provider LLM Support**:
+   - Configurable providers: OpenAI, Anthropic, Google Gemini
+   - Provider abstraction in agents package handles model differences
+   - Streaming and non-streaming response modes
+
+### Configuration
+
+- Copy `packages/agents/sample.config.toml` to `config.toml`
+- Required configurations:
+  - LLM provider API keys (OPENAI, GEMINI, ANTHROPIC)
+  - Database connection in [VECTOR_DB] section
+  - Model selection in [PROVIDERS] section
+- Environment variables:
+  - Root `.env`: PostgreSQL initialization (POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB)
+  - `packages/backend/.env`: Optional LangSmith tracing configuration
+
+### Database Architecture
+
+- PostgreSQL with pgvector extension for vector similarity search
+- Embedding storage for documentation chunks
+- Configurable similarity measures (cosine, dot product, euclidean)
+
+## Development Guidelines
+
+### Code Organization
+
+- Follow existing patterns in neighboring files
+- Use dependency injection for testability
+- Mock external dependencies (LLMs, databases) in tests
+- Prefer editing existing files over creating new ones
+- Follow template method pattern for new ingesters
+
+### Testing Approach
+
+- Jest for all testing
+- Test files in `__tests__/` directories
+- Mock LLM calls and database operations
+- Test each ingester implementation separately
+- Use descriptive test names explaining behavior
+
+### Adding New Documentation Sources
+
+1. Create new ingester extending `BaseIngester` in packages/ingester/src/ingesters/
+2. Implement required abstract methods
+3. Register in `IngesterFactory`
+4. Update configuration if needed
+
+### MCP (Model Context Protocol) Mode
+
+- Special mode activated via `x-mcp-mode: true` header
+- Returns raw documentation chunks without LLM generation
+- Useful for integration with other tools needing Cairo documentation
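As a companion to the MCP section above, this is roughly what a client request looks like. A minimal sketch: the `x-mcp-mode: true` header is taken from CLAUDE.md, but the URL, port, and OpenAI-style payload/response shapes are assumptions based on the backend's description as an OpenAI-compatible Express server, not something this diff confirms.

```typescript
// Hypothetical MCP-mode client. The header name comes from CLAUDE.md; the
// endpoint path, port, and response shape are assumed placeholders.
async function fetchRawCairoDocs(query: string): Promise<string> {
  const response = await fetch('http://localhost:3001/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-mcp-mode': 'true', // skip LLM generation, return raw documentation chunks
    },
    body: JSON.stringify({ messages: [{ role: 'user', content: query }] }),
  });
  const data = await response.json();
  // Assuming an OpenAI-compatible completion envelope
  return data.choices?.[0]?.message?.content ?? '';
}
```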
diff --git a/packages/agents/src/config/agent.ts b/packages/agents/src/config/agent.ts
index 77ca48d4..189982d0 100644
--- a/packages/agents/src/config/agent.ts
+++ b/packages/agents/src/config/agent.ts
@@ -4,9 +4,7 @@ import { basicTestTemplate } from './templates/testTemplate';
 import { VectorStore } from '../db/postgresVectorStore';
 import { DocumentSource, RagSearchConfig } from '../types';
 
-export const getAgentConfig = (
-  vectorStore: VectorStore,
-): RagSearchConfig => {
+export const getAgentConfig = (vectorStore: VectorStore): RagSearchConfig => {
   return {
     name: 'Cairo Coder',
     prompts: cairoCoderPrompts,
@@ -19,6 +17,9 @@ export const getAgentConfig = (
       DocumentSource.CAIRO_BOOK,
       DocumentSource.CAIRO_BY_EXAMPLE,
       DocumentSource.STARKNET_FOUNDRY,
+      DocumentSource.CORELIB_DOCS,
+      DocumentSource.OPENZEPPELIN_DOCS,
+      DocumentSource.SCARB_DOCS,
     ],
   };
 };
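A small regression test could pin down the expanded source list. This sketch assumes the list is exposed as a `sources` field on `RagSearchConfig` (the hunk shows the array entries but not the property name) and that an empty mock is enough to stand in for `VectorStore` in this call:

```typescript
import { getAgentConfig } from '../config/agent';
import { VectorStore } from '../db/postgresVectorStore';
import { DocumentSource } from '../types';

describe('getAgentConfig', () => {
  it('includes the corelib, OpenZeppelin, and Scarb sources', () => {
    const config = getAgentConfig({} as VectorStore);

    // 'sources' is an assumed property name for the DocumentSource array
    expect(config.sources).toEqual(
      expect.arrayContaining([
        DocumentSource.CORELIB_DOCS,
        DocumentSource.OPENZEPPELIN_DOCS,
        DocumentSource.SCARB_DOCS,
      ]),
    );
  });
});
```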
diff --git a/packages/agents/src/config/prompts/cairoCoderPrompts.ts b/packages/agents/src/config/prompts/cairoCoderPrompts.ts
index 0562ab89..4cbef25f 100644
--- a/packages/agents/src/config/prompts/cairoCoderPrompts.ts
+++ b/packages/agents/src/config/prompts/cairoCoderPrompts.ts
@@ -29,6 +29,7 @@ You will be given a conversation history and a follow-up question. Your primary
 * **cairo_by_example:** Cairo by Example Documentation. Provides practical Cairo code snippets for specific language features or common patterns. Useful for "how-to" syntax questions.
 * **openzeppelin_docs:** OpenZeppelin Cairo Contracts Documentation. For using the OZ library: standard implementations (ERC20, ERC721), access control, security patterns, contract upgradeability. Crucial for building standard-compliant contracts.
 * **corelib_docs:** Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.
+* **scarb_docs:** Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, configuration of Scarb.toml.
 
 **Examples:**
diff --git a/packages/agents/src/core/pipeline/documentRetriever.ts b/packages/agents/src/core/pipeline/documentRetriever.ts
index 8256c9a6..f1e44561 100644
--- a/packages/agents/src/core/pipeline/documentRetriever.ts
+++ b/packages/agents/src/core/pipeline/documentRetriever.ts
@@ -54,7 +54,11 @@ export class DocumentRetriever {
     ].map(
       (content) => results.flat().find((doc) => doc.pageContent === content)!,
     );
-    logger.debug('Retrieved documents:', { count: uniqueDocs.length });
+    const sourceSet = new Set(uniqueDocs.map((doc) => doc.metadata.source));
+    logger.debug('Retrieved documents:', {
+      count: uniqueDocs.length,
+      sources: Array.from(sourceSet),
+    });
     return uniqueDocs;
   }
 
diff --git a/packages/agents/src/core/pipeline/mcpPipeline.ts b/packages/agents/src/core/pipeline/mcpPipeline.ts
index 2621da9d..66533c30 100644
--- a/packages/agents/src/core/pipeline/mcpPipeline.ts
+++ b/packages/agents/src/core/pipeline/mcpPipeline.ts
@@ -1,5 +1,5 @@
 import { RagPipeline } from './ragPipeline';
-import { RagInput, StreamHandler } from '../../types';
+import { RagInput, RetrievedDocuments, StreamHandler } from '../../types';
 import { logger, TokenTracker } from '../../utils';
 
 /**
@@ -14,7 +14,7 @@ export class McpPipeline extends RagPipeline {
     try {
       // Reset token counters at the start of each pipeline run
       TokenTracker.resetSessionCounters();
-
+
       logger.info('Starting MCP pipeline', { query: input.query });
 
       // Step 1: Process the query
@@ -30,33 +30,54 @@ export class McpPipeline extends RagPipeline {
 
       // Step 3: Return raw documents without answer generation
       logger.info('MCP mode - returning raw documents');
-
-      const rawDocuments = retrieved.documents.map(doc => ({
-        pageContent: doc.pageContent,
-        metadata: doc.metadata
-      }));
+
+      const context = this.assembleDocuments(retrieved);
 
       handler.emitResponse({
-        content: JSON.stringify(rawDocuments, null, 2),
+        content: JSON.stringify(context, null, 2),
       } as any);
 
       logger.debug('MCP pipeline ended');
-
+
       // Log final token usage
       const tokenUsage = TokenTracker.getSessionTokenUsage();
-      logger.info('MCP Pipeline completed', {
+      logger.info('MCP Pipeline completed', {
         query: input.query,
         tokenUsage: {
           promptTokens: tokenUsage.promptTokens,
           responseTokens: tokenUsage.responseTokens,
-          totalTokens: tokenUsage.totalTokens
-        }
+          totalTokens: tokenUsage.totalTokens,
+        },
       });
-
+
       handler.emitEnd();
     } catch (error) {
       logger.error('MCP Pipeline error:', error);
       handler.emitError('An error occurred while processing your request');
     }
   }
-}
\ No newline at end of file
+
+  public assembleDocuments(retrieved: RetrievedDocuments): string {
+    const docs = retrieved.documents;
+    if (!docs.length) {
+      return (
+        this.config.prompts.noSourceFoundPrompt ||
+        'No relevant information found.'
+      );
+    }
+
+    // Concatenate all document content into a single string
+    let context = docs.map((doc) => doc.pageContent).join('\n\n');
+
+    // Add contract and test templates at the end if applicable
+    const { isContractRelated, isTestRelated } = retrieved.processedQuery;
+    if (isContractRelated && this.config.contractTemplate) {
+      context += '\n\n' + this.config.contractTemplate;
+    }
+    if (isTestRelated && this.config.testTemplate) {
+      context += '\n\n' + this.config.testTemplate;
+    }
+
+    return context;
+  }
+}
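The new `assembleDocuments` helper changes what MCP mode emits: instead of a JSON array of `{ pageContent, metadata }` pairs, clients now receive one concatenated context string with optional templates appended. A behavioral sketch, assuming a single-argument constructor (the `RagPipeline` base constructor is outside this diff) and treating the config stub as partial:

```typescript
import { McpPipeline } from './mcpPipeline';
import { RetrievedDocuments } from '../../types';

// Partial config stub; the real RagSearchConfig carries more fields.
const config = {
  prompts: { noSourceFoundPrompt: 'No relevant information found.' },
  contractTemplate: '// contract template...',
} as any;

// Assumed construction; match it to the actual RagPipeline constructor.
const pipeline = new McpPipeline(config);

const retrieved = {
  documents: [
    { pageContent: 'Storage in Cairo contracts...', metadata: { source: 'cairo_book' } },
    { pageContent: 'ERC20 component usage...', metadata: { source: 'openzeppelin_docs' } },
  ],
  processedQuery: { isContractRelated: true, isTestRelated: false },
} as RetrievedDocuments;

// Documents are joined with blank lines; because the query is contract-related
// and a contractTemplate is configured, the template is appended at the end:
// 'Storage in Cairo contracts...\n\nERC20 component usage...\n\n// contract template...'
const context = pipeline.assembleDocuments(retrieved);
```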
diff --git a/packages/agents/src/types/index.ts b/packages/agents/src/types/index.ts
index 7e281d1b..3416e825 100644
--- a/packages/agents/src/types/index.ts
+++ b/packages/agents/src/types/index.ts
@@ -103,6 +103,7 @@ export enum DocumentSource {
   CAIRO_BY_EXAMPLE = 'cairo_by_example',
   OPENZEPPELIN_DOCS = 'openzeppelin_docs',
   CORELIB_DOCS = 'corelib_docs',
+  SCARB_DOCS = 'scarb_docs',
 }
 
 export type BookChunk = {
diff --git a/packages/ingester/src/IngesterFactory.ts b/packages/ingester/src/IngesterFactory.ts
index b33bbee3..f3a97ce7 100644
--- a/packages/ingester/src/IngesterFactory.ts
+++ b/packages/ingester/src/IngesterFactory.ts
@@ -54,6 +54,10 @@ export class IngesterFactory {
         } = require('./ingesters/CoreLibDocsIngester');
         return new CoreLibDocsIngester();
 
+      case 'scarb_docs':
+        const { ScarbDocsIngester } = require('./ingesters/ScarbDocsIngester');
+        return new ScarbDocsIngester();
+
       default:
         throw new Error(`Unsupported source: ${source}`);
     }
@@ -72,6 +76,7 @@ export class IngesterFactory {
       DocumentSource.CAIRO_BY_EXAMPLE,
       DocumentSource.OPENZEPPELIN_DOCS,
       DocumentSource.CORELIB_DOCS,
+      DocumentSource.SCARB_DOCS,
     ];
   }
 }
diff --git a/packages/ingester/src/generateEmbeddings.ts b/packages/ingester/src/generateEmbeddings.ts
index c69c246f..51547552 100644
--- a/packages/ingester/src/generateEmbeddings.ts
+++ b/packages/ingester/src/generateEmbeddings.ts
@@ -6,7 +6,6 @@ import { loadOpenAIEmbeddingsModels } from '@cairo-coder/backend/config/provider
 import { DocumentSource } from '@cairo-coder/agents/types/index';
 import { IngesterFactory } from './IngesterFactory';
 
-
 /**
  * Global vector store instance
  */
@@ -138,9 +137,7 @@ async function main() {
   if (target === 'Everything') {
     // Ingest all sources
     const sources = IngesterFactory.getAvailableSources();
-    for (const source of sources) {
-      await ingestSource(source);
-    }
+    await Promise.all(sources.map((source) => ingestSource(source)));
   } else {
     // Ingest specific source
     await ingestSource(target);
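Note that replacing the sequential `for...of` loop with `Promise.all` ingests every source concurrently, which also changes failure semantics: one rejected `ingestSource` call rejects the whole batch while the remaining ingesters keep running in the background. If embedding-API rate limits or clone bandwidth ever make full concurrency a problem, a small cap keeps most of the speedup; a dependency-free sketch, not part of this diff:

```typescript
// Run `task` over `items` with at most `limit` invocations in flight.
async function mapWithLimit<T>(
  items: readonly T[],
  limit: number,
  task: (item: T) => Promise<void>,
): Promise<void> {
  const queue = [...items];
  const workers = Array.from(
    { length: Math.min(limit, queue.length) },
    async () => {
      for (let item = queue.shift(); item !== undefined; item = queue.shift()) {
        await task(item);
      }
    },
  );
  await Promise.all(workers);
}

// Possible usage in main(), capping ingestion at two sources at a time:
// await mapWithLimit(IngesterFactory.getAvailableSources(), 2, ingestSource);
```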
diff --git a/packages/ingester/src/ingesters/ScarbDocsIngester.ts b/packages/ingester/src/ingesters/ScarbDocsIngester.ts
new file mode 100644
index 00000000..25c84a81
--- /dev/null
+++ b/packages/ingester/src/ingesters/ScarbDocsIngester.ts
@@ -0,0 +1,72 @@
+import * as path from 'path';
+import { DocumentSource } from '@cairo-coder/agents/types/index';
+import { BookConfig, BookPageDto } from '../utils/types';
+import { processDocFiles } from '../utils/fileUtils';
+import { logger } from '@cairo-coder/agents/utils/index';
+import { exec as execCallback } from 'child_process';
+import { promisify } from 'util';
+import { MarkdownIngester } from './MarkdownIngester';
+
+/**
+ * Ingester for the Scarb documentation
+ *
+ * This ingester downloads the Scarb documentation from the GitHub repository,
+ * processes the markdown files from the website/docs directory, and creates chunks for the vector store.
+ */
+export class ScarbDocsIngester extends MarkdownIngester {
+  /**
+   * Constructor for the Scarb docs ingester
+   */
+  constructor() {
+    // Define the configuration for the Scarb documentation
+    const config: BookConfig = {
+      repoOwner: 'software-mansion',
+      repoName: 'scarb',
+      fileExtension: '.md',
+      chunkSize: 4096,
+      chunkOverlap: 512,
+    };
+
+    super(config, DocumentSource.SCARB_DOCS);
+  }
+
+  /**
+   * Get the directory path for extracting files
+   *
+   * @returns string - Path to the extract directory
+   */
+  protected getExtractDir(): string {
+    return path.join(__dirname, '..', '..', 'temp', 'scarb-docs');
+  }
+
+  /**
+   * Download and extract the repository
+   *
+   * @returns Promise<BookPageDto[]> - Array of book pages
+   */
+  protected async downloadAndExtractDocs(): Promise<BookPageDto[]> {
+    const extractDir = this.getExtractDir();
+    const repoUrl = `https://github.com/${this.config.repoOwner}/${this.config.repoName}.git`;
+
+    logger.info(`Cloning repository from ${repoUrl}`);
+
+    // Clone the repository
+    const exec = promisify(execCallback);
+    try {
+      await exec(`git clone ${repoUrl} ${extractDir}`);
+    } catch (error) {
+      logger.error('Error cloning repository:', error);
+      throw new Error('Failed to clone repository');
+    }
+
+    logger.info('Repository cloned successfully.');
+
+    // Process the markdown files from website/docs directory
+    const docsDir = path.join(extractDir, 'website', 'docs');
+    const pages = await processDocFiles(this.config, docsDir);
+
+    logger.info(`Processed ${pages.length} documentation pages from Scarb`);
+
+    return pages;
+  }
+}
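With the enum value, the factory case, and the ingester itself in place, `scarb_docs` flows through the existing entry points: `pnpm generate-embeddings` offers it interactively, and the 'Everything' target picks it up via `getAvailableSources()`. Programmatic use would look roughly like this; `createIngester` is an assumed method name, since the factory method containing the `case 'scarb_docs':` branch is only partially visible in the hunk above:

```typescript
import { IngesterFactory } from './IngesterFactory';
import { DocumentSource } from '@cairo-coder/agents/types/index';

// 'createIngester' is an assumption; align it with the factory's real API.
// The returned ScarbDocsIngester clones software-mansion/scarb into
// temp/scarb-docs (requires git and network access) and chunks the
// markdown under website/docs for embedding.
const ingester = IngesterFactory.createIngester(DocumentSource.SCARB_DOCS);
```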