From 868593597a10ac3314ab4cf8e8601bdf1adb0e3e Mon Sep 17 00:00:00 2001 From: enitrat Date: Tue, 15 Jul 2025 01:10:07 +0100 Subject: [PATCH 1/3] migrate to DSPy --- .kiro/specs/agents-python-port/design.md | 807 ++++++++++++++++++ .../specs/agents-python-port/requirements.md | 341 ++++++++ .kiro/specs/agents-python-port/tasks.md | 142 +++ design.md | 807 ++++++++++++++++++ requirements.md | 341 ++++++++ 5 files changed, 2438 insertions(+) create mode 100644 .kiro/specs/agents-python-port/design.md create mode 100644 .kiro/specs/agents-python-port/requirements.md create mode 100644 .kiro/specs/agents-python-port/tasks.md create mode 100644 design.md create mode 100644 requirements.md diff --git a/.kiro/specs/agents-python-port/design.md b/.kiro/specs/agents-python-port/design.md new file mode 100644 index 00000000..b9c8af1d --- /dev/null +++ b/.kiro/specs/agents-python-port/design.md @@ -0,0 +1,807 @@ +# Design Document + +## Overview + +This document describes the design for porting the Cairo Coder agents package from TypeScript to Python using the DSPy framework. The design maintains the same RAG pipeline architecture while leveraging Python's AI ecosystem through a microservice approach that communicates with the existing TypeScript backend. + +## Architecture + +### High-Level Architecture + +```mermaid +graph TB + subgraph "TypeScript Backend" + A[Chat Completion Handler] --> B[Agent Factory Proxy] + B --> C[HTTP/WebSocket Client] + C --> D[Event Emitter Adapter] + end + + subgraph "Python Microservice" + E[FastAPI Server] --> F[Agent Factory] + F --> G[RAG Pipeline] + G --> H[Query Processor] + G --> I[Document Retriever] + G --> J[Response Generator] + end + + subgraph "Shared Infrastructure" + K[PostgreSQL Vector Store] + L[LLM Providers] + M[Configuration Files] + end + + C <--> E + I --> K + H --> L + J --> L + F --> M +``` + +### Communication Flow + +```mermaid +sequenceDiagram + participant TS as TypeScript Backend + participant PY as Python Microservice + participant VS as Vector Store + participant LLM as LLM Provider + + TS->>PY: POST /agents/process (query, history, agentId, mcpMode) + PY->>PY: Load Agent Configuration + PY->>LLM: Process Query (DSPy QueryProcessor) + PY->>VS: Similarity Search + PY->>PY: Rerank Documents + PY-->>TS: Stream: {"type": "sources", "data": [...]} + + alt MCP Mode + PY-->>TS: Stream: {"type": "response", "data": "raw_documents"} + else Normal Mode + PY->>LLM: Generate Response (DSPy Generator) + loop Streaming Response + PY-->>TS: Stream: {"type": "response", "data": "chunk"} + end + end + + PY-->>TS: Stream: {"type": "end"} +``` +## Components and Interfaces + +### 1. FastAPI Microservice Server + +**Purpose**: HTTP/WebSocket server that handles requests from TypeScript backend + +**Interface**: +```python +class AgentServer: + async def process_agent_request( + self, + query: str, + chat_history: List[Message], + agent_id: Optional[str] = None, + mcp_mode: bool = False + ) -> AsyncGenerator[Dict[str, Any], None] +``` + +**Key Features**: +- WebSocket support for real-time streaming +- Request validation and error handling +- CORS configuration for cross-origin requests +- Health check endpoints + +### 2. 
Agent Factory + +**Purpose**: Creates and configures agents based on agent ID or default configuration + +**Interface**: +```python +class AgentFactory: + @staticmethod + def create_agent( + query: str, + history: List[Message], + vector_store: VectorStore, + mcp_mode: bool = False + ) -> RagPipeline + + @staticmethod + async def create_agent_by_id( + query: str, + history: List[Message], + agent_id: str, + vector_store: VectorStore, + mcp_mode: bool = False + ) -> RagPipeline +``` + +### 3. RAG Pipeline (DSPy-based) + +**Purpose**: Orchestrates the three-stage RAG workflow using DSPy modules + +**Interface**: +```python +class RagPipeline(dspy.Module): + """Main pipeline that chains query processing, retrieval, and generation.""" + + def __init__(self, config: RagSearchConfig): + super().__init__() + self.config = config + + # Initialize DSPy modules for each stage + self.query_processor = QueryProcessor(config.retrieval_program) + self.document_retriever = DocumentRetriever(config) + self.response_generator = config.generation_program + + async def forward( + self, + query: str, + chat_history: List[Message], + mcp_mode: bool = False + ) -> AsyncGenerator[StreamEvent, None]: + """Execute the RAG pipeline with streaming support.""" + + # Stage 1: Process query + processed_query = self.query_processor( + query=query, + chat_history=self._format_history(chat_history) + ) + + # Stage 2: Retrieve documents + documents = await self.document_retriever( + processed_query=processed_query, + sources=self.config.sources + ) + + # Emit sources event + yield StreamEvent(type="sources", data=documents) + + if mcp_mode: + # Return raw documents in MCP mode + yield StreamEvent(type="response", data=self._format_documents(documents)) + else: + # Stage 3: Generate response + context = self._prepare_context(documents) + response = self.response_generator( + query=query, + chat_history=self._format_history(chat_history), + context=context + ) + + # Stream response chunks + for chunk in self._chunk_response(response.answer): + yield StreamEvent(type="response", data=chunk) + + yield StreamEvent(type="end", data=None) +``` +### 4. 
DSPy Program Mappings + +#### Query Processing Components + +**Retrieval Signature** (maps from retrieval.program.ts): +```python +class CairoQueryAnalysis(dspy.Signature): + """Analyze a Cairo programming query to extract search terms and identify relevant documentation sources.""" + + chat_history = dspy.InputField( + desc="Previous conversation context, may be empty", + default="" + ) + query = dspy.InputField( + desc="User's Cairo/Starknet programming question" + ) + search_terms = dspy.OutputField( + desc="List of specific search terms to find relevant documentation" + ) + resources = dspy.OutputField( + desc="List of documentation sources from: cairo_book, starknet_docs, starknet_foundry, cairo_by_example, openzeppelin_docs, corelib_docs, scarb_docs" + ) + +# Create the retrieval program +retrieval_program = dspy.ChainOfThought(CairoQueryAnalysis) +``` + +**QueryProcessor Module** (maps from queryProcessor.program.ts): +```python +class QueryProcessor(dspy.Module): + """Processes user queries into structured format for retrieval.""" + + def __init__(self, retrieval_program: dspy.Module): + super().__init__() + self.retrieval_program = retrieval_program + + def forward(self, query: str, chat_history: str = "") -> ProcessedQuery: + # Execute the retrieval program + result = self.retrieval_program( + query=query, + chat_history=chat_history + ) + + # Build ProcessedQuery matching TypeScript structure + return ProcessedQuery( + original=query, + transformed=result.search_terms, + is_contract_related=self._is_contract_query(query), + is_test_related=self._is_test_query(query), + resources=self._validate_resources(result.resources) + ) + + def _is_contract_query(self, query: str) -> bool: + """Check if query is about smart contracts.""" + contract_keywords = ['contract', 'interface', 'trait', 'impl', 'storage'] + return any(kw in query.lower() for kw in contract_keywords) + + def _is_test_query(self, query: str) -> bool: + """Check if query is about testing.""" + test_keywords = ['test', 'testing', 'assert', 'mock', 'fixture'] + return any(kw in query.lower() for kw in test_keywords) + + def _validate_resources(self, resources: List[str]) -> List[DocumentSource]: + """Validate and convert resource strings to DocumentSource enum.""" + valid_resources = [] + for r in resources: + try: + valid_resources.append(DocumentSource(r)) + except ValueError: + continue + return valid_resources or [DocumentSource.CAIRO_BOOK] # Default fallback +``` + +#### Document Retrieval Component + +**DocumentRetriever Module** (maps from documentRetriever.program.ts): +```python +class DocumentRetriever(dspy.Module): + """Retrieves and ranks relevant documents from vector store.""" + + def __init__(self, config: RagSearchConfig): + super().__init__() + self.config = config + self.vector_store = config.vector_store + self.embedder = dspy.Embedder(model="text-embedding-3-large") + + async def forward( + self, + processed_query: ProcessedQuery, + sources: List[DocumentSource] + ) -> List[Document]: + """Three-step retrieval process: fetch, rerank, attach metadata.""" + + # Step 1: Fetch documents (maps to fetchDocuments) + docs = await self._fetch_documents(processed_query, sources) + + # Step 2: Rerank documents (maps to rerankDocuments) + if docs: + docs = await self._rerank_documents(processed_query.original, docs) + + # Step 3: Attach sources (maps to attachSources) + return self._attach_sources(docs) + + async def _fetch_documents( + self, + processed_query: ProcessedQuery, + sources: List[DocumentSource] + ) 
-> List[Document]: + """Fetch documents from vector store.""" + return await self.vector_store.similarity_search( + query=processed_query.original, + k=self.config.max_source_count, + sources=sources + ) + + async def _rerank_documents( + self, + query: str, + docs: List[Document] + ) -> List[Document]: + """Rerank documents by cosine similarity.""" + # Get embeddings + query_embedding = await self.embedder.embed([query]) + doc_texts = [d.page_content for d in docs] + doc_embeddings = await self.embedder.embed(doc_texts) + + # Calculate similarities + similarities = [] + for doc_emb in doc_embeddings: + similarity = self._cosine_similarity(query_embedding[0], doc_emb) + similarities.append(similarity) + + # Filter by threshold and sort + ranked_docs = [ + (doc, sim) for doc, sim in zip(docs, similarities) + if sim >= self.config.similarity_threshold + ] + ranked_docs.sort(key=lambda x: x[1], reverse=True) + + return [doc for doc, _ in ranked_docs[:self.config.max_source_count]] + + def _cosine_similarity(self, a: List[float], b: List[float]) -> float: + """Calculate cosine similarity between two vectors.""" + import numpy as np + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + + def _attach_sources(self, docs: List[Document]) -> List[Document]: + """Attach metadata like title and URL to documents.""" + for doc in docs: + # Add source metadata based on document source + source = doc.metadata.get('source', '') + doc.metadata['title'] = self._get_title(doc) + doc.metadata['url'] = self._get_url(doc) + return docs +``` + +#### Generation Components + +**Cairo Generation Signature** (maps from generation.program.ts): +```python +class CairoCodeGeneration(dspy.Signature): + """Generate Cairo smart contract code based on context and user query.""" + + chat_history = dspy.InputField( + desc="Previous conversation context for continuity" + ) + query = dspy.InputField( + desc="User's specific Cairo programming question or request" + ) + context = dspy.InputField( + desc="Retrieved Cairo documentation, examples, and relevant information" + ) + answer = dspy.OutputField( + desc="Complete Cairo code solution with explanations, following Cairo syntax and best practices" + ) + +# Create generation program with Chain of Thought reasoning +generation_program = dspy.ChainOfThought( + CairoCodeGeneration, + rationale_field=dspy.OutputField( + prefix="Reasoning: Let me analyze the Cairo requirements step by step.", + desc="Step-by-step analysis of the Cairo programming task" + ) +) +``` + +**Scarb-specific Programs** (maps from scarb-*.program.ts): +```python +class ScarbQueryAnalysis(dspy.Signature): + """Analyze Scarb build tool queries to extract relevant search terms.""" + + chat_history = dspy.InputField(desc="Previous conversation", default="") + query = dspy.InputField(desc="User's Scarb-related question") + search_terms = dspy.OutputField( + desc="Scarb-specific search terms (commands, configuration, dependencies)" + ) + resources = dspy.OutputField( + desc="Always includes 'scarb_docs' as primary source" + ) + +class ScarbGeneration(dspy.Signature): + """Generate Scarb configuration, commands, and troubleshooting guidance.""" + + chat_history = dspy.InputField(desc="Previous conversation") + query = dspy.InputField(desc="User's Scarb question") + context = dspy.InputField(desc="Scarb documentation and examples") + answer = dspy.OutputField( + desc="Scarb commands, TOML configurations, or troubleshooting steps with proper formatting" + ) + +# Create Scarb-specific programs 
scarb_retrieval_program = dspy.ChainOfThought(ScarbQueryAnalysis)
scarb_generation_program = dspy.ChainOfThought(ScarbGeneration)
```

#### Loading Optimized Configurations

```python
def load_optimized_programs(programs_dir: str = "optimized_programs"):
    """Load DSPy programs with pre-optimized prompts and demonstrations."""

    programs = {}

    # Load each optimized program
    for program_name in ['retrieval', 'generation', 'scarb_retrieval', 'scarb_generation']:
        program_path = os.path.join(programs_dir, f"{program_name}.json")

        if os.path.exists(program_path):
            # Load optimized program with learned prompts and demos
            programs[program_name] = dspy.load(program_path)
        else:
            # Fallback to base programs
            if program_name == 'retrieval':
                programs[program_name] = retrieval_program
            elif program_name == 'generation':
                programs[program_name] = generation_program
            elif program_name == 'scarb_retrieval':
                programs[program_name] = scarb_retrieval_program
            elif program_name == 'scarb_generation':
                programs[program_name] = scarb_generation_program

    return programs
```

### 5. Vector Store Integration

**Purpose**: Interface with PostgreSQL vector database for document retrieval

**Interface**:
```python
class VectorStore:
    def __init__(self, config: VectorStoreConfig):
        self.pool = asyncpg.create_pool(...)
        self.embedding_client = OpenAIEmbeddings()

    async def similarity_search(
        self,
        query: str,
        k: int = 5,
        sources: Optional[Union[DocumentSource, List[DocumentSource]]] = None
    ) -> List[Document]

    async def add_documents(
        self,
        documents: List[Document],
        ids: Optional[List[str]] = None
    ) -> None
```

### 6. LLM Configuration with DSPy

**Purpose**: Configure and manage multiple LLM providers through DSPy's unified interface

**Implementation**:
```python
class LLMConfig:
    """Manages LLM configuration for DSPy."""

    @staticmethod
    def configure_providers(config: Config) -> Dict[str, dspy.LM]:
        """Configure all available LLM providers."""
        providers = {}

        # Configure OpenAI
        if config.openai_api_key:
            providers['openai'] = dspy.LM(
                model=config.openai_model or "openai/gpt-4o",
                api_key=config.openai_api_key,
                temperature=config.temperature
            )

        # Configure Anthropic
        if config.anthropic_api_key:
            providers['anthropic'] = dspy.LM(
                model=config.anthropic_model or "anthropic/claude-3-5-sonnet",
                api_key=config.anthropic_api_key,
                temperature=config.temperature
            )

        # Configure Google Gemini
        if config.gemini_api_key:
            providers['gemini'] = dspy.LM(
                model=config.gemini_model or "google/gemini-1.5-pro",
                api_key=config.gemini_api_key,
                temperature=config.temperature
            )

        return providers

    @staticmethod
    def set_default_lm(providers: Dict[str, dspy.LM], default: str = "openai"):
        """Set the default LM for all DSPy operations."""
        if default in providers:
            dspy.configure(lm=providers[default])
        elif providers:
            # Fallback to first available provider
            dspy.configure(lm=next(iter(providers.values())))
        else:
            raise ValueError("No LLM providers configured")

# Usage in initialization
class AgentInitializer:
    def __init__(self, config: Config):
        # Configure LLM providers
        self.providers = LLMConfig.configure_providers(config)
        LLMConfig.set_default_lm(self.providers, config.default_provider)

        # Configure embeddings separately if needed
        self.embedder = dspy.Embedder(
            model=config.embedding_model or "text-embedding-3-large",
            api_key=config.openai_api_key  # Embeddings typically use OpenAI
        )
```

**Streaming Support**:
```python
from dspy.utils import streamify

class StreamingPipeline:
    """Wrapper for streaming DSPy module responses."""

    def __init__(self, module: dspy.Module):
        self.module = module
        self.streaming_module = streamify(module)

    async def stream_response(
        self,
        **kwargs
    ) -> AsyncGenerator[str, None]:
        """Stream response chunks from the module."""
        async for chunk in self.streaming_module(**kwargs):
            yield chunk
```

### 7. Configuration Management

**Purpose**: Load and manage configuration from TOML files and environment variables

**Interface**:
```python
class ConfigManager:
    @staticmethod
    def load_config() -> Config:
        # Load from config.toml and environment variables
        pass

    @staticmethod
    def get_agent_config(agent_id: str) -> AgentConfiguration:
        # Load agent-specific configuration
        pass
```

## Data Models

### Core Data Structures

```python
@dataclass
class ProcessedQuery:
    original: str
    transformed: Union[str, List[str]]
    is_contract_related: bool = False
    is_test_related: bool = False
    resources: List[DocumentSource] = field(default_factory=list)

@dataclass
class Document:
    page_content: str
    metadata: Dict[str, Any]

@dataclass
class RagInput:
    query: str
    chat_history: List[Message]
    sources: Union[DocumentSource, List[DocumentSource]]

@dataclass
class StreamEvent:
    type: str  # "sources", "response", "end", "error"
    data: Any

@dataclass
class RagSearchConfig:
    name: str
    vector_store: VectorStore
    contract_template: Optional[str] = None
    test_template: Optional[str] = None
    max_source_count: int = 10
    similarity_threshold: float = 0.4
    sources: Union[DocumentSource, List[DocumentSource]] = None
    retrieval_program: dspy.Module = None
    generation_program: dspy.Module = None

class DocumentSource(Enum):
    CAIRO_BOOK = "cairo_book"
    STARKNET_DOCS = "starknet_docs"
    STARKNET_FOUNDRY = "starknet_foundry"
    CAIRO_BY_EXAMPLE = "cairo_by_example"
    OPENZEPPELIN_DOCS = "openzeppelin_docs"
    CORELIB_DOCS = "corelib_docs"
    SCARB_DOCS = "scarb_docs"
```

## Error Handling

### Error Categories

1. **Configuration Errors**: Missing API keys, invalid agent IDs
2. **Database Errors**: Connection failures, query errors
3. **LLM Provider Errors**: Rate limits, API failures
4. **Validation Errors**: Invalid input parameters
5. **Processing Errors**: Pipeline execution failures

### Error Response Format

```python
@dataclass
class ErrorResponse:
    type: str  # "configuration_error", "database_error", etc.
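    # e.g. "llm_provider_error", "validation_error", "processing_error";
    # one string per error category listed above (names are illustrative)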
    message: str
    details: Optional[Dict[str, Any]] = None
    timestamp: datetime = field(default_factory=datetime.now)
```

## Testing Strategy

### Unit Testing with DSPy

**Testing DSPy Modules**:
```python
import pytest
import dspy
from unittest.mock import Mock, patch

class TestQueryProcessor:
    @pytest.fixture
    def mock_lm(self):
        """Configure DSPy with a mock LM for testing."""
        mock = Mock()
        mock.return_value = dspy.Prediction(
            search_terms=["cairo", "contract", "storage"],
            resources=["cairo_book", "starknet_docs"]
        )
        dspy.configure(lm=mock)
        return mock

    def test_query_processing(self, mock_lm):
        """Test query processor extracts correct search terms."""
        processor = QueryProcessor(retrieval_program)
        result = processor(
            query="How do I define storage in a Cairo contract?",
            chat_history=""
        )

        assert result.is_contract_related == True
        assert "cairo_book" in [r.value for r in result.resources]
        assert len(result.transformed) > 0

class TestDocumentRetriever:
    @pytest.mark.asyncio
    async def test_document_ranking(self):
        """Test document reranking by similarity."""
        # Mock vector store
        mock_store = Mock()
        mock_store.similarity_search.return_value = [
            Document(page_content="Cairo storage guide", metadata={"score": 0.9}),
            Document(page_content="Irrelevant content", metadata={"score": 0.3})
        ]

        config = RagSearchConfig(
            name="test",
            vector_store=mock_store,
            similarity_threshold=0.5
        )

        retriever = DocumentRetriever(config)
        # Test retrieval and ranking
        # ...
```

**Testing with DSPy Assertions**:
```python
def test_generation_quality():
    """Test generation produces valid Cairo code."""
    # Create test examples
    examples = [
        dspy.Example(
            query="Write a simple Cairo contract",
            context="Cairo contracts use #[contract] attribute...",
            answer="#[contract]\nmod SimpleContract {\n    ..."
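        # with_inputs() marks which Example fields are inputs; the
        # remaining fields (here `answer`) are treated as reference labels.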
+ ).with_inputs("query", "context") + ] + + # Use DSPy's evaluation tools + evaluator = dspy.Evaluate( + devset=examples, + metric=cairo_code_validity_metric + ) + + score = evaluator(generation_program) + assert score > 0.8 # 80% accuracy threshold +``` + +### Integration Testing + +**End-to-End Pipeline Test**: +```python +@pytest.mark.integration +class TestRagPipeline: + async def test_full_pipeline_flow(self): + """Test complete RAG pipeline execution.""" + # Configure test environment + dspy.configure(lm=dspy.LM("openai/gpt-3.5-turbo", api_key="test")) + + # Create pipeline with test config + config = RagSearchConfig( + name="test_agent", + vector_store=test_vector_store, + retrieval_program=retrieval_program, + generation_program=generation_program + ) + + pipeline = RagPipeline(config) + + # Execute pipeline + events = [] + async for event in pipeline.forward( + query="How to create a Cairo contract?", + chat_history=[] + ): + events.append(event) + + # Verify event sequence + assert events[0].type == "sources" + assert any(e.type == "response" for e in events) + assert events[-1].type == "end" +``` + +### Performance Testing with DSPy + +**Optimization and Benchmarking**: +```python +class PerformanceTests: + def test_pipeline_optimization(self): + """Test and optimize pipeline performance.""" + # Create training set for optimization + trainset = load_cairo_training_examples() + + # Optimize with MIPROv2 + optimizer = dspy.MIPROv2( + metric=cairo_accuracy_metric, + auto="light" # Fast optimization for testing + ) + + # Measure optimization time + start_time = time.time() + optimized = optimizer.compile( + pipeline, + trainset=trainset[:50] # Subset for testing + ) + optimization_time = time.time() - start_time + + assert optimization_time < 300 # Should complete within 5 minutes + + # Benchmark optimized vs unoptimized + unopt_score = evaluate_pipeline(pipeline, testset) + opt_score = evaluate_pipeline(optimized, testset) + + assert opt_score > unopt_score # Optimization should improve performance + + @pytest.mark.benchmark + def test_request_throughput(self, benchmark): + """Benchmark request processing throughput.""" + pipeline = create_test_pipeline() + + async def process_request(): + async for _ in pipeline.forward( + query="Simple Cairo query", + chat_history=[] + ): + pass + + # Run benchmark + result = benchmark(asyncio.run, process_request) + + # Assert performance requirements + assert result.stats['mean'] < 2.0 # Average < 2 seconds +``` + +### Mock Strategies for DSPy + +```python +class MockDSPyLM: + """Mock LM for testing without API calls.""" + + def __init__(self, responses: Dict[str, Any]): + self.responses = responses + self.call_count = 0 + + def __call__(self, prompt: str, **kwargs): + self.call_count += 1 + # Return predetermined responses based on prompt content + for key, response in self.responses.items(): + if key in prompt: + return dspy.Prediction(**response) + return dspy.Prediction(answer="Default response") + +# Usage in tests +def test_with_mock_lm(): + mock_lm = MockDSPyLM({ + "storage": {"search_terms": ["storage", "variable"], "resources": ["cairo_book"]}, + "contract": {"answer": "#[contract]\nmod Example {...}"} + }) + + dspy.configure(lm=mock_lm) + # Run tests... 
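    # For example (illustrative only): the mock routes on prompt content,
    # so a prompt mentioning "storage" returns the canned retrieval fields
    # and one mentioning "contract" returns the canned generation answer.
    pred = mock_lm("How do I declare a storage variable?")
    assert pred.search_terms == ["storage", "variable"]
    pred = mock_lm("Write a minimal contract")
    assert pred.answer.startswith("#[contract]")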
```
\ No newline at end of file
diff --git a/.kiro/specs/agents-python-port/requirements.md b/.kiro/specs/agents-python-port/requirements.md
new file mode 100644
index 00000000..0bb3d89f
--- /dev/null
+++ b/.kiro/specs/agents-python-port/requirements.md
@@ -0,0 +1,341 @@
# Requirements Document

## Introduction

This document outlines the requirements for porting the Cairo Coder agents package from TypeScript to Python while maintaining compatibility with the existing backend and ingester components. The agents package implements a Retrieval-Augmented Generation (RAG) system specifically designed for Cairo programming language assistance, featuring multi-step AI workflows for query processing, document retrieval, and answer generation.

## Requirements

### Requirement 1: Microservice Communication Interface

**User Story:** As a backend developer, I want the Python agents to run as a separate microservice that communicates with the TypeScript backend, so that I can leverage Python's AI ecosystem while maintaining the existing backend architecture.

#### Acceptance Criteria

1. WHEN the backend needs agent processing THEN it SHALL communicate with the Python microservice via HTTP/WebSocket API
2. WHEN the Python service processes a request THEN it SHALL stream responses back to the TypeScript backend in real-time
3. WHEN the agent processes a request THEN it SHALL send events with the same structure: `{'type': 'sources', 'data': documents}` and `{'type': 'response', 'data': content}`
4. WHEN the agent completes processing THEN it SHALL send an 'end' event
5. WHEN an error occurs THEN the agent SHALL send an 'error' event with error details
6. WHEN the TypeScript backend receives events THEN it SHALL convert them to EventEmitter events for backward compatibility

### Requirement 2: RAG Pipeline Implementation

**User Story:** As a system architect, I want the Python implementation to maintain the same RAG pipeline structure, so that the system behavior remains consistent.

#### Acceptance Criteria

1. WHEN a query is received THEN the system SHALL execute a three-stage pipeline: Query Processing → Document Retrieval → Answer Generation
2. WHEN processing a query THEN the system SHALL use the QueryProcessorProgram to transform the original query into search terms and identify relevant resources
3. WHEN retrieving documents THEN the system SHALL use the DocumentRetrieverProgram to fetch, rerank, and filter documents based on similarity thresholds
4. WHEN generating responses THEN the system SHALL use context from retrieved documents to generate Cairo-specific code solutions
5. WHEN in MCP mode THEN the system SHALL return raw document content instead of generated responses

### Requirement 3: Agent Configuration System

**User Story:** As a system administrator, I want to configure different agents with specific capabilities, so that I can provide specialized assistance for different use cases.

#### Acceptance Criteria

1. WHEN an agent is requested by ID THEN the system SHALL load the corresponding configuration including sources, templates, and parameters
2. WHEN no agent ID is provided THEN the system SHALL use the default 'cairo-coder' agent configuration
3. WHEN configuring an agent THEN the system SHALL support specifying document sources (cairo_book, starknet_docs, etc.), similarity thresholds, and maximum source counts
4. WHEN using agent templates THEN the system SHALL support contract and test templates for context enhancement
5. 
WHEN multiple agents are defined THEN the system SHALL support agent-specific retrieval and generation programs + +### Requirement 4: Vector Store Integration + +**User Story:** As a developer, I want the Python agents to integrate with the existing PostgreSQL vector store, so that document retrieval remains consistent. + +#### Acceptance Criteria + +1. WHEN performing similarity search THEN the system SHALL query the PostgreSQL vector store using the same table structure and indices +2. WHEN filtering by document sources THEN the system SHALL support filtering by DocumentSource enum values +3. WHEN computing embeddings THEN the system SHALL use the same embedding model (OpenAI text-embedding-3-large) for consistency +4. WHEN reranking documents THEN the system SHALL compute cosine similarity and filter by configurable thresholds +5. WHEN handling database errors THEN the system SHALL provide appropriate error handling and logging + +### Requirement 5: DSPy Framework Integration + +**User Story:** As an AI developer, I want the Python implementation to use the DSPy framework for structured AI programming, so that I can build modular and optimizable AI components instead of managing brittle prompt strings. + +#### Acceptance Criteria + +1. WHEN implementing AI components THEN the system SHALL use DSPy modules (Predict, ChainOfThought, ProgramOfThought) with structured signatures +2. WHEN defining signatures THEN the system SHALL use `dspy.Signature` classes with `InputField` and `OutputField` specifications: + ```python + class QueryTransformation(dspy.Signature): + """Transform a user query into search terms and identify relevant documentation sources.""" + chat_history = dspy.InputField(desc="Previous conversation context") + query = dspy.InputField(desc="User's Cairo programming question") + search_terms = dspy.OutputField(desc="List of search terms for retrieval") + resources = dspy.OutputField(desc="List of relevant documentation sources") + ``` +3. WHEN composing AI workflows THEN the system SHALL use `dspy.Module` base class and chain DSPy modules: + ```python + class RagPipeline(dspy.Module): + def __init__(self, config): + super().__init__() + self.query_processor = dspy.ChainOfThought(QueryTransformation) + self.document_retriever = DocumentRetriever(config) + self.answer_generator = dspy.ChainOfThought(AnswerGeneration) + + def forward(self, query, history): + # Chain modules together + processed = self.query_processor(query=query, chat_history=history) + docs = self.document_retriever(processed_query=processed, sources=processed.resources) + answer = self.answer_generator(query=query, context=docs, chat_history=history) + return answer + ``` +4. WHEN optimizing performance THEN the system SHALL support DSPy teleprompters (optimizers): + ```python + # Use MIPROv2 for automatic prompt optimization + optimizer = dspy.MIPROv2(metric=cairo_accuracy_metric, auto="medium") + optimized_pipeline = optimizer.compile( + program=rag_pipeline, + trainset=cairo_examples, + requires_permission_to_run=False + ) + + # Or use BootstrapFewShot for simpler optimization + optimizer = dspy.BootstrapFewShot(metric=cairo_accuracy_metric, max_bootstrapped_demos=4) + optimized_pipeline = optimizer.compile(rag_pipeline, trainset=cairo_examples) + ``` +5. 
WHEN saving/loading programs THEN the system SHALL use DSPy's serialization: + ```python + # Save optimized program with learned prompts and demonstrations + optimized_pipeline.save("optimized_cairo_rag.json") + + # Load for inference + pipeline = dspy.load("optimized_cairo_rag.json") + ``` + +### Requirement 6: Ax-to-DSPy Program Mapping + +**User Story:** As a system architect, I want each Ax Program from the TypeScript implementation to map 1-to-1 to a DSPy module, so that the AI workflow logic remains equivalent between implementations. + +#### Acceptance Criteria + +1. WHEN implementing QueryProcessorProgram THEN it SHALL map to a DSPy module using ChainOfThought: + ```python + class QueryProcessor(dspy.Module): + def __init__(self, retrieval_program): + super().__init__() + self.retrieval_program = retrieval_program + + def forward(self, chat_history: str, query: str) -> ProcessedQuery: + # Use the retrieval program (mapped from retrieval.program.ts) + result = self.retrieval_program(chat_history=chat_history, query=query) + + # Build ProcessedQuery matching TypeScript structure + return ProcessedQuery( + original=query, + transformed=result.search_terms, + is_contract_related=self._check_contract_related(query), + is_test_related=self._check_test_related(query), + resources=self._validate_resources(result.resources) + ) + ``` + +2. WHEN implementing DocumentRetrieverProgram THEN it SHALL map to a DSPy module maintaining the three-step process: + ```python + class DocumentRetriever(dspy.Module): + def __init__(self, config: RagSearchConfig): + super().__init__() + self.config = config + self.vector_store = config.vector_store + self.embedder = dspy.Embedder(model="text-embedding-3-large") + + async def forward(self, processed_query: ProcessedQuery, sources: List[DocumentSource]): + # Step 1: Fetch documents (maps to fetchDocuments) + docs = await self.vector_store.similarity_search( + query=processed_query.original, + k=self.config.max_source_count, + sources=sources + ) + + # Step 2: Rerank documents (maps to rerankDocuments) + query_embedding = await self.embedder.embed([processed_query.original]) + ranked_docs = self._rerank_by_similarity(docs, query_embedding[0]) + + # Step 3: Attach sources (maps to attachSources) + return self._attach_metadata(ranked_docs) + ``` + +3. WHEN implementing GenerationProgram THEN it SHALL use DSPy's ChainOfThought with reasoning: + ```python + class CairoGeneration(dspy.Signature): + """Generate Cairo smart contract code based on context and query.""" + chat_history = dspy.InputField(desc="Previous conversation context") + query = dspy.InputField(desc="User's Cairo programming question") + context = dspy.InputField(desc="Retrieved documentation and examples") + answer = dspy.OutputField(desc="Cairo code solution with explanation") + + # Maps to generation.program.ts + generation_program = dspy.ChainOfThought( + CairoGeneration, + rationale_field=dspy.OutputField( + prefix="Reasoning: Let me analyze the Cairo requirements step by step." + ) + ) + ``` + +4. 
WHEN implementing specialized Scarb programs THEN they SHALL use domain-specific signatures: + ```python + class ScarbRetrieval(dspy.Signature): + """Extract search terms for Scarb build tool queries.""" + chat_history = dspy.InputField(desc="optional", default="") + query = dspy.InputField() + search_terms = dspy.OutputField(desc="Scarb-specific search terms") + resources = dspy.OutputField(desc="Always includes 'scarb_docs'") + + class ScarbGeneration(dspy.Signature): + """Generate Scarb configuration and command guidance.""" + chat_history = dspy.InputField() + query = dspy.InputField() + context = dspy.InputField(desc="Scarb documentation context") + answer = dspy.OutputField(desc="Scarb commands, TOML configs, or troubleshooting") + ``` + +5. WHEN loading optimized configurations THEN the system SHALL support JSON demos: + ```python + # Load TypeScript-generated optimization data + if os.path.exists("demos/generation_demos.json"): + with open("demos/generation_demos.json") as f: + demos = json.load(f) + generation_program.demos = [dspy.Example(**demo) for demo in demos] + ``` + +### Requirement 7: LLM Provider Integration + +**User Story:** As a system integrator, I want the Python implementation to support the same LLM providers and models through DSPy's LM interface, so that response quality remains consistent. + +#### Acceptance Criteria + +1. WHEN configuring LLM providers THEN the system SHALL use DSPy's unified LM interface: + ```python + # Configure different providers + openai_lm = dspy.LM(model="openai/gpt-4o", api_key=config.openai_key) + anthropic_lm = dspy.LM(model="anthropic/claude-3-5-sonnet", api_key=config.anthropic_key) + gemini_lm = dspy.LM(model="google/gemini-1.5-pro", api_key=config.gemini_key) + + # Set default LM for all DSPy modules + dspy.configure(lm=openai_lm) + ``` + +2. WHEN implementing model routing THEN the system SHALL support provider selection: + ```python + class LLMRouter: + def __init__(self, config: Config): + self.providers = { + "openai": dspy.LM(model=config.openai_model, api_key=config.openai_key), + "anthropic": dspy.LM(model=config.anthropic_model, api_key=config.anthropic_key), + "gemini": dspy.LM(model=config.gemini_model, api_key=config.gemini_key) + } + self.default_provider = config.default_provider + + def get_lm(self, provider: Optional[str] = None) -> dspy.LM: + provider = provider or self.default_provider + return self.providers.get(provider, self.providers[self.default_provider]) + ``` + +3. WHEN streaming responses THEN the system SHALL use DSPy's streaming capabilities: + ```python + from dspy.utils import streamify + + async def stream_generation(pipeline: dspy.Module, query: str, history: List[Message]): + # Enable streaming for the pipeline + streaming_pipeline = streamify(pipeline) + + async for chunk in streaming_pipeline(query=query, history=history): + yield {"type": "response", "data": chunk} + ``` + +4. WHEN tracking usage THEN the system SHALL leverage DSPy's built-in tracking: + ```python + # DSPy automatically tracks usage for each LM call + response = pipeline(query=query, history=history) + + # Access usage information + usage_info = dspy.inspect_history(n=1) + tokens_used = usage_info[-1].get("usage", {}).get("total_tokens", 0) + + # Log usage for monitoring + logger.info(f"Tokens used: {tokens_used}") + ``` + +5. 
WHEN handling errors THEN the system SHALL use DSPy's error handling: + ```python + try: + response = pipeline(query=query, history=history) + except dspy.errors.LMError as e: + # Handle LLM-specific errors (rate limits, API failures) + logger.error(f"LLM error: {e}") + + # Retry with exponential backoff (built into DSPy) + response = pipeline.forward_with_retry( + query=query, + history=history, + max_retries=3 + ) + ``` + +### Requirement 8: Cairo-Specific Intelligence + +**User Story:** As a Cairo developer, I want the agents to provide accurate Cairo programming assistance, so that I can get relevant help for my coding tasks. + +#### Acceptance Criteria + +1. WHEN processing Cairo queries THEN the system SHALL identify contract-related and test-related queries for specialized handling +2. WHEN generating code THEN the system SHALL produce syntactically correct Cairo code following language conventions +3. WHEN using templates THEN the system SHALL apply contract and test templates to enhance context for specific query types +4. WHEN handling non-Cairo queries THEN the system SHALL respond with appropriate redirection messages +5. WHEN providing examples THEN the system SHALL include proper imports, interface definitions, and implementation patterns + +### Requirement 9: Event-Driven Architecture + +**User Story:** As a backend developer, I want the Python agents to maintain the same event-driven pattern, so that streaming responses work correctly. + +#### Acceptance Criteria + +1. WHEN processing requests THEN the system SHALL emit events asynchronously to allow for streaming responses +2. WHEN sources are retrieved THEN the system SHALL emit a 'sources' event before generating responses +3. WHEN generating responses THEN the system SHALL emit incremental 'response' events for streaming +4. WHEN processing completes THEN the system SHALL emit an 'end' event to signal completion +5. WHEN errors occur THEN the system SHALL emit 'error' events with descriptive error messages + +### Requirement 10: Configuration Management + +**User Story:** As a system administrator, I want the Python implementation to use the same configuration system, so that deployment and management remain consistent. + +#### Acceptance Criteria + +1. WHEN loading configuration THEN the system SHALL read from the same TOML configuration files +2. WHEN accessing API keys THEN the system SHALL support the same environment variable and configuration file structure +3. WHEN configuring providers THEN the system SHALL support the same provider selection and model mapping logic +4. WHEN setting parameters THEN the system SHALL support the same similarity thresholds, source counts, and other tunable parameters +5. WHEN handling missing configuration THEN the system SHALL provide appropriate defaults and error messages + +### Requirement 11: Logging and Observability + +**User Story:** As a system operator, I want the Python implementation to provide the same logging and monitoring capabilities, so that I can troubleshoot issues effectively. + +#### Acceptance Criteria + +1. WHEN processing requests THEN the system SHALL log query processing steps with appropriate detail levels +2. WHEN tracking performance THEN the system SHALL log token usage, response times, and document retrieval metrics +3. WHEN errors occur THEN the system SHALL log detailed error information including stack traces and context +4. WHEN debugging THEN the system SHALL support debug-level logging for detailed pipeline execution traces +5. 
WHEN monitoring THEN the system SHALL provide metrics compatible with existing monitoring infrastructure

### Requirement 12: Testing and Quality Assurance

**User Story:** As a quality assurance engineer, I want comprehensive testing capabilities, so that I can ensure the Python port maintains the same quality and behavior.

#### Acceptance Criteria

1. WHEN running unit tests THEN the system SHALL provide test coverage for all major components and workflows
2. WHEN testing agent behavior THEN the system SHALL support mocking of LLM providers and vector stores
3. WHEN validating responses THEN the system SHALL include tests for Cairo code generation quality and accuracy
4. WHEN testing error handling THEN the system SHALL verify appropriate error responses for various failure scenarios
5. WHEN performing integration tests THEN the system SHALL validate end-to-end workflows with real or mock dependencies
diff --git a/.kiro/specs/agents-python-port/tasks.md b/.kiro/specs/agents-python-port/tasks.md
new file mode 100644
index 00000000..d27613c8
--- /dev/null
+++ b/.kiro/specs/agents-python-port/tasks.md
@@ -0,0 +1,142 @@
# Implementation Plan

- [ ] 1. Set up Python project structure and core dependencies
  - Create Python package structure with proper module organization
  - Set up pyproject.toml with DSPy, FastAPI, asyncpg, and other core dependencies
  - Use `uv` as the package manager and build system
  - Use context7 if you need to understand how `uv` works
  - Configure development environment with linting, formatting, and testing tools
  - _Requirements: 1.1, 10.1_

- [ ] 2. Implement core data models and type definitions
  - Create Pydantic models for Message, ProcessedQuery, Document, RagInput, StreamEvent
  - Implement DocumentSource enum with all source types
  - Define RagSearchConfig and AgentConfiguration dataclasses
  - Add type hints and validation for all data structures
  - _Requirements: 1.3, 6.1_

- [ ] 3. Create configuration management system
  - Implement ConfigManager class to load TOML configuration files
  - Add environment variable support for API keys and database credentials
  - Create agent configuration loading with fallback to defaults
  - Add configuration validation and error handling
  - _Requirements: 10.1, 10.2, 10.5_

- [ ] 4. Implement PostgreSQL vector store integration
  - Create VectorStore class with asyncpg connection pooling
  - Implement similarity_search method with vector cosine similarity
  - Add document insertion and batch processing capabilities
  - Implement source filtering and metadata handling
  - Add database error handling and connection management
  - _Requirements: 4.1, 4.2, 4.3, 4.4_

- [ ] 5. Create LLM provider router and integration
  - Implement LLMRouter class supporting OpenAI, Anthropic, and Google Gemini
  - Add model selection logic based on configuration
  - Implement streaming response support for real-time generation
  - Add token tracking and usage monitoring
  - Implement retry logic and error handling for provider failures
  - _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5_

- [ ] 6. 
Implement DSPy QueryProcessorProgram + - Create QueryProcessorProgram as DSPy Module mapping from TypeScript version + - Define DSPy signature: "chat_history?, query -> search_terms, resources" + - Implement forward method to process queries and extract search terms + - Add Cairo/Starknet-specific query analysis logic + - Include few-shot examples for query processing optimization + - _Requirements: 2.2, 5.1, 6.1, 8.1_ + +- [ ] 7. Implement DSPy DocumentRetrieverProgram + - Create DocumentRetrieverProgram as DSPy Module for document retrieval + - Implement document fetching with multiple search terms + - Add document reranking using embedding similarity + - Implement source filtering and deduplication logic + - Add similarity threshold filtering and result limiting + - _Requirements: 2.3, 4.4, 6.2_ + +- [ ] 8. Implement DSPy GenerationProgram + - Create GenerationProgram using DSPy ChainOfThought for Cairo code generation + - Define signature: "chat_history?, query, context -> answer" + - Add Cairo-specific code generation instructions and examples + - Implement contract and test template integration + - Add streaming response support for incremental generation + - _Requirements: 2.4, 5.2, 6.3, 8.2, 8.3_ + +- [ ] 9. Create RAG Pipeline orchestration + - Implement RagPipeline class to orchestrate DSPy programs + - Add three-stage workflow: Query Processing → Document Retrieval → Generation + - Implement MCP mode for raw document return + - Add context building and template application logic + - Implement streaming event emission for real-time updates + - _Requirements: 2.1, 2.5, 9.1, 9.2, 9.3_ + +- [ ] 10. Implement Agent Factory + - Create AgentFactory class with static methods for agent creation + - Implement create_agent method for default agent configuration + - Add create_agent_by_id method for agent-specific configurations + - Load agent configurations and initialize RAG pipelines + - Add agent validation and error handling + - _Requirements: 3.1, 3.2, 3.3, 3.4_ + +- [ ] 11. Create FastAPI microservice server + - Set up FastAPI application with WebSocket support + - Implement /agents/process endpoint for agent requests + - Add request validation using Pydantic models + - Implement streaming response handling via WebSocket + - Add health check endpoints for monitoring + - _Requirements: 1.1, 1.2, 1.6_ + +- [ ] 12. Implement TypeScript backend integration layer + - Create Agent Factory Proxy in TypeScript to communicate with Python service + - Implement HTTP/WebSocket client for Python microservice communication + - Add EventEmitter adapter to convert streaming responses to events + - Modify existing chatCompletionHandler to use proxy instead of direct agent calls + - Maintain backward compatibility with existing API + - _Requirements: 1.1, 1.2, 1.6, 9.4_ + +- [ ] 13. Add comprehensive error handling and logging + - Implement structured error responses with appropriate HTTP status codes + - Add comprehensive logging for all pipeline stages + - Implement token usage tracking and performance metrics + - Add debug-level logging for troubleshooting + - Create error recovery mechanisms for transient failures + - _Requirements: 11.1, 11.2, 11.3, 11.4_ + +- [ ] 14. 
Create specialized agent implementations
  - Implement Scarb Assistant agent with specialized retrieval and generation programs
  - Add agent-specific DSPy program configurations
  - Create agent templates for contract and test scenarios
  - Add agent parameter customization (similarity thresholds, source counts)
  - _Requirements: 3.3, 3.4, 6.4_

- [ ] 15. Implement comprehensive test suite
  - Create unit tests for all DSPy programs with mocked LLM responses
  - Add integration tests for complete RAG pipeline workflows
  - Implement API endpoint tests for FastAPI server
  - Create database integration tests with test PostgreSQL instance
  - Add performance tests for throughput and latency measurement
  - _Requirements: 12.1, 12.2, 12.3, 12.4, 12.5_

- [ ] 16. Add DSPy optimization and fine-tuning
  - Implement DSPy optimizers (BootstrapRS, MIPROv2) for program improvement
  - Create training datasets for few-shot learning optimization
  - Add program compilation and optimization workflows
  - Implement evaluation metrics for program performance
  - Add automated optimization pipelines
  - _Requirements: 5.4, 5.5_

- [ ] 17. Create deployment configuration and documentation
  - Create Dockerfile for Python microservice containerization
  - Add docker-compose configuration for local development
  - Create deployment documentation with environment variable setup
  - Add API documentation with OpenAPI/Swagger integration
  - Create migration guide from TypeScript to Python implementation
  - _Requirements: 10.3, 10.4_

- [ ] 18. Implement monitoring and observability
  - Add Prometheus metrics for request counts, latencies, and error rates
  - Implement distributed tracing for request flow monitoring
  - Add health check endpoints for service monitoring
  - Create alerting configuration for critical failures
  - Add performance dashboards for system monitoring
  - _Requirements: 11.5_
\ No newline at end of file
diff --git a/design.md b/design.md
new file mode 100644
index 00000000..d82a1d00
--- /dev/null
+++ b/design.md
@@ -0,0 +1,807 @@
# Design Document

## Overview

This document describes the design for porting the Cairo Coder agents package from TypeScript to Python using the DSPy framework. The design maintains the same RAG pipeline architecture while leveraging Python's AI ecosystem through a microservice approach that communicates with the existing TypeScript backend. 
+ +## Architecture + +### High-Level Architecture + +```mermaid +graph TB + subgraph "TypeScript Backend" + A[Chat Completion Handler] --> B[Agent Factory Proxy] + B --> C[HTTP/WebSocket Client] + C --> D[Event Emitter Adapter] + end + + subgraph "Python Microservice" + E[FastAPI Server] --> F[Agent Factory] + F --> G[RAG Pipeline] + G --> H[Query Processor] + G --> I[Document Retriever] + G --> J[Response Generator] + end + + subgraph "Shared Infrastructure" + K[PostgreSQL Vector Store] + L[LLM Providers] + M[Configuration Files] + end + + C <--> E + I --> K + H --> L + J --> L + F --> M +``` + +### Communication Flow + +```mermaid +sequenceDiagram + participant TS as TypeScript Backend + participant PY as Python Microservice + participant VS as Vector Store + participant LLM as LLM Provider + + TS->>PY: POST /agents/process (query, history, agentId, mcpMode) + PY->>PY: Load Agent Configuration + PY->>LLM: Process Query (DSPy QueryProcessor) + PY->>VS: Similarity Search + PY->>PY: Rerank Documents + PY-->>TS: Stream: {"type": "sources", "data": [...]} + + alt MCP Mode + PY-->>TS: Stream: {"type": "response", "data": "raw_documents"} + else Normal Mode + PY->>LLM: Generate Response (DSPy Generator) + loop Streaming Response + PY-->>TS: Stream: {"type": "response", "data": "chunk"} + end + end + + PY-->>TS: Stream: {"type": "end"} +``` +## Components and Interfaces + +### 1. FastAPI Microservice Server + +**Purpose**: HTTP/WebSocket server that handles requests from TypeScript backend + +**Interface**: +```python +class AgentServer: + async def process_agent_request( + self, + query: str, + chat_history: List[Message], + agent_id: Optional[str] = None, + mcp_mode: bool = False + ) -> AsyncGenerator[Dict[str, Any], None] +``` + +**Key Features**: +- WebSocket support for real-time streaming +- Request validation and error handling +- CORS configuration for cross-origin requests +- Health check endpoints + +### 2. Agent Factory + +**Purpose**: Creates and configures agents based on agent ID or default configuration + +**Interface**: +```python +class AgentFactory: + @staticmethod + def create_agent( + query: str, + history: List[Message], + vector_store: VectorStore, + mcp_mode: bool = False + ) -> RagPipeline + + @staticmethod + async def create_agent_by_id( + query: str, + history: List[Message], + agent_id: str, + vector_store: VectorStore, + mcp_mode: bool = False + ) -> RagPipeline +``` + +### 3. 
RAG Pipeline (DSPy-based) + +**Purpose**: Orchestrates the three-stage RAG workflow using DSPy modules + +**Interface**: +```python +class RagPipeline(dspy.Module): + """Main pipeline that chains query processing, retrieval, and generation.""" + + def __init__(self, config: RagSearchConfig): + super().__init__() + self.config = config + + # Initialize DSPy modules for each stage + self.query_processor = QueryProcessor(config.retrieval_program) + self.document_retriever = DocumentRetriever(config) + self.response_generator = config.generation_program + + async def forward( + self, + query: str, + chat_history: List[Message], + mcp_mode: bool = False + ) -> AsyncGenerator[StreamEvent, None]: + """Execute the RAG pipeline with streaming support.""" + + # Stage 1: Process query + processed_query = self.query_processor( + query=query, + chat_history=self._format_history(chat_history) + ) + + # Stage 2: Retrieve documents + documents = await self.document_retriever( + processed_query=processed_query, + sources=self.config.sources + ) + + # Emit sources event + yield StreamEvent(type="sources", data=documents) + + if mcp_mode: + # Return raw documents in MCP mode + yield StreamEvent(type="response", data=self._format_documents(documents)) + else: + # Stage 3: Generate response + context = self._prepare_context(documents) + response = self.response_generator( + query=query, + chat_history=self._format_history(chat_history), + context=context + ) + + # Stream response chunks + for chunk in self._chunk_response(response.answer): + yield StreamEvent(type="response", data=chunk) + + yield StreamEvent(type="end", data=None) +``` +### 4. DSPy Program Mappings + +#### Query Processing Components + +**Retrieval Signature** (maps from retrieval.program.ts): +```python +class CairoQueryAnalysis(dspy.Signature): + """Analyze a Cairo programming query to extract search terms and identify relevant documentation sources.""" + + chat_history = dspy.InputField( + desc="Previous conversation context, may be empty", + default="" + ) + query = dspy.InputField( + desc="User's Cairo/Starknet programming question" + ) + search_terms = dspy.OutputField( + desc="List of specific search terms to find relevant documentation" + ) + resources = dspy.OutputField( + desc="List of documentation sources from: cairo_book, starknet_docs, starknet_foundry, cairo_by_example, openzeppelin_docs, corelib_docs, scarb_docs" + ) + +# Create the retrieval program +retrieval_program = dspy.ChainOfThought(CairoQueryAnalysis) +``` + +**QueryProcessor Module** (maps from queryProcessor.program.ts): +```python +class QueryProcessor(dspy.Module): + """Processes user queries into structured format for retrieval.""" + + def __init__(self, retrieval_program: dspy.Module): + super().__init__() + self.retrieval_program = retrieval_program + + def forward(self, query: str, chat_history: str = "") -> ProcessedQuery: + # Execute the retrieval program + result = self.retrieval_program( + query=query, + chat_history=chat_history + ) + + # Build ProcessedQuery matching TypeScript structure + return ProcessedQuery( + original=query, + transformed=result.search_terms, + is_contract_related=self._is_contract_query(query), + is_test_related=self._is_test_query(query), + resources=self._validate_resources(result.resources) + ) + + def _is_contract_query(self, query: str) -> bool: + """Check if query is about smart contracts.""" + contract_keywords = ['contract', 'interface', 'trait', 'impl', 'storage'] + return any(kw in query.lower() for kw in 
contract_keywords) + + def _is_test_query(self, query: str) -> bool: + """Check if query is about testing.""" + test_keywords = ['test', 'testing', 'assert', 'mock', 'fixture'] + return any(kw in query.lower() for kw in test_keywords) + + def _validate_resources(self, resources: List[str]) -> List[DocumentSource]: + """Validate and convert resource strings to DocumentSource enum.""" + valid_resources = [] + for r in resources: + try: + valid_resources.append(DocumentSource(r)) + except ValueError: + continue + return valid_resources or [DocumentSource.CAIRO_BOOK] # Default fallback +``` + +#### Document Retrieval Component + +**DocumentRetriever Module** (maps from documentRetriever.program.ts): +```python +class DocumentRetriever(dspy.Module): + """Retrieves and ranks relevant documents from vector store.""" + + def __init__(self, config: RagSearchConfig): + super().__init__() + self.config = config + self.vector_store = config.vector_store + self.embedder = dspy.Embedder(model="text-embedding-3-large") + + async def forward( + self, + processed_query: ProcessedQuery, + sources: List[DocumentSource] + ) -> List[Document]: + """Three-step retrieval process: fetch, rerank, attach metadata.""" + + # Step 1: Fetch documents (maps to fetchDocuments) + docs = await self._fetch_documents(processed_query, sources) + + # Step 2: Rerank documents (maps to rerankDocuments) + if docs: + docs = await self._rerank_documents(processed_query.original, docs) + + # Step 3: Attach sources (maps to attachSources) + return self._attach_sources(docs) + + async def _fetch_documents( + self, + processed_query: ProcessedQuery, + sources: List[DocumentSource] + ) -> List[Document]: + """Fetch documents from vector store.""" + return await self.vector_store.similarity_search( + query=processed_query.original, + k=self.config.max_source_count, + sources=sources + ) + + async def _rerank_documents( + self, + query: str, + docs: List[Document] + ) -> List[Document]: + """Rerank documents by cosine similarity.""" + # Get embeddings + query_embedding = await self.embedder.embed([query]) + doc_texts = [d.page_content for d in docs] + doc_embeddings = await self.embedder.embed(doc_texts) + + # Calculate similarities + similarities = [] + for doc_emb in doc_embeddings: + similarity = self._cosine_similarity(query_embedding[0], doc_emb) + similarities.append(similarity) + + # Filter by threshold and sort + ranked_docs = [ + (doc, sim) for doc, sim in zip(docs, similarities) + if sim >= self.config.similarity_threshold + ] + ranked_docs.sort(key=lambda x: x[1], reverse=True) + + return [doc for doc, _ in ranked_docs[:self.config.max_source_count]] + + def _cosine_similarity(self, a: List[float], b: List[float]) -> float: + """Calculate cosine similarity between two vectors.""" + import numpy as np + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + + def _attach_sources(self, docs: List[Document]) -> List[Document]: + """Attach metadata like title and URL to documents.""" + for doc in docs: + # Add source metadata based on document source + source = doc.metadata.get('source', '') + doc.metadata['title'] = self._get_title(doc) + doc.metadata['url'] = self._get_url(doc) + return docs +``` + +#### Generation Components + +**Cairo Generation Signature** (maps from generation.program.ts): +```python +class CairoCodeGeneration(dspy.Signature): + """Generate Cairo smart contract code based on context and user query.""" + + chat_history = dspy.InputField( + desc="Previous conversation context for continuity" + ) + 
query = dspy.InputField(
+        desc="User's specific Cairo programming question or request"
+    )
+    context = dspy.InputField(
+        desc="Retrieved Cairo documentation, examples, and relevant information"
+    )
+    answer = dspy.OutputField(
+        desc="Complete Cairo code solution with explanations, following Cairo syntax and best practices"
+    )
+
+# Create generation program with Chain of Thought reasoning
+generation_program = dspy.ChainOfThought(
+    CairoCodeGeneration,
+    rationale_field=dspy.OutputField(
+        prefix="Reasoning: Let me analyze the Cairo requirements step by step.",
+        desc="Step-by-step analysis of the Cairo programming task"
+    )
+)
+```
+
+**Scarb-specific Programs** (maps from scarb-*.program.ts):
+```python
+class ScarbQueryAnalysis(dspy.Signature):
+    """Analyze Scarb build tool queries to extract relevant search terms."""
+
+    chat_history = dspy.InputField(desc="Previous conversation", default="")
+    query = dspy.InputField(desc="User's Scarb-related question")
+    search_terms = dspy.OutputField(
+        desc="Scarb-specific search terms (commands, configuration, dependencies)"
+    )
+    resources = dspy.OutputField(
+        desc="Always includes 'scarb_docs' as primary source"
+    )
+
+class ScarbGeneration(dspy.Signature):
+    """Generate Scarb configuration, commands, and troubleshooting guidance."""
+
+    chat_history = dspy.InputField(desc="Previous conversation")
+    query = dspy.InputField(desc="User's Scarb question")
+    context = dspy.InputField(desc="Scarb documentation and examples")
+    answer = dspy.OutputField(
+        desc="Scarb commands, TOML configurations, or troubleshooting steps with proper formatting"
+    )
+
+# Create Scarb-specific programs
+scarb_retrieval_program = dspy.ChainOfThought(ScarbQueryAnalysis)
+scarb_generation_program = dspy.ChainOfThought(ScarbGeneration)
+```
+
+#### Loading Optimized Configurations
+
+```python
+def load_optimized_programs(programs_dir: str = "optimized_programs"):
+    """Load DSPy programs with pre-optimized prompts and demonstrations."""
+
+    programs = {}
+
+    # Load each optimized program
+    for program_name in ['retrieval', 'generation', 'scarb_retrieval', 'scarb_generation']:
+        program_path = os.path.join(programs_dir, f"{program_name}.json")
+
+        if os.path.exists(program_path):
+            # Load optimized program with learned prompts and demos
+            programs[program_name] = dspy.load(program_path)
+        else:
+            # Fallback to base programs
+            if program_name == 'retrieval':
+                programs[program_name] = retrieval_program
+            elif program_name == 'generation':
+                programs[program_name] = generation_program
+            elif program_name == 'scarb_retrieval':
+                programs[program_name] = scarb_retrieval_program
+            elif program_name == 'scarb_generation':
+                programs[program_name] = scarb_generation_program
+
+    return programs
+```
+### 5. Vector Store Integration
+
+**Purpose**: Interface with PostgreSQL vector database for document retrieval
+
+**Interface**:
+```python
+class VectorStore:
+    def __init__(self, config: VectorStoreConfig):
+        self.pool = asyncpg.create_pool(...)
+        self.embedding_client = OpenAIEmbeddings()
+
+    async def similarity_search(
+        self,
+        query: str,
+        k: int = 5,
+        sources: Optional[Union[DocumentSource, List[DocumentSource]]] = None
+    ) -> List[Document]
+
+    async def add_documents(
+        self,
+        documents: List[Document],
+        ids: Optional[List[str]] = None
+    ) -> None
+```
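+
+To make the retrieval contract concrete, here is a minimal usage sketch. The construction of `VectorStoreConfig` is an assumption for illustration; the design above only fixes the search interface:
+
+```python
+# Sketch only: the VectorStoreConfig fields shown here are illustrative assumptions.
+store = VectorStore(VectorStoreConfig(
+    dsn="postgresql://cairo:cairo@localhost:5432/cairo_coder",  # assumed field
+    table_name="documents",                                     # assumed field
+))
+
+async def demo() -> None:
+    docs = await store.similarity_search(
+        query="How do I emit an event from a Starknet contract?",
+        k=5,
+        sources=[DocumentSource.STARKNET_DOCS, DocumentSource.CAIRO_BOOK],
+    )
+    for doc in docs:
+        # Titles and URLs are attached by the retriever's _attach_sources step
+        print(doc.metadata.get("title"), doc.metadata.get("url"))
+```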
+
+### 6. LLM Configuration with DSPy
+
+**Purpose**: Configure and manage multiple LLM providers through DSPy's unified interface
+
+**Implementation**:
+```python
+class LLMConfig:
+    """Manages LLM configuration for DSPy."""
+
+    @staticmethod
+    def configure_providers(config: Config) -> Dict[str, dspy.LM]:
+        """Configure all available LLM providers."""
+        providers = {}
+
+        # Configure OpenAI
+        if config.openai_api_key:
+            providers['openai'] = dspy.LM(
+                model=config.openai_model or "openai/gpt-4o",
+                api_key=config.openai_api_key,
+                temperature=config.temperature
+            )
+
+        # Configure Anthropic
+        if config.anthropic_api_key:
+            providers['anthropic'] = dspy.LM(
+                model=config.anthropic_model or "anthropic/claude-3-5-sonnet",
+                api_key=config.anthropic_api_key,
+                temperature=config.temperature
+            )
+
+        # Configure Google Gemini
+        if config.gemini_api_key:
+            providers['gemini'] = dspy.LM(
+                model=config.gemini_model or "google/gemini-1.5-pro",
+                api_key=config.gemini_api_key,
+                temperature=config.temperature
+            )
+
+        return providers
+
+    @staticmethod
+    def set_default_lm(providers: Dict[str, dspy.LM], default: str = "openai"):
+        """Set the default LM for all DSPy operations."""
+        if default in providers:
+            dspy.configure(lm=providers[default])
+        elif providers:
+            # Fallback to first available provider
+            dspy.configure(lm=next(iter(providers.values())))
+        else:
+            raise ValueError("No LLM providers configured")
+
+# Usage in initialization
+class AgentInitializer:
+    def __init__(self, config: Config):
+        # Configure LLM providers
+        self.providers = LLMConfig.configure_providers(config)
+        LLMConfig.set_default_lm(self.providers, config.default_provider)
+
+        # Configure embeddings separately if needed
+        self.embedder = dspy.Embedder(
+            model=config.embedding_model or "text-embedding-3-large",
+            api_key=config.openai_api_key  # Embeddings typically use OpenAI
+        )
+```
+
+**Streaming Support**:
+```python
+from dspy.utils import streamify
+
+class StreamingPipeline:
+    """Wrapper for streaming DSPy module responses."""
+
+    def __init__(self, module: dspy.Module):
+        self.module = module
+        self.streaming_module = streamify(module)
+
+    async def stream_response(
+        self,
+        **kwargs
+    ) -> AsyncGenerator[str, None]:
+        """Stream response chunks from the module."""
+        async for chunk in self.streaming_module(**kwargs):
+            yield chunk
+```
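+
+Wired together, the pieces above compose as follows. This is a minimal sketch: it assumes a loaded `Config` (via the ConfigManager described in the next section) and an already-built `RagSearchConfig` named `agent_config`:
+
+```python
+# Sketch: provider setup, then a streamed pipeline call.
+config = ConfigManager.load_config()          # defined in Section 7 below
+providers = LLMConfig.configure_providers(config)
+LLMConfig.set_default_lm(providers, default=config.default_provider)
+
+pipeline = RagPipeline(agent_config)          # agent_config: RagSearchConfig, assumed built elsewhere
+streaming = StreamingPipeline(pipeline)
+
+async def demo() -> None:
+    async for chunk in streaming.stream_response(
+        query="How do I store a Map in a Cairo contract?",
+        chat_history=[],
+    ):
+        print(chunk, end="")
+```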
+
+### 7. Configuration Management
+
+**Purpose**: Load and manage configuration from TOML files and environment variables
+
+**Interface**:
+```python
+class ConfigManager:
+    @staticmethod
+    def load_config() -> Config:
+        # Load from config.toml and environment variables
+        pass
+
+    @staticmethod
+    def get_agent_config(agent_id: str) -> AgentConfiguration:
+        # Load agent-specific configuration
+        pass
+```
+
+## Data Models
+
+### Core Data Structures
+
+```python
+@dataclass
+class ProcessedQuery:
+    original: str
+    transformed: Union[str, List[str]]
+    is_contract_related: bool = False
+    is_test_related: bool = False
+    resources: List[DocumentSource] = field(default_factory=list)
+
+@dataclass
+class Document:
+    page_content: str
+    metadata: Dict[str, Any]
+
+@dataclass
+class RagInput:
+    query: str
+    chat_history: List[Message]
+    sources: Union[DocumentSource, List[DocumentSource]]
+
+@dataclass
+class StreamEvent:
+    type: str  # "sources", "response", "end", "error"
+    data: Any
+
+@dataclass
+class RagSearchConfig:
+    name: str
+    vector_store: VectorStore
+    contract_template: Optional[str] = None
+    test_template: Optional[str] = None
+    max_source_count: int = 10
+    similarity_threshold: float = 0.4
+    sources: Union[DocumentSource, List[DocumentSource]] = None
+    retrieval_program: dspy.Module = None
+    generation_program: dspy.Module = None
+
+class DocumentSource(Enum):
+    CAIRO_BOOK = "cairo_book"
+    STARKNET_DOCS = "starknet_docs"
+    STARKNET_FOUNDRY = "starknet_foundry"
+    CAIRO_BY_EXAMPLE = "cairo_by_example"
+    OPENZEPPELIN_DOCS = "openzeppelin_docs"
+    CORELIB_DOCS = "corelib_docs"
+    SCARB_DOCS = "scarb_docs"
+```
+## Error Handling
+
+### Error Categories
+
+1. **Configuration Errors**: Missing API keys, invalid agent IDs
+2. **Database Errors**: Connection failures, query errors
+3. **LLM Provider Errors**: Rate limits, API failures
+4. **Validation Errors**: Invalid input parameters
+5. **Processing Errors**: Pipeline execution failures
+
+### Error Response Format
+
+```python
+@dataclass
+class ErrorResponse:
+    type: str  # "configuration_error", "database_error", etc.
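+    # Human-readable summary of the failure; forwarded to the TypeScript backend in the 'error' stream event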
+ message: str + details: Optional[Dict[str, Any]] = None + timestamp: datetime = field(default_factory=datetime.now) +``` + +## Testing Strategy + +### Unit Testing with DSPy + +**Testing DSPy Modules**: +```python +import pytest +import dspy +from unittest.mock import Mock, patch + +class TestQueryProcessor: + @pytest.fixture + def mock_lm(self): + """Configure DSPy with a mock LM for testing.""" + mock = Mock() + mock.return_value = dspy.Prediction( + search_terms=["cairo", "contract", "storage"], + resources=["cairo_book", "starknet_docs"] + ) + dspy.configure(lm=mock) + return mock + + def test_query_processing(self, mock_lm): + """Test query processor extracts correct search terms.""" + processor = QueryProcessor(retrieval_program) + result = processor( + query="How do I define storage in a Cairo contract?", + chat_history="" + ) + + assert result.is_contract_related == True + assert "cairo_book" in [r.value for r in result.resources] + assert len(result.transformed) > 0 + +class TestDocumentRetriever: + @pytest.mark.asyncio + async def test_document_ranking(self): + """Test document reranking by similarity.""" + # Mock vector store + mock_store = Mock() + mock_store.similarity_search.return_value = [ + Document(page_content="Cairo storage guide", metadata={"score": 0.9}), + Document(page_content="Irrelevant content", metadata={"score": 0.3}) + ] + + config = RagSearchConfig( + name="test", + vector_store=mock_store, + similarity_threshold=0.5 + ) + + retriever = DocumentRetriever(config) + # Test retrieval and ranking + # ... +``` + +**Testing with DSPy Assertions**: +```python +def test_generation_quality(): + """Test generation produces valid Cairo code.""" + # Create test examples + examples = [ + dspy.Example( + query="Write a simple Cairo contract", + context="Cairo contracts use #[contract] attribute...", + answer="#[contract]\nmod SimpleContract {\n ..." 
).with_inputs("query", "context")
+    ]
+
+    # Use DSPy's evaluation tools
+    evaluator = dspy.Evaluate(
+        devset=examples,
+        metric=cairo_code_validity_metric
+    )
+
+    score = evaluator(generation_program)
+    assert score > 0.8  # 80% accuracy threshold
+```
+
+### Integration Testing
+
+**End-to-End Pipeline Test**:
+```python
+@pytest.mark.integration
+class TestRagPipeline:
+    async def test_full_pipeline_flow(self):
+        """Test complete RAG pipeline execution."""
+        # Configure test environment
+        dspy.configure(lm=dspy.LM("openai/gpt-3.5-turbo", api_key="test"))
+
+        # Create pipeline with test config
+        config = RagSearchConfig(
+            name="test_agent",
+            vector_store=test_vector_store,
+            retrieval_program=retrieval_program,
+            generation_program=generation_program
+        )
+
+        pipeline = RagPipeline(config)
+
+        # Execute pipeline
+        events = []
+        async for event in pipeline.forward(
+            query="How to create a Cairo contract?",
+            chat_history=[]
+        ):
+            events.append(event)
+
+        # Verify event sequence
+        assert events[0].type == "sources"
+        assert any(e.type == "response" for e in events)
+        assert events[-1].type == "end"
+```
+
+### Performance Testing with DSPy
+
+**Optimization and Benchmarking**:
+```python
+class PerformanceTests:
+    def test_pipeline_optimization(self):
+        """Test and optimize pipeline performance."""
+        # Create training set for optimization
+        trainset = load_cairo_training_examples()
+
+        # Optimize with MIPROv2
+        optimizer = dspy.MIPROv2(
+            metric=cairo_accuracy_metric,
+            auto="light"  # Fast optimization for testing
+        )
+
+        # Measure optimization time
+        start_time = time.time()
+        optimized = optimizer.compile(
+            pipeline,
+            trainset=trainset[:50]  # Subset for testing
+        )
+        optimization_time = time.time() - start_time
+
+        assert optimization_time < 300  # Should complete within 5 minutes
+
+        # Benchmark optimized vs unoptimized
+        unopt_score = evaluate_pipeline(pipeline, testset)
+        opt_score = evaluate_pipeline(optimized, testset)
+
+        assert opt_score > unopt_score  # Optimization should improve performance
+
+    @pytest.mark.benchmark
+    def test_request_throughput(self, benchmark):
+        """Benchmark request processing throughput."""
+        pipeline = create_test_pipeline()
+
+        async def process_request():
+            async for _ in pipeline.forward(
+                query="Simple Cairo query",
+                chat_history=[]
+            ):
+                pass
+
+        # Run benchmark (wrap in a lambda so each round awaits a fresh coroutine)
+        result = benchmark(lambda: asyncio.run(process_request()))
+
+        # Assert performance requirements
+        assert result.stats['mean'] < 2.0  # Average < 2 seconds
+```
+
+### Mock Strategies for DSPy
+
+```python
+class MockDSPyLM:
+    """Mock LM for testing without API calls."""
+
+    def __init__(self, responses: Dict[str, Any]):
+        self.responses = responses
+        self.call_count = 0
+
+    def __call__(self, prompt: str, **kwargs):
+        self.call_count += 1
+        # Return predetermined responses based on prompt content
+        for key, response in self.responses.items():
+            if key in prompt:
+                return dspy.Prediction(**response)
+        return dspy.Prediction(answer="Default response")
+
+# Usage in tests
+def test_with_mock_lm():
+    mock_lm = MockDSPyLM({
+        "storage": {"search_terms": ["storage", "variable"], "resources": ["cairo_book"]},
+        "contract": {"answer": "#[contract]\nmod Example {...}"}
+    })
+
+    dspy.configure(lm=mock_lm)
+    # Run tests...
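+
+    # Possible continuation (sketch): drive the query processor through the mock
+    # and check that the canned "storage" response is validated into resources.
+    processor = QueryProcessor(retrieval_program)
+    result = processor(query="How do I declare a storage variable?", chat_history="")
+    assert DocumentSource.CAIRO_BOOK in result.resources
+    assert mock_lm.call_count >= 1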
+``` diff --git a/requirements.md b/requirements.md new file mode 100644 index 00000000..668d12f7 --- /dev/null +++ b/requirements.md @@ -0,0 +1,341 @@ +# Requirements Document + +## Introduction + +This document outlines the requirements for porting the Cairo Coder agents package from TypeScript to Python while maintaining compatibility with the existing backend and ingester components. The agents package implements a Retrieval-Augmented Generation (RAG) system specifically designed for Cairo programming language assistance, featuring multi-step AI workflows for query processing, document retrieval, and answer generation. + +## Requirements + +### Requirement 1: Microservice Communication Interface + +**User Story:** As a backend developer, I want the Python agents to run as a separate microservice that communicates with the TypeScript backend, so that I can leverage Python's AI ecosystem while maintaining the existing backend architecture. + +#### Acceptance Criteria + +1. WHEN the backend needs agent processing THEN it SHALL communicate with the Python microservice via HTTP/WebSocket API +2. WHEN the Python service processes a request THEN it SHALL stream responses back to the TypeScript backend in real-time +3. WHEN the agent processes a request THEN it SHALL send events with the same structure: `{'type': 'sources', 'data': documents}` and `{'type': 'response', 'data': content}` +4. WHEN the agent completes processing THEN it SHALL send an 'end' event +5. WHEN an error occurs THEN the agent SHALL send an 'error' event with error details +6. WHEN the TypeScript backend receives events THEN it SHALL convert them to EventEmitter events for backward compatibility + +### Requirement 2: RAG Pipeline Implementation + +**User Story:** As a system architect, I want the Python implementation to maintain the same RAG pipeline structure, so that the system behavior remains consistent. + +#### Acceptance Criteria + +1. WHEN a query is received THEN the system SHALL execute a three-stage pipeline: Query Processing → Document Retrieval → Answer Generation +2. WHEN processing a query THEN the system SHALL use the QueryProcessorProgram to transform the original query into search terms and identify relevant resources +3. WHEN retrieving documents THEN the system SHALL use the DocumentRetrieverProgram to fetch, rerank, and filter documents based on similarity thresholds +4. WHEN generating responses THEN the system SHALL use context from retrieved documents to generate Cairo-specific code solutions +5. WHEN in MCP mode THEN the system SHALL return raw document content instead of generated responses + +### Requirement 3: Agent Configuration System + +**User Story:** As a system administrator, I want to configure different agents with specific capabilities, so that I can provide specialized assistance for different use cases. + +#### Acceptance Criteria + +1. WHEN an agent is requested by ID THEN the system SHALL load the corresponding configuration including sources, templates, and parameters +2. WHEN no agent ID is provided THEN the system SHALL use the default 'cairo-coder' agent configuration +3. WHEN configuring an agent THEN the system SHALL support specifying document sources (cairo_book, starknet_docs, etc.), similarity thresholds, and maximum source counts +4. WHEN using agent templates THEN the system SHALL support contract and test templates for context enhancement +5. 
WHEN multiple agents are defined THEN the system SHALL support agent-specific retrieval and generation programs + +### Requirement 4: Vector Store Integration + +**User Story:** As a developer, I want the Python agents to integrate with the existing PostgreSQL vector store, so that document retrieval remains consistent. + +#### Acceptance Criteria + +1. WHEN performing similarity search THEN the system SHALL query the PostgreSQL vector store using the same table structure and indices +2. WHEN filtering by document sources THEN the system SHALL support filtering by DocumentSource enum values +3. WHEN computing embeddings THEN the system SHALL use the same embedding model (OpenAI text-embedding-3-large) for consistency +4. WHEN reranking documents THEN the system SHALL compute cosine similarity and filter by configurable thresholds +5. WHEN handling database errors THEN the system SHALL provide appropriate error handling and logging + +### Requirement 5: DSPy Framework Integration + +**User Story:** As an AI developer, I want the Python implementation to use the DSPy framework for structured AI programming, so that I can build modular and optimizable AI components instead of managing brittle prompt strings. + +#### Acceptance Criteria + +1. WHEN implementing AI components THEN the system SHALL use DSPy modules (Predict, ChainOfThought, ProgramOfThought) with structured signatures +2. WHEN defining signatures THEN the system SHALL use `dspy.Signature` classes with `InputField` and `OutputField` specifications: + ```python + class QueryTransformation(dspy.Signature): + """Transform a user query into search terms and identify relevant documentation sources.""" + chat_history = dspy.InputField(desc="Previous conversation context") + query = dspy.InputField(desc="User's Cairo programming question") + search_terms = dspy.OutputField(desc="List of search terms for retrieval") + resources = dspy.OutputField(desc="List of relevant documentation sources") + ``` +3. WHEN composing AI workflows THEN the system SHALL use `dspy.Module` base class and chain DSPy modules: + ```python + class RagPipeline(dspy.Module): + def __init__(self, config): + super().__init__() + self.query_processor = dspy.ChainOfThought(QueryTransformation) + self.document_retriever = DocumentRetriever(config) + self.answer_generator = dspy.ChainOfThought(AnswerGeneration) + + def forward(self, query, history): + # Chain modules together + processed = self.query_processor(query=query, chat_history=history) + docs = self.document_retriever(processed_query=processed, sources=processed.resources) + answer = self.answer_generator(query=query, context=docs, chat_history=history) + return answer + ``` +4. WHEN optimizing performance THEN the system SHALL support DSPy teleprompters (optimizers): + ```python + # Use MIPROv2 for automatic prompt optimization + optimizer = dspy.MIPROv2(metric=cairo_accuracy_metric, auto="medium") + optimized_pipeline = optimizer.compile( + program=rag_pipeline, + trainset=cairo_examples, + requires_permission_to_run=False + ) + + # Or use BootstrapFewShot for simpler optimization + optimizer = dspy.BootstrapFewShot(metric=cairo_accuracy_metric, max_bootstrapped_demos=4) + optimized_pipeline = optimizer.compile(rag_pipeline, trainset=cairo_examples) + ``` +5. 
WHEN saving/loading programs THEN the system SHALL use DSPy's serialization: + ```python + # Save optimized program with learned prompts and demonstrations + optimized_pipeline.save("optimized_cairo_rag.json") + + # Load for inference + pipeline = dspy.load("optimized_cairo_rag.json") + ``` + +### Requirement 6: Ax-to-DSPy Program Mapping + +**User Story:** As a system architect, I want each Ax Program from the TypeScript implementation to map 1-to-1 to a DSPy module, so that the AI workflow logic remains equivalent between implementations. + +#### Acceptance Criteria + +1. WHEN implementing QueryProcessorProgram THEN it SHALL map to a DSPy module using ChainOfThought: + ```python + class QueryProcessor(dspy.Module): + def __init__(self, retrieval_program): + super().__init__() + self.retrieval_program = retrieval_program + + def forward(self, chat_history: str, query: str) -> ProcessedQuery: + # Use the retrieval program (mapped from retrieval.program.ts) + result = self.retrieval_program(chat_history=chat_history, query=query) + + # Build ProcessedQuery matching TypeScript structure + return ProcessedQuery( + original=query, + transformed=result.search_terms, + is_contract_related=self._check_contract_related(query), + is_test_related=self._check_test_related(query), + resources=self._validate_resources(result.resources) + ) + ``` + +2. WHEN implementing DocumentRetrieverProgram THEN it SHALL map to a DSPy module maintaining the three-step process: + ```python + class DocumentRetriever(dspy.Module): + def __init__(self, config: RagSearchConfig): + super().__init__() + self.config = config + self.vector_store = config.vector_store + self.embedder = dspy.Embedder(model="text-embedding-3-large") + + async def forward(self, processed_query: ProcessedQuery, sources: List[DocumentSource]): + # Step 1: Fetch documents (maps to fetchDocuments) + docs = await self.vector_store.similarity_search( + query=processed_query.original, + k=self.config.max_source_count, + sources=sources + ) + + # Step 2: Rerank documents (maps to rerankDocuments) + query_embedding = await self.embedder.embed([processed_query.original]) + ranked_docs = self._rerank_by_similarity(docs, query_embedding[0]) + + # Step 3: Attach sources (maps to attachSources) + return self._attach_metadata(ranked_docs) + ``` + +3. WHEN implementing GenerationProgram THEN it SHALL use DSPy's ChainOfThought with reasoning: + ```python + class CairoGeneration(dspy.Signature): + """Generate Cairo smart contract code based on context and query.""" + chat_history = dspy.InputField(desc="Previous conversation context") + query = dspy.InputField(desc="User's Cairo programming question") + context = dspy.InputField(desc="Retrieved documentation and examples") + answer = dspy.OutputField(desc="Cairo code solution with explanation") + + # Maps to generation.program.ts + generation_program = dspy.ChainOfThought( + CairoGeneration, + rationale_field=dspy.OutputField( + prefix="Reasoning: Let me analyze the Cairo requirements step by step." + ) + ) + ``` + +4. 
WHEN implementing specialized Scarb programs THEN they SHALL use domain-specific signatures: + ```python + class ScarbRetrieval(dspy.Signature): + """Extract search terms for Scarb build tool queries.""" + chat_history = dspy.InputField(desc="optional", default="") + query = dspy.InputField() + search_terms = dspy.OutputField(desc="Scarb-specific search terms") + resources = dspy.OutputField(desc="Always includes 'scarb_docs'") + + class ScarbGeneration(dspy.Signature): + """Generate Scarb configuration and command guidance.""" + chat_history = dspy.InputField() + query = dspy.InputField() + context = dspy.InputField(desc="Scarb documentation context") + answer = dspy.OutputField(desc="Scarb commands, TOML configs, or troubleshooting") + ``` + +5. WHEN loading optimized configurations THEN the system SHALL support JSON demos: + ```python + # Load TypeScript-generated optimization data + if os.path.exists("demos/generation_demos.json"): + with open("demos/generation_demos.json") as f: + demos = json.load(f) + generation_program.demos = [dspy.Example(**demo) for demo in demos] + ``` + +### Requirement 7: LLM Provider Integration + +**User Story:** As a system integrator, I want the Python implementation to support the same LLM providers and models through DSPy's LM interface, so that response quality remains consistent. + +#### Acceptance Criteria + +1. WHEN configuring LLM providers THEN the system SHALL use DSPy's unified LM interface: + ```python + # Configure different providers + openai_lm = dspy.LM(model="openai/gpt-4o", api_key=config.openai_key) + anthropic_lm = dspy.LM(model="anthropic/claude-3-5-sonnet", api_key=config.anthropic_key) + gemini_lm = dspy.LM(model="google/gemini-1.5-pro", api_key=config.gemini_key) + + # Set default LM for all DSPy modules + dspy.configure(lm=openai_lm) + ``` + +2. WHEN implementing model routing THEN the system SHALL support provider selection: + ```python + class LLMRouter: + def __init__(self, config: Config): + self.providers = { + "openai": dspy.LM(model=config.openai_model, api_key=config.openai_key), + "anthropic": dspy.LM(model=config.anthropic_model, api_key=config.anthropic_key), + "gemini": dspy.LM(model=config.gemini_model, api_key=config.gemini_key) + } + self.default_provider = config.default_provider + + def get_lm(self, provider: Optional[str] = None) -> dspy.LM: + provider = provider or self.default_provider + return self.providers.get(provider, self.providers[self.default_provider]) + ``` + +3. WHEN streaming responses THEN the system SHALL use DSPy's streaming capabilities: + ```python + from dspy.utils import streamify + + async def stream_generation(pipeline: dspy.Module, query: str, history: List[Message]): + # Enable streaming for the pipeline + streaming_pipeline = streamify(pipeline) + + async for chunk in streaming_pipeline(query=query, history=history): + yield {"type": "response", "data": chunk} + ``` + +4. WHEN tracking usage THEN the system SHALL leverage DSPy's built-in tracking: + ```python + # DSPy automatically tracks usage for each LM call + response = pipeline(query=query, history=history) + + # Access usage information + usage_info = dspy.inspect_history(n=1) + tokens_used = usage_info[-1].get("usage", {}).get("total_tokens", 0) + + # Log usage for monitoring + logger.info(f"Tokens used: {tokens_used}") + ``` + +5. 
WHEN handling errors THEN the system SHALL use DSPy's error handling: + ```python + try: + response = pipeline(query=query, history=history) + except dspy.errors.LMError as e: + # Handle LLM-specific errors (rate limits, API failures) + logger.error(f"LLM error: {e}") + + # Retry with exponential backoff (built into DSPy) + response = pipeline.forward_with_retry( + query=query, + history=history, + max_retries=3 + ) + ``` + +### Requirement 8: Cairo-Specific Intelligence + +**User Story:** As a Cairo developer, I want the agents to provide accurate Cairo programming assistance, so that I can get relevant help for my coding tasks. + +#### Acceptance Criteria + +1. WHEN processing Cairo queries THEN the system SHALL identify contract-related and test-related queries for specialized handling +2. WHEN generating code THEN the system SHALL produce syntactically correct Cairo code following language conventions +3. WHEN using templates THEN the system SHALL apply contract and test templates to enhance context for specific query types +4. WHEN handling non-Cairo queries THEN the system SHALL respond with appropriate redirection messages +5. WHEN providing examples THEN the system SHALL include proper imports, interface definitions, and implementation patterns + +### Requirement 9: Event-Driven Architecture + +**User Story:** As a backend developer, I want the Python agents to maintain the same event-driven pattern, so that streaming responses work correctly. + +#### Acceptance Criteria + +1. WHEN processing requests THEN the system SHALL emit events asynchronously to allow for streaming responses +2. WHEN sources are retrieved THEN the system SHALL emit a 'sources' event before generating responses +3. WHEN generating responses THEN the system SHALL emit incremental 'response' events for streaming +4. WHEN processing completes THEN the system SHALL emit an 'end' event to signal completion +5. WHEN errors occur THEN the system SHALL emit 'error' events with descriptive error messages + +### Requirement 10: Configuration Management + +**User Story:** As a system administrator, I want the Python implementation to use the same configuration system, so that deployment and management remain consistent. + +#### Acceptance Criteria + +1. WHEN loading configuration THEN the system SHALL read from the same TOML configuration files +2. WHEN accessing API keys THEN the system SHALL support the same environment variable and configuration file structure +3. WHEN configuring providers THEN the system SHALL support the same provider selection and model mapping logic +4. WHEN setting parameters THEN the system SHALL support the same similarity thresholds, source counts, and other tunable parameters +5. WHEN handling missing configuration THEN the system SHALL provide appropriate defaults and error messages + +### Requirement 11: Logging and Observability + +**User Story:** As a system operator, I want the Python implementation to provide the same logging and monitoring capabilities, so that I can troubleshoot issues effectively. + +#### Acceptance Criteria + +1. WHEN processing requests THEN the system SHALL log query processing steps with appropriate detail levels +2. WHEN tracking performance THEN the system SHALL log token usage, response times, and document retrieval metrics +3. WHEN errors occur THEN the system SHALL log detailed error information including stack traces and context +4. WHEN debugging THEN the system SHALL support debug-level logging for detailed pipeline execution traces +5. 
WHEN monitoring THEN the system SHALL provide metrics compatible with existing monitoring infrastructure + +### Requirement 12: Testing and Quality Assurance + +**User Story:** As a quality assurance engineer, I want comprehensive testing capabilities, so that I can ensure the Python port maintains the same quality and behavior. + +#### Acceptance Criteria + +1. WHEN running unit tests THEN the system SHALL provide test coverage for all major components and workflows +2. WHEN testing agent behavior THEN the system SHALL support mocking of LLM providers and vector stores +3. WHEN validating responses THEN the system SHALL include tests for Cairo code generation quality and accuracy +4. WHEN testing error handling THEN the system SHALL verify appropriate error responses for various failure scenarios +5. WHEN performing integration tests THEN the system SHALL validate end-to-end workflows with real or mock dependencies From ce433a4b53b0a0af7d065acd5fc21ab169d854f7 Mon Sep 17 00:00:00 2001 From: enitrat Date: Tue, 15 Jul 2025 11:25:26 +0100 Subject: [PATCH 2/3] dspy migration --- .kiro/specs/agents-python-port/design.md | 807 ------------------ .../specs/agents-python-port/requirements.md | 341 -------- .kiro/specs/agents-python-port/tasks.md | 142 --- design.md | 807 ------------------ requirements.md | 341 -------- 5 files changed, 2438 deletions(-) delete mode 100644 .kiro/specs/agents-python-port/design.md delete mode 100644 .kiro/specs/agents-python-port/requirements.md delete mode 100644 .kiro/specs/agents-python-port/tasks.md delete mode 100644 design.md delete mode 100644 requirements.md diff --git a/.kiro/specs/agents-python-port/design.md b/.kiro/specs/agents-python-port/design.md deleted file mode 100644 index b9c8af1d..00000000 --- a/.kiro/specs/agents-python-port/design.md +++ /dev/null @@ -1,807 +0,0 @@ -# Design Document - -## Overview - -This document describes the design for porting the Cairo Coder agents package from TypeScript to Python using the DSPy framework. The design maintains the same RAG pipeline architecture while leveraging Python's AI ecosystem through a microservice approach that communicates with the existing TypeScript backend. 
- -## Architecture - -### High-Level Architecture - -```mermaid -graph TB - subgraph "TypeScript Backend" - A[Chat Completion Handler] --> B[Agent Factory Proxy] - B --> C[HTTP/WebSocket Client] - C --> D[Event Emitter Adapter] - end - - subgraph "Python Microservice" - E[FastAPI Server] --> F[Agent Factory] - F --> G[RAG Pipeline] - G --> H[Query Processor] - G --> I[Document Retriever] - G --> J[Response Generator] - end - - subgraph "Shared Infrastructure" - K[PostgreSQL Vector Store] - L[LLM Providers] - M[Configuration Files] - end - - C <--> E - I --> K - H --> L - J --> L - F --> M -``` - -### Communication Flow - -```mermaid -sequenceDiagram - participant TS as TypeScript Backend - participant PY as Python Microservice - participant VS as Vector Store - participant LLM as LLM Provider - - TS->>PY: POST /agents/process (query, history, agentId, mcpMode) - PY->>PY: Load Agent Configuration - PY->>LLM: Process Query (DSPy QueryProcessor) - PY->>VS: Similarity Search - PY->>PY: Rerank Documents - PY-->>TS: Stream: {"type": "sources", "data": [...]} - - alt MCP Mode - PY-->>TS: Stream: {"type": "response", "data": "raw_documents"} - else Normal Mode - PY->>LLM: Generate Response (DSPy Generator) - loop Streaming Response - PY-->>TS: Stream: {"type": "response", "data": "chunk"} - end - end - - PY-->>TS: Stream: {"type": "end"} -``` -## Components and Interfaces - -### 1. FastAPI Microservice Server - -**Purpose**: HTTP/WebSocket server that handles requests from TypeScript backend - -**Interface**: -```python -class AgentServer: - async def process_agent_request( - self, - query: str, - chat_history: List[Message], - agent_id: Optional[str] = None, - mcp_mode: bool = False - ) -> AsyncGenerator[Dict[str, Any], None] -``` - -**Key Features**: -- WebSocket support for real-time streaming -- Request validation and error handling -- CORS configuration for cross-origin requests -- Health check endpoints - -### 2. Agent Factory - -**Purpose**: Creates and configures agents based on agent ID or default configuration - -**Interface**: -```python -class AgentFactory: - @staticmethod - def create_agent( - query: str, - history: List[Message], - vector_store: VectorStore, - mcp_mode: bool = False - ) -> RagPipeline - - @staticmethod - async def create_agent_by_id( - query: str, - history: List[Message], - agent_id: str, - vector_store: VectorStore, - mcp_mode: bool = False - ) -> RagPipeline -``` - -### 3. 
RAG Pipeline (DSPy-based) - -**Purpose**: Orchestrates the three-stage RAG workflow using DSPy modules - -**Interface**: -```python -class RagPipeline(dspy.Module): - """Main pipeline that chains query processing, retrieval, and generation.""" - - def __init__(self, config: RagSearchConfig): - super().__init__() - self.config = config - - # Initialize DSPy modules for each stage - self.query_processor = QueryProcessor(config.retrieval_program) - self.document_retriever = DocumentRetriever(config) - self.response_generator = config.generation_program - - async def forward( - self, - query: str, - chat_history: List[Message], - mcp_mode: bool = False - ) -> AsyncGenerator[StreamEvent, None]: - """Execute the RAG pipeline with streaming support.""" - - # Stage 1: Process query - processed_query = self.query_processor( - query=query, - chat_history=self._format_history(chat_history) - ) - - # Stage 2: Retrieve documents - documents = await self.document_retriever( - processed_query=processed_query, - sources=self.config.sources - ) - - # Emit sources event - yield StreamEvent(type="sources", data=documents) - - if mcp_mode: - # Return raw documents in MCP mode - yield StreamEvent(type="response", data=self._format_documents(documents)) - else: - # Stage 3: Generate response - context = self._prepare_context(documents) - response = self.response_generator( - query=query, - chat_history=self._format_history(chat_history), - context=context - ) - - # Stream response chunks - for chunk in self._chunk_response(response.answer): - yield StreamEvent(type="response", data=chunk) - - yield StreamEvent(type="end", data=None) -``` -### 4. DSPy Program Mappings - -#### Query Processing Components - -**Retrieval Signature** (maps from retrieval.program.ts): -```python -class CairoQueryAnalysis(dspy.Signature): - """Analyze a Cairo programming query to extract search terms and identify relevant documentation sources.""" - - chat_history = dspy.InputField( - desc="Previous conversation context, may be empty", - default="" - ) - query = dspy.InputField( - desc="User's Cairo/Starknet programming question" - ) - search_terms = dspy.OutputField( - desc="List of specific search terms to find relevant documentation" - ) - resources = dspy.OutputField( - desc="List of documentation sources from: cairo_book, starknet_docs, starknet_foundry, cairo_by_example, openzeppelin_docs, corelib_docs, scarb_docs" - ) - -# Create the retrieval program -retrieval_program = dspy.ChainOfThought(CairoQueryAnalysis) -``` - -**QueryProcessor Module** (maps from queryProcessor.program.ts): -```python -class QueryProcessor(dspy.Module): - """Processes user queries into structured format for retrieval.""" - - def __init__(self, retrieval_program: dspy.Module): - super().__init__() - self.retrieval_program = retrieval_program - - def forward(self, query: str, chat_history: str = "") -> ProcessedQuery: - # Execute the retrieval program - result = self.retrieval_program( - query=query, - chat_history=chat_history - ) - - # Build ProcessedQuery matching TypeScript structure - return ProcessedQuery( - original=query, - transformed=result.search_terms, - is_contract_related=self._is_contract_query(query), - is_test_related=self._is_test_query(query), - resources=self._validate_resources(result.resources) - ) - - def _is_contract_query(self, query: str) -> bool: - """Check if query is about smart contracts.""" - contract_keywords = ['contract', 'interface', 'trait', 'impl', 'storage'] - return any(kw in query.lower() for kw in 
contract_keywords) - - def _is_test_query(self, query: str) -> bool: - """Check if query is about testing.""" - test_keywords = ['test', 'testing', 'assert', 'mock', 'fixture'] - return any(kw in query.lower() for kw in test_keywords) - - def _validate_resources(self, resources: List[str]) -> List[DocumentSource]: - """Validate and convert resource strings to DocumentSource enum.""" - valid_resources = [] - for r in resources: - try: - valid_resources.append(DocumentSource(r)) - except ValueError: - continue - return valid_resources or [DocumentSource.CAIRO_BOOK] # Default fallback -``` - -#### Document Retrieval Component - -**DocumentRetriever Module** (maps from documentRetriever.program.ts): -```python -class DocumentRetriever(dspy.Module): - """Retrieves and ranks relevant documents from vector store.""" - - def __init__(self, config: RagSearchConfig): - super().__init__() - self.config = config - self.vector_store = config.vector_store - self.embedder = dspy.Embedder(model="text-embedding-3-large") - - async def forward( - self, - processed_query: ProcessedQuery, - sources: List[DocumentSource] - ) -> List[Document]: - """Three-step retrieval process: fetch, rerank, attach metadata.""" - - # Step 1: Fetch documents (maps to fetchDocuments) - docs = await self._fetch_documents(processed_query, sources) - - # Step 2: Rerank documents (maps to rerankDocuments) - if docs: - docs = await self._rerank_documents(processed_query.original, docs) - - # Step 3: Attach sources (maps to attachSources) - return self._attach_sources(docs) - - async def _fetch_documents( - self, - processed_query: ProcessedQuery, - sources: List[DocumentSource] - ) -> List[Document]: - """Fetch documents from vector store.""" - return await self.vector_store.similarity_search( - query=processed_query.original, - k=self.config.max_source_count, - sources=sources - ) - - async def _rerank_documents( - self, - query: str, - docs: List[Document] - ) -> List[Document]: - """Rerank documents by cosine similarity.""" - # Get embeddings - query_embedding = await self.embedder.embed([query]) - doc_texts = [d.page_content for d in docs] - doc_embeddings = await self.embedder.embed(doc_texts) - - # Calculate similarities - similarities = [] - for doc_emb in doc_embeddings: - similarity = self._cosine_similarity(query_embedding[0], doc_emb) - similarities.append(similarity) - - # Filter by threshold and sort - ranked_docs = [ - (doc, sim) for doc, sim in zip(docs, similarities) - if sim >= self.config.similarity_threshold - ] - ranked_docs.sort(key=lambda x: x[1], reverse=True) - - return [doc for doc, _ in ranked_docs[:self.config.max_source_count]] - - def _cosine_similarity(self, a: List[float], b: List[float]) -> float: - """Calculate cosine similarity between two vectors.""" - import numpy as np - return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) - - def _attach_sources(self, docs: List[Document]) -> List[Document]: - """Attach metadata like title and URL to documents.""" - for doc in docs: - # Add source metadata based on document source - source = doc.metadata.get('source', '') - doc.metadata['title'] = self._get_title(doc) - doc.metadata['url'] = self._get_url(doc) - return docs -``` - -#### Generation Components - -**Cairo Generation Signature** (maps from generation.program.ts): -```python -class CairoCodeGeneration(dspy.Signature): - """Generate Cairo smart contract code based on context and user query.""" - - chat_history = dspy.InputField( - desc="Previous conversation context for continuity" - ) - 
query = dspy.InputField( - desc="User's specific Cairo programming question or request" - ) - context = dspy.InputField( - desc="Retrieved Cairo documentation, examples, and relevant information" - ) - answer = dspy.OutputField( - desc="Complete Cairo code solution with explanations, following Cairo syntax and best practices" - ) - -# Create generation program with Chain of Thought reasoning -generation_program = dspy.ChainOfThought( - CairoCodeGeneration, - rationale_field=dspy.OutputField( - prefix="Reasoning: Let me analyze the Cairo requirements step by step.", - desc="Step-by-step analysis of the Cairo programming task" - ) -) -``` - -**Scarb-specific Programs** (maps from scarb-*.program.ts): -```python -class ScarbQueryAnalysis(dspy.Signature): - """Analyze Scarb build tool queries to extract relevant search terms.""" - - chat_history = dspy.InputField(desc="Previous conversation", default="") - query = dspy.InputField(desc="User's Scarb-related question") - search_terms = dspy.OutputField( - desc="Scarb-specific search terms (commands, configuration, dependencies)" - ) - resources = dspy.OutputField( - desc="Always includes 'scarb_docs' as primary source" - ) - -class ScarbGeneration(dspy.Signature): - """Generate Scarb configuration, commands, and troubleshooting guidance.""" - - chat_history = dspy.InputField(desc="Previous conversation") - query = dspy.InputField(desc="User's Scarb question") - context = dspy.InputField(desc="Scarb documentation and examples") - answer = dspy.OutputField( - desc="Scarb commands, TOML configurations, or troubleshooting steps with proper formatting" - ) - -# Create Scarb-specific programs -scarb_retrieval_program = dspy.ChainOfThought(ScarbQueryAnalysis) -scarb_generation_program = dspy.ChainOfThought(ScarbGeneration) -``` - -#### Loading Optimized Configurations - -```python -def load_optimized_programs(programs_dir: str = "optimized_programs"): - """Load DSPy programs with pre-optimized prompts and demonstrations.""" - - programs = {} - - # Load each optimized program - for program_name in ['retrieval', 'generation', 'scarb_retrieval', 'scarb_generation']: - program_path = os.path.join(programs_dir, f"{program_name}.json") - - if os.path.exists(program_path): - # Load optimized program with learned prompts and demos - programs[program_name] = dspy.load(program_path) - else: - # Fallback to base programs - if program_name == 'retrieval': - programs[program_name] = retrieval_program - elif program_name == 'generation': - programs[program_name] = generation_program - elif program_name == 'scarb_retrieval': - programs[program_name] = scarb_retrieval_program - elif program_name == 'scarb_generation': - programs[program_name] = scarb_generation_program - - return programs -``` -### 5. Vector Store Integration - -**Purpose**: Interface with PostgreSQL vector database for document retrieval - -**Interface**: -```python -class VectorStore: - def __init__(self, config: VectorStoreConfig): - self.pool = asyncpg.create_pool(...) - self.embedding_client = OpenAIEmbeddings() - - async def similarity_search( - self, - query: str, - k: int = 5, - sources: Optional[Union[DocumentSource, List[DocumentSource]]] = None - ) -> List[Document] - - async def add_documents( - self, - documents: List[Document], - ids: Optional[List[str]] = None - ) -> None -``` - -### 6. 
LLM Configuration with DSPy - -**Purpose**: Configure and manage multiple LLM providers through DSPy's unified interface - -**Implementation**: -```python -class LLMConfig: - """Manages LLM configuration for DSPy.""" - - @staticmethod - def configure_providers(config: Config) -> Dict[str, dspy.LM]: - """Configure all available LLM providers.""" - providers = {} - - # Configure OpenAI - if config.openai_api_key: - providers['openai'] = dspy.LM( - model=config.openai_model or "openai/gpt-4o", - api_key=config.openai_api_key, - temperature=config.temperature - ) - - # Configure Anthropic - if config.anthropic_api_key: - providers['anthropic'] = dspy.LM( - model=config.anthropic_model or "anthropic/claude-3-5-sonnet", - api_key=config.anthropic_api_key, - temperature=config.temperature - ) - - # Configure Google Gemini - if config.gemini_api_key: - providers['gemini'] = dspy.LM( - model=config.gemini_model or "google/gemini-1.5-pro", - api_key=config.gemini_api_key, - temperature=config.temperature - ) - - return providers - - @staticmethod - def set_default_lm(providers: Dict[str, dspy.LM], default: str = "openai"): - """Set the default LM for all DSPy operations.""" - if default in providers: - dspy.configure(lm=providers[default]) - elif providers: - # Fallback to first available provider - dspy.configure(lm=next(iter(providers.values()))) - else: - raise ValueError("No LLM providers configured") - -# Usage in initialization -class AgentInitializer: - def __init__(self, config: Config): - # Configure LLM providers - self.providers = LLMConfig.configure_providers(config) - LLMConfig.set_default_lm(self.providers, config.default_provider) - - # Configure embeddings separately if needed - self.embedder = dspy.Embedder( - model=config.embedding_model or "text-embedding-3-large", - api_key=config.openai_api_key # Embeddings typically use OpenAI - ) -``` - -**Streaming Support**: -```python -from dspy.utils import streamify - -class StreamingPipeline: - """Wrapper for streaming DSPy module responses.""" - - def __init__(self, module: dspy.Module): - self.module = module - self.streaming_module = streamify(module) - - async def stream_response( - self, - **kwargs - ) -> AsyncGenerator[str, None]: - """Stream response chunks from the module.""" - async for chunk in self.streaming_module(**kwargs): - yield chunk -``` - -### 7. 
Configuration Management
-
-**Purpose**: Load and manage configuration from TOML files and environment variables
-
-**Interface**:
-```python
-class ConfigManager:
-    @staticmethod
-    def load_config() -> Config:
-        # Load from config.toml and environment variables
-        pass
-
-    @staticmethod
-    def get_agent_config(agent_id: str) -> AgentConfiguration:
-        # Load agent-specific configuration
-        pass
-```
-
-## Data Models
-
-### Core Data Structures
-
-```python
-@dataclass
-class ProcessedQuery:
-    original: str
-    transformed: Union[str, List[str]]
-    is_contract_related: bool = False
-    is_test_related: bool = False
-    resources: List[DocumentSource] = field(default_factory=list)
-
-@dataclass
-class Document:
-    page_content: str
-    metadata: Dict[str, Any]
-
-@dataclass
-class RagInput:
-    query: str
-    chat_history: List[Message]
-    sources: Union[DocumentSource, List[DocumentSource]]
-
-@dataclass
-class StreamEvent:
-    type: str  # "sources", "response", "end", "error"
-    data: Any
-
-@dataclass
-class RagSearchConfig:
-    name: str
-    vector_store: VectorStore
-    contract_template: Optional[str] = None
-    test_template: Optional[str] = None
-    max_source_count: int = 10
-    similarity_threshold: float = 0.4
-    sources: Union[DocumentSource, List[DocumentSource]] = None
-    retrieval_program: dspy.Module = None
-    generation_program: dspy.Module = None
-
-class DocumentSource(Enum):
-    CAIRO_BOOK = "cairo_book"
-    STARKNET_DOCS = "starknet_docs"
-    STARKNET_FOUNDRY = "starknet_foundry"
-    CAIRO_BY_EXAMPLE = "cairo_by_example"
-    OPENZEPPELIN_DOCS = "openzeppelin_docs"
-    CORELIB_DOCS = "corelib_docs"
-    SCARB_DOCS = "scarb_docs"
-```
-## Error Handling
-
-### Error Categories
-
-1. **Configuration Errors**: Missing API keys, invalid agent IDs
-2. **Database Errors**: Connection failures, query errors
-3. **LLM Provider Errors**: Rate limits, API failures
-4. **Validation Errors**: Invalid input parameters
-5. **Processing Errors**: Pipeline execution failures
-
-### Error Response Format
-
-```python
-@dataclass
-class ErrorResponse:
-    type: str  # "configuration_error", "database_error", etc.
- message: str - details: Optional[Dict[str, Any]] = None - timestamp: datetime = field(default_factory=datetime.now) -``` - -## Testing Strategy - -### Unit Testing with DSPy - -**Testing DSPy Modules**: -```python -import pytest -import dspy -from unittest.mock import Mock, patch - -class TestQueryProcessor: - @pytest.fixture - def mock_lm(self): - """Configure DSPy with a mock LM for testing.""" - mock = Mock() - mock.return_value = dspy.Prediction( - search_terms=["cairo", "contract", "storage"], - resources=["cairo_book", "starknet_docs"] - ) - dspy.configure(lm=mock) - return mock - - def test_query_processing(self, mock_lm): - """Test query processor extracts correct search terms.""" - processor = QueryProcessor(retrieval_program) - result = processor( - query="How do I define storage in a Cairo contract?", - chat_history="" - ) - - assert result.is_contract_related == True - assert "cairo_book" in [r.value for r in result.resources] - assert len(result.transformed) > 0 - -class TestDocumentRetriever: - @pytest.mark.asyncio - async def test_document_ranking(self): - """Test document reranking by similarity.""" - # Mock vector store - mock_store = Mock() - mock_store.similarity_search.return_value = [ - Document(page_content="Cairo storage guide", metadata={"score": 0.9}), - Document(page_content="Irrelevant content", metadata={"score": 0.3}) - ] - - config = RagSearchConfig( - name="test", - vector_store=mock_store, - similarity_threshold=0.5 - ) - - retriever = DocumentRetriever(config) - # Test retrieval and ranking - # ... -``` - -**Testing with DSPy Assertions**: -```python -def test_generation_quality(): - """Test generation produces valid Cairo code.""" - # Create test examples - examples = [ - dspy.Example( - query="Write a simple Cairo contract", - context="Cairo contracts use #[contract] attribute...", - answer="#[contract]\nmod SimpleContract {\n ..." 
- ).with_inputs("query", "context") - ] - - # Use DSPy's evaluation tools - evaluator = dspy.Evaluate( - devset=examples, - metric=cairo_code_validity_metric - ) - - score = evaluator(generation_program) - assert score > 0.8 # 80% accuracy threshold -``` - -### Integration Testing - -**End-to-End Pipeline Test**: -```python -@pytest.mark.integration -class TestRagPipeline: - async def test_full_pipeline_flow(self): - """Test complete RAG pipeline execution.""" - # Configure test environment - dspy.configure(lm=dspy.LM("openai/gpt-3.5-turbo", api_key="test")) - - # Create pipeline with test config - config = RagSearchConfig( - name="test_agent", - vector_store=test_vector_store, - retrieval_program=retrieval_program, - generation_program=generation_program - ) - - pipeline = RagPipeline(config) - - # Execute pipeline - events = [] - async for event in pipeline.forward( - query="How to create a Cairo contract?", - chat_history=[] - ): - events.append(event) - - # Verify event sequence - assert events[0].type == "sources" - assert any(e.type == "response" for e in events) - assert events[-1].type == "end" -``` - -### Performance Testing with DSPy - -**Optimization and Benchmarking**: -```python -class PerformanceTests: - def test_pipeline_optimization(self): - """Test and optimize pipeline performance.""" - # Create training set for optimization - trainset = load_cairo_training_examples() - - # Optimize with MIPROv2 - optimizer = dspy.MIPROv2( - metric=cairo_accuracy_metric, - auto="light" # Fast optimization for testing - ) - - # Measure optimization time - start_time = time.time() - optimized = optimizer.compile( - pipeline, - trainset=trainset[:50] # Subset for testing - ) - optimization_time = time.time() - start_time - - assert optimization_time < 300 # Should complete within 5 minutes - - # Benchmark optimized vs unoptimized - unopt_score = evaluate_pipeline(pipeline, testset) - opt_score = evaluate_pipeline(optimized, testset) - - assert opt_score > unopt_score # Optimization should improve performance - - @pytest.mark.benchmark - def test_request_throughput(self, benchmark): - """Benchmark request processing throughput.""" - pipeline = create_test_pipeline() - - async def process_request(): - async for _ in pipeline.forward( - query="Simple Cairo query", - chat_history=[] - ): - pass - - # Run benchmark - result = benchmark(asyncio.run, process_request) - - # Assert performance requirements - assert result.stats['mean'] < 2.0 # Average < 2 seconds -``` - -### Mock Strategies for DSPy - -```python -class MockDSPyLM: - """Mock LM for testing without API calls.""" - - def __init__(self, responses: Dict[str, Any]): - self.responses = responses - self.call_count = 0 - - def __call__(self, prompt: str, **kwargs): - self.call_count += 1 - # Return predetermined responses based on prompt content - for key, response in self.responses.items(): - if key in prompt: - return dspy.Prediction(**response) - return dspy.Prediction(answer="Default response") - -# Usage in tests -def test_with_mock_lm(): - mock_lm = MockDSPyLM({ - "storage": {"search_terms": ["storage", "variable"], "resources": ["cairo_book"]}, - "contract": {"answer": "#[contract]\nmod Example {...}"} - }) - - dspy.configure(lm=mock_lm) - # Run tests... 
-``` \ No newline at end of file diff --git a/.kiro/specs/agents-python-port/requirements.md b/.kiro/specs/agents-python-port/requirements.md deleted file mode 100644 index 0bb3d89f..00000000 --- a/.kiro/specs/agents-python-port/requirements.md +++ /dev/null @@ -1,341 +0,0 @@ -# Requirements Document - -## Introduction - -This document outlines the requirements for porting the Cairo Coder agents package from TypeScript to Python while maintaining compatibility with the existing backend and ingester components. The agents package implements a Retrieval-Augmented Generation (RAG) system specifically designed for Cairo programming language assistance, featuring multi-step AI workflows for query processing, document retrieval, and answer generation. - -## Requirements - -### Requirement 1: Microservice Communication Interface - -**User Story:** As a backend developer, I want the Python agents to run as a separate microservice that communicates with the TypeScript backend, so that I can leverage Python's AI ecosystem while maintaining the existing backend architecture. - -#### Acceptance Criteria - -1. WHEN the backend needs agent processing THEN it SHALL communicate with the Python microservice via HTTP/WebSocket API -2. WHEN the Python service processes a request THEN it SHALL stream responses back to the TypeScript backend in real-time -3. WHEN the agent processes a request THEN it SHALL send events with the same structure: `{'type': 'sources', 'data': documents}` and `{'type': 'response', 'data': content}` -4. WHEN the agent completes processing THEN it SHALL send an 'end' event -5. WHEN an error occurs THEN the agent SHALL send an 'error' event with error details -6. WHEN the TypeScript backend receives events THEN it SHALL convert them to EventEmitter events for backward compatibility - -### Requirement 2: RAG Pipeline Implementation - -**User Story:** As a system architect, I want the Python implementation to maintain the same RAG pipeline structure, so that the system behavior remains consistent. - -#### Acceptance Criteria - -1. WHEN a query is received THEN the system SHALL execute a three-stage pipeline: Query Processing → Document Retrieval → Answer Generation -2. WHEN processing a query THEN the system SHALL use the QueryProcessorProgram to transform the original query into search terms and identify relevant resources -3. WHEN retrieving documents THEN the system SHALL use the DocumentRetrieverProgram to fetch, rerank, and filter documents based on similarity thresholds -4. WHEN generating responses THEN the system SHALL use context from retrieved documents to generate Cairo-specific code solutions -5. WHEN in MCP mode THEN the system SHALL return raw document content instead of generated responses - -### Requirement 3: Agent Configuration System - -**User Story:** As a system administrator, I want to configure different agents with specific capabilities, so that I can provide specialized assistance for different use cases. - -#### Acceptance Criteria - -1. WHEN an agent is requested by ID THEN the system SHALL load the corresponding configuration including sources, templates, and parameters -2. WHEN no agent ID is provided THEN the system SHALL use the default 'cairo-coder' agent configuration -3. WHEN configuring an agent THEN the system SHALL support specifying document sources (cairo_book, starknet_docs, etc.), similarity thresholds, and maximum source counts -4. WHEN using agent templates THEN the system SHALL support contract and test templates for context enhancement -5. 
WHEN multiple agents are defined THEN the system SHALL support agent-specific retrieval and generation programs - -### Requirement 4: Vector Store Integration - -**User Story:** As a developer, I want the Python agents to integrate with the existing PostgreSQL vector store, so that document retrieval remains consistent. - -#### Acceptance Criteria - -1. WHEN performing similarity search THEN the system SHALL query the PostgreSQL vector store using the same table structure and indices -2. WHEN filtering by document sources THEN the system SHALL support filtering by DocumentSource enum values -3. WHEN computing embeddings THEN the system SHALL use the same embedding model (OpenAI text-embedding-3-large) for consistency -4. WHEN reranking documents THEN the system SHALL compute cosine similarity and filter by configurable thresholds -5. WHEN handling database errors THEN the system SHALL provide appropriate error handling and logging - -### Requirement 5: DSPy Framework Integration - -**User Story:** As an AI developer, I want the Python implementation to use the DSPy framework for structured AI programming, so that I can build modular and optimizable AI components instead of managing brittle prompt strings. - -#### Acceptance Criteria - -1. WHEN implementing AI components THEN the system SHALL use DSPy modules (Predict, ChainOfThought, ProgramOfThought) with structured signatures -2. WHEN defining signatures THEN the system SHALL use `dspy.Signature` classes with `InputField` and `OutputField` specifications: - ```python - class QueryTransformation(dspy.Signature): - """Transform a user query into search terms and identify relevant documentation sources.""" - chat_history = dspy.InputField(desc="Previous conversation context") - query = dspy.InputField(desc="User's Cairo programming question") - search_terms = dspy.OutputField(desc="List of search terms for retrieval") - resources = dspy.OutputField(desc="List of relevant documentation sources") - ``` -3. WHEN composing AI workflows THEN the system SHALL use `dspy.Module` base class and chain DSPy modules: - ```python - class RagPipeline(dspy.Module): - def __init__(self, config): - super().__init__() - self.query_processor = dspy.ChainOfThought(QueryTransformation) - self.document_retriever = DocumentRetriever(config) - self.answer_generator = dspy.ChainOfThought(AnswerGeneration) - - def forward(self, query, history): - # Chain modules together - processed = self.query_processor(query=query, chat_history=history) - docs = self.document_retriever(processed_query=processed, sources=processed.resources) - answer = self.answer_generator(query=query, context=docs, chat_history=history) - return answer - ``` -4. WHEN optimizing performance THEN the system SHALL support DSPy teleprompters (optimizers): - ```python - # Use MIPROv2 for automatic prompt optimization - optimizer = dspy.MIPROv2(metric=cairo_accuracy_metric, auto="medium") - optimized_pipeline = optimizer.compile( - program=rag_pipeline, - trainset=cairo_examples, - requires_permission_to_run=False - ) - - # Or use BootstrapFewShot for simpler optimization - optimizer = dspy.BootstrapFewShot(metric=cairo_accuracy_metric, max_bootstrapped_demos=4) - optimized_pipeline = optimizer.compile(rag_pipeline, trainset=cairo_examples) - ``` -5. 
WHEN saving/loading programs THEN the system SHALL use DSPy's serialization: - ```python - # Save optimized program with learned prompts and demonstrations - optimized_pipeline.save("optimized_cairo_rag.json") - - # Load for inference - pipeline = dspy.load("optimized_cairo_rag.json") - ``` - -### Requirement 6: Ax-to-DSPy Program Mapping - -**User Story:** As a system architect, I want each Ax Program from the TypeScript implementation to map 1-to-1 to a DSPy module, so that the AI workflow logic remains equivalent between implementations. - -#### Acceptance Criteria - -1. WHEN implementing QueryProcessorProgram THEN it SHALL map to a DSPy module using ChainOfThought: - ```python - class QueryProcessor(dspy.Module): - def __init__(self, retrieval_program): - super().__init__() - self.retrieval_program = retrieval_program - - def forward(self, chat_history: str, query: str) -> ProcessedQuery: - # Use the retrieval program (mapped from retrieval.program.ts) - result = self.retrieval_program(chat_history=chat_history, query=query) - - # Build ProcessedQuery matching TypeScript structure - return ProcessedQuery( - original=query, - transformed=result.search_terms, - is_contract_related=self._check_contract_related(query), - is_test_related=self._check_test_related(query), - resources=self._validate_resources(result.resources) - ) - ``` - -2. WHEN implementing DocumentRetrieverProgram THEN it SHALL map to a DSPy module maintaining the three-step process: - ```python - class DocumentRetriever(dspy.Module): - def __init__(self, config: RagSearchConfig): - super().__init__() - self.config = config - self.vector_store = config.vector_store - self.embedder = dspy.Embedder(model="text-embedding-3-large") - - async def forward(self, processed_query: ProcessedQuery, sources: List[DocumentSource]): - # Step 1: Fetch documents (maps to fetchDocuments) - docs = await self.vector_store.similarity_search( - query=processed_query.original, - k=self.config.max_source_count, - sources=sources - ) - - # Step 2: Rerank documents (maps to rerankDocuments) - query_embedding = await self.embedder.embed([processed_query.original]) - ranked_docs = self._rerank_by_similarity(docs, query_embedding[0]) - - # Step 3: Attach sources (maps to attachSources) - return self._attach_metadata(ranked_docs) - ``` - -3. WHEN implementing GenerationProgram THEN it SHALL use DSPy's ChainOfThought with reasoning: - ```python - class CairoGeneration(dspy.Signature): - """Generate Cairo smart contract code based on context and query.""" - chat_history = dspy.InputField(desc="Previous conversation context") - query = dspy.InputField(desc="User's Cairo programming question") - context = dspy.InputField(desc="Retrieved documentation and examples") - answer = dspy.OutputField(desc="Cairo code solution with explanation") - - # Maps to generation.program.ts - generation_program = dspy.ChainOfThought( - CairoGeneration, - rationale_field=dspy.OutputField( - prefix="Reasoning: Let me analyze the Cairo requirements step by step." - ) - ) - ``` - -4. 
WHEN implementing specialized Scarb programs THEN they SHALL use domain-specific signatures: - ```python - class ScarbRetrieval(dspy.Signature): - """Extract search terms for Scarb build tool queries.""" - chat_history = dspy.InputField(desc="optional", default="") - query = dspy.InputField() - search_terms = dspy.OutputField(desc="Scarb-specific search terms") - resources = dspy.OutputField(desc="Always includes 'scarb_docs'") - - class ScarbGeneration(dspy.Signature): - """Generate Scarb configuration and command guidance.""" - chat_history = dspy.InputField() - query = dspy.InputField() - context = dspy.InputField(desc="Scarb documentation context") - answer = dspy.OutputField(desc="Scarb commands, TOML configs, or troubleshooting") - ``` - -5. WHEN loading optimized configurations THEN the system SHALL support JSON demos: - ```python - # Load TypeScript-generated optimization data - if os.path.exists("demos/generation_demos.json"): - with open("demos/generation_demos.json") as f: - demos = json.load(f) - generation_program.demos = [dspy.Example(**demo) for demo in demos] - ``` - -### Requirement 7: LLM Provider Integration - -**User Story:** As a system integrator, I want the Python implementation to support the same LLM providers and models through DSPy's LM interface, so that response quality remains consistent. - -#### Acceptance Criteria - -1. WHEN configuring LLM providers THEN the system SHALL use DSPy's unified LM interface: - ```python - # Configure different providers - openai_lm = dspy.LM(model="openai/gpt-4o", api_key=config.openai_key) - anthropic_lm = dspy.LM(model="anthropic/claude-3-5-sonnet", api_key=config.anthropic_key) - gemini_lm = dspy.LM(model="google/gemini-1.5-pro", api_key=config.gemini_key) - - # Set default LM for all DSPy modules - dspy.configure(lm=openai_lm) - ``` - -2. WHEN implementing model routing THEN the system SHALL support provider selection: - ```python - class LLMRouter: - def __init__(self, config: Config): - self.providers = { - "openai": dspy.LM(model=config.openai_model, api_key=config.openai_key), - "anthropic": dspy.LM(model=config.anthropic_model, api_key=config.anthropic_key), - "gemini": dspy.LM(model=config.gemini_model, api_key=config.gemini_key) - } - self.default_provider = config.default_provider - - def get_lm(self, provider: Optional[str] = None) -> dspy.LM: - provider = provider or self.default_provider - return self.providers.get(provider, self.providers[self.default_provider]) - ``` - -3. WHEN streaming responses THEN the system SHALL use DSPy's streaming capabilities: - ```python - from dspy.utils import streamify - - async def stream_generation(pipeline: dspy.Module, query: str, history: List[Message]): - # Enable streaming for the pipeline - streaming_pipeline = streamify(pipeline) - - async for chunk in streaming_pipeline(query=query, history=history): - yield {"type": "response", "data": chunk} - ``` - -4. WHEN tracking usage THEN the system SHALL leverage DSPy's built-in tracking: - ```python - # DSPy automatically tracks usage for each LM call - response = pipeline(query=query, history=history) - - # Access usage information - usage_info = dspy.inspect_history(n=1) - tokens_used = usage_info[-1].get("usage", {}).get("total_tokens", 0) - - # Log usage for monitoring - logger.info(f"Tokens used: {tokens_used}") - ``` - -5. 
WHEN handling errors THEN the system SHALL use DSPy's error handling: - ```python - try: - response = pipeline(query=query, history=history) - except dspy.errors.LMError as e: - # Handle LLM-specific errors (rate limits, API failures) - logger.error(f"LLM error: {e}") - - # Retry with exponential backoff (built into DSPy) - response = pipeline.forward_with_retry( - query=query, - history=history, - max_retries=3 - ) - ``` - -### Requirement 8: Cairo-Specific Intelligence - -**User Story:** As a Cairo developer, I want the agents to provide accurate Cairo programming assistance, so that I can get relevant help for my coding tasks. - -#### Acceptance Criteria - -1. WHEN processing Cairo queries THEN the system SHALL identify contract-related and test-related queries for specialized handling -2. WHEN generating code THEN the system SHALL produce syntactically correct Cairo code following language conventions -3. WHEN using templates THEN the system SHALL apply contract and test templates to enhance context for specific query types -4. WHEN handling non-Cairo queries THEN the system SHALL respond with appropriate redirection messages -5. WHEN providing examples THEN the system SHALL include proper imports, interface definitions, and implementation patterns - -### Requirement 9: Event-Driven Architecture - -**User Story:** As a backend developer, I want the Python agents to maintain the same event-driven pattern, so that streaming responses work correctly. - -#### Acceptance Criteria - -1. WHEN processing requests THEN the system SHALL emit events asynchronously to allow for streaming responses -2. WHEN sources are retrieved THEN the system SHALL emit a 'sources' event before generating responses -3. WHEN generating responses THEN the system SHALL emit incremental 'response' events for streaming -4. WHEN processing completes THEN the system SHALL emit an 'end' event to signal completion -5. WHEN errors occur THEN the system SHALL emit 'error' events with descriptive error messages - -### Requirement 10: Configuration Management - -**User Story:** As a system administrator, I want the Python implementation to use the same configuration system, so that deployment and management remain consistent. - -#### Acceptance Criteria - -1. WHEN loading configuration THEN the system SHALL read from the same TOML configuration files -2. WHEN accessing API keys THEN the system SHALL support the same environment variable and configuration file structure -3. WHEN configuring providers THEN the system SHALL support the same provider selection and model mapping logic -4. WHEN setting parameters THEN the system SHALL support the same similarity thresholds, source counts, and other tunable parameters -5. WHEN handling missing configuration THEN the system SHALL provide appropriate defaults and error messages - -### Requirement 11: Logging and Observability - -**User Story:** As a system operator, I want the Python implementation to provide the same logging and monitoring capabilities, so that I can troubleshoot issues effectively. - -#### Acceptance Criteria - -1. WHEN processing requests THEN the system SHALL log query processing steps with appropriate detail levels -2. WHEN tracking performance THEN the system SHALL log token usage, response times, and document retrieval metrics -3. WHEN errors occur THEN the system SHALL log detailed error information including stack traces and context -4. WHEN debugging THEN the system SHALL support debug-level logging for detailed pipeline execution traces -5. 
WHEN monitoring THEN the system SHALL provide metrics compatible with existing monitoring infrastructure - -### Requirement 12: Testing and Quality Assurance - -**User Story:** As a quality assurance engineer, I want comprehensive testing capabilities, so that I can ensure the Python port maintains the same quality and behavior. - -#### Acceptance Criteria - -1. WHEN running unit tests THEN the system SHALL provide test coverage for all major components and workflows -2. WHEN testing agent behavior THEN the system SHALL support mocking of LLM providers and vector stores -3. WHEN validating responses THEN the system SHALL include tests for Cairo code generation quality and accuracy -4. WHEN testing error handling THEN the system SHALL verify appropriate error responses for various failure scenarios -5. WHEN performing integration tests THEN the system SHALL validate end-to-end workflows with real or mock dependencies diff --git a/.kiro/specs/agents-python-port/tasks.md b/.kiro/specs/agents-python-port/tasks.md deleted file mode 100644 index d27613c8..00000000 --- a/.kiro/specs/agents-python-port/tasks.md +++ /dev/null @@ -1,142 +0,0 @@ -# Implementation Plan - -- [ ] 1. Set up Python project structure and core dependencies - - Create Python package structure with proper module organization - - Set up pyproject.toml with DSPy, FastAPI, asyncpg, and other core dependencies - - Use `uv` as package manager, build system - - Use context7 if you need to understand how UV works. - - Configure development environment with linting, formatting, and testing tools - - _Requirements: 1.1, 10.1_ - -- [ ] 2. Implement core data models and type definitions - - Create Pydantic models for Message, ProcessedQuery, Document, RagInput, StreamEvent - - Implement DocumentSource enum with all source types - - Define RagSearchConfig and AgentConfiguration dataclasses - - Add type hints and validation for all data structures - - _Requirements: 1.3, 6.1_ - -- [ ] 3. Create configuration management system - - Implement ConfigManager class to load TOML configuration files - - Add environment variable support for API keys and database credentials - - Create agent configuration loading with fallback to defaults - - Add configuration validation and error handling - - _Requirements: 10.1, 10.2, 10.5_ - -- [ ] 4. Implement PostgreSQL vector store integration - - Create VectorStore class with asyncpg connection pooling - - Implement similarity_search method with vector cosine similarity - - Add document insertion and batch processing capabilities - - Implement source filtering and metadata handling - - Add database error handling and connection management - - _Requirements: 4.1, 4.2, 4.3, 4.4_ - -- [ ] 5. Create LLM provider router and integration - - Implement LLMRouter class supporting OpenAI, Anthropic, and Google Gemini - - Add model selection logic based on configuration - - Implement streaming response support for real-time generation - - Add token tracking and usage monitoring - - Implement retry logic and error handling for provider failures - - _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5_ - -- [ ] 6. 
Implement DSPy QueryProcessorProgram - - Create QueryProcessorProgram as DSPy Module mapping from TypeScript version - - Define DSPy signature: "chat_history?, query -> search_terms, resources" - - Implement forward method to process queries and extract search terms - - Add Cairo/Starknet-specific query analysis logic - - Include few-shot examples for query processing optimization - - _Requirements: 2.2, 5.1, 6.1, 8.1_ - -- [ ] 7. Implement DSPy DocumentRetrieverProgram - - Create DocumentRetrieverProgram as DSPy Module for document retrieval - - Implement document fetching with multiple search terms - - Add document reranking using embedding similarity - - Implement source filtering and deduplication logic - - Add similarity threshold filtering and result limiting - - _Requirements: 2.3, 4.4, 6.2_ - -- [ ] 8. Implement DSPy GenerationProgram - - Create GenerationProgram using DSPy ChainOfThought for Cairo code generation - - Define signature: "chat_history?, query, context -> answer" - - Add Cairo-specific code generation instructions and examples - - Implement contract and test template integration - - Add streaming response support for incremental generation - - _Requirements: 2.4, 5.2, 6.3, 8.2, 8.3_ - -- [ ] 9. Create RAG Pipeline orchestration - - Implement RagPipeline class to orchestrate DSPy programs - - Add three-stage workflow: Query Processing → Document Retrieval → Generation - - Implement MCP mode for raw document return - - Add context building and template application logic - - Implement streaming event emission for real-time updates - - _Requirements: 2.1, 2.5, 9.1, 9.2, 9.3_ - -- [ ] 10. Implement Agent Factory - - Create AgentFactory class with static methods for agent creation - - Implement create_agent method for default agent configuration - - Add create_agent_by_id method for agent-specific configurations - - Load agent configurations and initialize RAG pipelines - - Add agent validation and error handling - - _Requirements: 3.1, 3.2, 3.3, 3.4_ - -- [ ] 11. Create FastAPI microservice server - - Set up FastAPI application with WebSocket support - - Implement /agents/process endpoint for agent requests - - Add request validation using Pydantic models - - Implement streaming response handling via WebSocket - - Add health check endpoints for monitoring - - _Requirements: 1.1, 1.2, 1.6_ - -- [ ] 12. Implement TypeScript backend integration layer - - Create Agent Factory Proxy in TypeScript to communicate with Python service - - Implement HTTP/WebSocket client for Python microservice communication - - Add EventEmitter adapter to convert streaming responses to events - - Modify existing chatCompletionHandler to use proxy instead of direct agent calls - - Maintain backward compatibility with existing API - - _Requirements: 1.1, 1.2, 1.6, 9.4_ - -- [ ] 13. Add comprehensive error handling and logging - - Implement structured error responses with appropriate HTTP status codes - - Add comprehensive logging for all pipeline stages - - Implement token usage tracking and performance metrics - - Add debug-level logging for troubleshooting - - Create error recovery mechanisms for transient failures - - _Requirements: 11.1, 11.2, 11.3, 11.4_ - -- [ ] 14. 
Create specialized agent implementations - - Implement Scarb Assistant agent with specialized retrieval and generation programs - - Add agent-specific DSPy program configurations - - Create agent templates for contract and test scenarios - - Add agent parameter customization (similarity thresholds, source counts) - - _Requirements: 3.3, 3.4, 6.4_ - -- [ ] 15. Implement comprehensive test suite - - Create unit tests for all DSPy programs with mocked LLM responses - - Add integration tests for complete RAG pipeline workflows - - Implement API endpoint tests for FastAPI server - - Create database integration tests with test PostgreSQL instance - - Add performance tests for throughput and latency measurement - - _Requirements: 12.1, 12.2, 12.3, 12.4, 12.5_ - -- [ ] 16. Add DSPy optimization and fine-tuning - - Implement DSPy optimizers (BootstrapRS, MIPROv2) for program improvement - - Create training datasets for few-shot learning optimization - - Add program compilation and optimization workflows - - Implement evaluation metrics for program performance - - Add automated optimization pipelines - - _Requirements: 5.4, 5.5_ - -- [ ] 17. Create deployment configuration and documentation - - Create Dockerfile for Python microservice containerization - - Add docker-compose configuration for local development - - Create deployment documentation with environment variable setup - - Add API documentation with OpenAPI/Swagger integration - - Create migration guide from TypeScript to Python implementation - - _Requirements: 10.3, 10.4_ - -- [ ] 18. Implement monitoring and observability - - Add Prometheus metrics for request counts, latencies, and error rates - - Implement distributed tracing for request flow monitoring - - Add health check endpoints for service monitoring - - Create alerting configuration for critical failures - - Add performance dashboards for system monitoring - - _Requirements: 11.5_ \ No newline at end of file diff --git a/design.md b/design.md deleted file mode 100644 index d82a1d00..00000000 --- a/design.md +++ /dev/null @@ -1,807 +0,0 @@ -# Design Document - -## Overview - -This document describes the design for porting the Cairo Coder agents package from TypeScript to Python using the DSPy framework. The design maintains the same RAG pipeline architecture while leveraging Python's AI ecosystem through a microservice approach that communicates with the existing TypeScript backend. 
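-
-As a rough sketch of how these pieces could fit together (illustrative only:
-`AgentFactory`, `RagPipeline`, `StreamEvent`, and the `vector_store` handle are
-the components specified later in this document, the request body is shown as a
-plain dict where the actual service would use Pydantic models, and the event
-wire format follows Requirement 1):
-
-```python
-import json
-
-import dspy
-from fastapi import FastAPI
-from fastapi.responses import StreamingResponse
-
-app = FastAPI()
-dspy.configure(lm=dspy.LM("openai/gpt-4o"))  # provider/model choice is illustrative
-
-@app.post("/agents/process")
-async def process_agent_request(body: dict):
-    # AgentFactory and vector_store are assumed to be initialized at startup.
-    pipeline = AgentFactory.create_agent(
-        query=body["query"],
-        history=body.get("chat_history", []),
-        vector_store=vector_store,
-        mcp_mode=body.get("mcp_mode", False),
-    )
-
-    async def event_stream():
-        async for event in pipeline.forward(
-            query=body["query"],
-            chat_history=body.get("chat_history", []),
-            mcp_mode=body.get("mcp_mode", False),
-        ):
-            # Assumes event.data is JSON-serializable; Document objects would
-            # need a dict conversion first.
-            yield json.dumps({"type": event.type, "data": event.data}) + "\n"
-
-    return StreamingResponse(event_stream(), media_type="application/x-ndjson")
-```
-
-The NDJSON framing above is just one way to satisfy the streaming contract; a
-WebSocket variant would reuse the same `event_stream` generator.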
- -## Architecture - -### High-Level Architecture - -```mermaid -graph TB - subgraph "TypeScript Backend" - A[Chat Completion Handler] --> B[Agent Factory Proxy] - B --> C[HTTP/WebSocket Client] - C --> D[Event Emitter Adapter] - end - - subgraph "Python Microservice" - E[FastAPI Server] --> F[Agent Factory] - F --> G[RAG Pipeline] - G --> H[Query Processor] - G --> I[Document Retriever] - G --> J[Response Generator] - end - - subgraph "Shared Infrastructure" - K[PostgreSQL Vector Store] - L[LLM Providers] - M[Configuration Files] - end - - C <--> E - I --> K - H --> L - J --> L - F --> M -``` - -### Communication Flow - -```mermaid -sequenceDiagram - participant TS as TypeScript Backend - participant PY as Python Microservice - participant VS as Vector Store - participant LLM as LLM Provider - - TS->>PY: POST /agents/process (query, history, agentId, mcpMode) - PY->>PY: Load Agent Configuration - PY->>LLM: Process Query (DSPy QueryProcessor) - PY->>VS: Similarity Search - PY->>PY: Rerank Documents - PY-->>TS: Stream: {"type": "sources", "data": [...]} - - alt MCP Mode - PY-->>TS: Stream: {"type": "response", "data": "raw_documents"} - else Normal Mode - PY->>LLM: Generate Response (DSPy Generator) - loop Streaming Response - PY-->>TS: Stream: {"type": "response", "data": "chunk"} - end - end - - PY-->>TS: Stream: {"type": "end"} -``` -## Components and Interfaces - -### 1. FastAPI Microservice Server - -**Purpose**: HTTP/WebSocket server that handles requests from TypeScript backend - -**Interface**: -```python -class AgentServer: - async def process_agent_request( - self, - query: str, - chat_history: List[Message], - agent_id: Optional[str] = None, - mcp_mode: bool = False - ) -> AsyncGenerator[Dict[str, Any], None] -``` - -**Key Features**: -- WebSocket support for real-time streaming -- Request validation and error handling -- CORS configuration for cross-origin requests -- Health check endpoints - -### 2. Agent Factory - -**Purpose**: Creates and configures agents based on agent ID or default configuration - -**Interface**: -```python -class AgentFactory: - @staticmethod - def create_agent( - query: str, - history: List[Message], - vector_store: VectorStore, - mcp_mode: bool = False - ) -> RagPipeline - - @staticmethod - async def create_agent_by_id( - query: str, - history: List[Message], - agent_id: str, - vector_store: VectorStore, - mcp_mode: bool = False - ) -> RagPipeline -``` - -### 3. 
RAG Pipeline (DSPy-based) - -**Purpose**: Orchestrates the three-stage RAG workflow using DSPy modules - -**Interface**: -```python -class RagPipeline(dspy.Module): - """Main pipeline that chains query processing, retrieval, and generation.""" - - def __init__(self, config: RagSearchConfig): - super().__init__() - self.config = config - - # Initialize DSPy modules for each stage - self.query_processor = QueryProcessor(config.retrieval_program) - self.document_retriever = DocumentRetriever(config) - self.response_generator = config.generation_program - - async def forward( - self, - query: str, - chat_history: List[Message], - mcp_mode: bool = False - ) -> AsyncGenerator[StreamEvent, None]: - """Execute the RAG pipeline with streaming support.""" - - # Stage 1: Process query - processed_query = self.query_processor( - query=query, - chat_history=self._format_history(chat_history) - ) - - # Stage 2: Retrieve documents - documents = await self.document_retriever( - processed_query=processed_query, - sources=self.config.sources - ) - - # Emit sources event - yield StreamEvent(type="sources", data=documents) - - if mcp_mode: - # Return raw documents in MCP mode - yield StreamEvent(type="response", data=self._format_documents(documents)) - else: - # Stage 3: Generate response - context = self._prepare_context(documents) - response = self.response_generator( - query=query, - chat_history=self._format_history(chat_history), - context=context - ) - - # Stream response chunks - for chunk in self._chunk_response(response.answer): - yield StreamEvent(type="response", data=chunk) - - yield StreamEvent(type="end", data=None) -``` -### 4. DSPy Program Mappings - -#### Query Processing Components - -**Retrieval Signature** (maps from retrieval.program.ts): -```python -class CairoQueryAnalysis(dspy.Signature): - """Analyze a Cairo programming query to extract search terms and identify relevant documentation sources.""" - - chat_history = dspy.InputField( - desc="Previous conversation context, may be empty", - default="" - ) - query = dspy.InputField( - desc="User's Cairo/Starknet programming question" - ) - search_terms = dspy.OutputField( - desc="List of specific search terms to find relevant documentation" - ) - resources = dspy.OutputField( - desc="List of documentation sources from: cairo_book, starknet_docs, starknet_foundry, cairo_by_example, openzeppelin_docs, corelib_docs, scarb_docs" - ) - -# Create the retrieval program -retrieval_program = dspy.ChainOfThought(CairoQueryAnalysis) -``` - -**QueryProcessor Module** (maps from queryProcessor.program.ts): -```python -class QueryProcessor(dspy.Module): - """Processes user queries into structured format for retrieval.""" - - def __init__(self, retrieval_program: dspy.Module): - super().__init__() - self.retrieval_program = retrieval_program - - def forward(self, query: str, chat_history: str = "") -> ProcessedQuery: - # Execute the retrieval program - result = self.retrieval_program( - query=query, - chat_history=chat_history - ) - - # Build ProcessedQuery matching TypeScript structure - return ProcessedQuery( - original=query, - transformed=result.search_terms, - is_contract_related=self._is_contract_query(query), - is_test_related=self._is_test_query(query), - resources=self._validate_resources(result.resources) - ) - - def _is_contract_query(self, query: str) -> bool: - """Check if query is about smart contracts.""" - contract_keywords = ['contract', 'interface', 'trait', 'impl', 'storage'] - return any(kw in query.lower() for kw in 
contract_keywords) - - def _is_test_query(self, query: str) -> bool: - """Check if query is about testing.""" - test_keywords = ['test', 'testing', 'assert', 'mock', 'fixture'] - return any(kw in query.lower() for kw in test_keywords) - - def _validate_resources(self, resources: List[str]) -> List[DocumentSource]: - """Validate and convert resource strings to DocumentSource enum.""" - valid_resources = [] - for r in resources: - try: - valid_resources.append(DocumentSource(r)) - except ValueError: - continue - return valid_resources or [DocumentSource.CAIRO_BOOK] # Default fallback -``` - -#### Document Retrieval Component - -**DocumentRetriever Module** (maps from documentRetriever.program.ts): -```python -class DocumentRetriever(dspy.Module): - """Retrieves and ranks relevant documents from vector store.""" - - def __init__(self, config: RagSearchConfig): - super().__init__() - self.config = config - self.vector_store = config.vector_store - self.embedder = dspy.Embedder(model="text-embedding-3-large") - - async def forward( - self, - processed_query: ProcessedQuery, - sources: List[DocumentSource] - ) -> List[Document]: - """Three-step retrieval process: fetch, rerank, attach metadata.""" - - # Step 1: Fetch documents (maps to fetchDocuments) - docs = await self._fetch_documents(processed_query, sources) - - # Step 2: Rerank documents (maps to rerankDocuments) - if docs: - docs = await self._rerank_documents(processed_query.original, docs) - - # Step 3: Attach sources (maps to attachSources) - return self._attach_sources(docs) - - async def _fetch_documents( - self, - processed_query: ProcessedQuery, - sources: List[DocumentSource] - ) -> List[Document]: - """Fetch documents from vector store.""" - return await self.vector_store.similarity_search( - query=processed_query.original, - k=self.config.max_source_count, - sources=sources - ) - - async def _rerank_documents( - self, - query: str, - docs: List[Document] - ) -> List[Document]: - """Rerank documents by cosine similarity.""" - # Get embeddings - query_embedding = await self.embedder.embed([query]) - doc_texts = [d.page_content for d in docs] - doc_embeddings = await self.embedder.embed(doc_texts) - - # Calculate similarities - similarities = [] - for doc_emb in doc_embeddings: - similarity = self._cosine_similarity(query_embedding[0], doc_emb) - similarities.append(similarity) - - # Filter by threshold and sort - ranked_docs = [ - (doc, sim) for doc, sim in zip(docs, similarities) - if sim >= self.config.similarity_threshold - ] - ranked_docs.sort(key=lambda x: x[1], reverse=True) - - return [doc for doc, _ in ranked_docs[:self.config.max_source_count]] - - def _cosine_similarity(self, a: List[float], b: List[float]) -> float: - """Calculate cosine similarity between two vectors.""" - import numpy as np - return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) - - def _attach_sources(self, docs: List[Document]) -> List[Document]: - """Attach metadata like title and URL to documents.""" - for doc in docs: - # Add source metadata based on document source - source = doc.metadata.get('source', '') - doc.metadata['title'] = self._get_title(doc) - doc.metadata['url'] = self._get_url(doc) - return docs -``` - -#### Generation Components - -**Cairo Generation Signature** (maps from generation.program.ts): -```python -class CairoCodeGeneration(dspy.Signature): - """Generate Cairo smart contract code based on context and user query.""" - - chat_history = dspy.InputField( - desc="Previous conversation context for continuity" - ) - 
query = dspy.InputField( - desc="User's specific Cairo programming question or request" - ) - context = dspy.InputField( - desc="Retrieved Cairo documentation, examples, and relevant information" - ) - answer = dspy.OutputField( - desc="Complete Cairo code solution with explanations, following Cairo syntax and best practices" - ) - -# Create generation program with Chain of Thought reasoning -generation_program = dspy.ChainOfThought( - CairoCodeGeneration, - rationale_field=dspy.OutputField( - prefix="Reasoning: Let me analyze the Cairo requirements step by step.", - desc="Step-by-step analysis of the Cairo programming task" - ) -) -``` - -**Scarb-specific Programs** (maps from scarb-*.program.ts): -```python -class ScarbQueryAnalysis(dspy.Signature): - """Analyze Scarb build tool queries to extract relevant search terms.""" - - chat_history = dspy.InputField(desc="Previous conversation", default="") - query = dspy.InputField(desc="User's Scarb-related question") - search_terms = dspy.OutputField( - desc="Scarb-specific search terms (commands, configuration, dependencies)" - ) - resources = dspy.OutputField( - desc="Always includes 'scarb_docs' as primary source" - ) - -class ScarbGeneration(dspy.Signature): - """Generate Scarb configuration, commands, and troubleshooting guidance.""" - - chat_history = dspy.InputField(desc="Previous conversation") - query = dspy.InputField(desc="User's Scarb question") - context = dspy.InputField(desc="Scarb documentation and examples") - answer = dspy.OutputField( - desc="Scarb commands, TOML configurations, or troubleshooting steps with proper formatting" - ) - -# Create Scarb-specific programs -scarb_retrieval_program = dspy.ChainOfThought(ScarbQueryAnalysis) -scarb_generation_program = dspy.ChainOfThought(ScarbGeneration) -``` - -#### Loading Optimized Configurations - -```python -def load_optimized_programs(programs_dir: str = "optimized_programs"): - """Load DSPy programs with pre-optimized prompts and demonstrations.""" - - programs = {} - - # Load each optimized program - for program_name in ['retrieval', 'generation', 'scarb_retrieval', 'scarb_generation']: - program_path = os.path.join(programs_dir, f"{program_name}.json") - - if os.path.exists(program_path): - # Load optimized program with learned prompts and demos - programs[program_name] = dspy.load(program_path) - else: - # Fallback to base programs - if program_name == 'retrieval': - programs[program_name] = retrieval_program - elif program_name == 'generation': - programs[program_name] = generation_program - elif program_name == 'scarb_retrieval': - programs[program_name] = scarb_retrieval_program - elif program_name == 'scarb_generation': - programs[program_name] = scarb_generation_program - - return programs -``` -### 5. Vector Store Integration - -**Purpose**: Interface with PostgreSQL vector database for document retrieval - -**Interface**: -```python -class VectorStore: - def __init__(self, config: VectorStoreConfig): - self.pool = asyncpg.create_pool(...) - self.embedding_client = OpenAIEmbeddings() - - async def similarity_search( - self, - query: str, - k: int = 5, - sources: Optional[Union[DocumentSource, List[DocumentSource]]] = None - ) -> List[Document] - - async def add_documents( - self, - documents: List[Document], - ids: Optional[List[str]] = None - ) -> None -``` - -### 6. 
LLM Configuration with DSPy - -**Purpose**: Configure and manage multiple LLM providers through DSPy's unified interface - -**Implementation**: -```python -class LLMConfig: - """Manages LLM configuration for DSPy.""" - - @staticmethod - def configure_providers(config: Config) -> Dict[str, dspy.LM]: - """Configure all available LLM providers.""" - providers = {} - - # Configure OpenAI - if config.openai_api_key: - providers['openai'] = dspy.LM( - model=config.openai_model or "openai/gpt-4o", - api_key=config.openai_api_key, - temperature=config.temperature - ) - - # Configure Anthropic - if config.anthropic_api_key: - providers['anthropic'] = dspy.LM( - model=config.anthropic_model or "anthropic/claude-3-5-sonnet", - api_key=config.anthropic_api_key, - temperature=config.temperature - ) - - # Configure Google Gemini - if config.gemini_api_key: - providers['gemini'] = dspy.LM( - model=config.gemini_model or "google/gemini-1.5-pro", - api_key=config.gemini_api_key, - temperature=config.temperature - ) - - return providers - - @staticmethod - def set_default_lm(providers: Dict[str, dspy.LM], default: str = "openai"): - """Set the default LM for all DSPy operations.""" - if default in providers: - dspy.configure(lm=providers[default]) - elif providers: - # Fallback to first available provider - dspy.configure(lm=next(iter(providers.values()))) - else: - raise ValueError("No LLM providers configured") - -# Usage in initialization -class AgentInitializer: - def __init__(self, config: Config): - # Configure LLM providers - self.providers = LLMConfig.configure_providers(config) - LLMConfig.set_default_lm(self.providers, config.default_provider) - - # Configure embeddings separately if needed - self.embedder = dspy.Embedder( - model=config.embedding_model or "text-embedding-3-large", - api_key=config.openai_api_key # Embeddings typically use OpenAI - ) -``` - -**Streaming Support**: -```python -from dspy.utils import streamify - -class StreamingPipeline: - """Wrapper for streaming DSPy module responses.""" - - def __init__(self, module: dspy.Module): - self.module = module - self.streaming_module = streamify(module) - - async def stream_response( - self, - **kwargs - ) -> AsyncGenerator[str, None]: - """Stream response chunks from the module.""" - async for chunk in self.streaming_module(**kwargs): - yield chunk -``` - -### 7. 
Configuration Management
-
-**Purpose**: Load and manage configuration from TOML files and environment variables
-
-**Interface**:
-```python
-class ConfigManager:
-    @staticmethod
-    def load_config() -> Config:
-        # Load from config.toml and environment variables
-        pass
-
-    @staticmethod
-    def get_agent_config(agent_id: str) -> AgentConfiguration:
-        # Load agent-specific configuration
-        pass
-```
-
-## Data Models
-
-### Core Data Structures
-
-```python
-@dataclass
-class ProcessedQuery:
-    original: str
-    transformed: Union[str, List[str]]
-    is_contract_related: bool = False
-    is_test_related: bool = False
-    resources: List[DocumentSource] = field(default_factory=list)
-
-@dataclass
-class Document:
-    page_content: str
-    metadata: Dict[str, Any]
-
-@dataclass
-class RagInput:
-    query: str
-    chat_history: List[Message]
-    sources: Union[DocumentSource, List[DocumentSource]]
-
-@dataclass
-class StreamEvent:
-    type: str  # "sources", "response", "end", "error"
-    data: Any
-
-@dataclass
-class RagSearchConfig:
-    name: str
-    vector_store: VectorStore
-    contract_template: Optional[str] = None
-    test_template: Optional[str] = None
-    max_source_count: int = 10
-    similarity_threshold: float = 0.4
-    sources: Union[DocumentSource, List[DocumentSource]] = None
-    retrieval_program: dspy.Module = None
-    generation_program: dspy.Module = None
-
-class DocumentSource(Enum):
-    CAIRO_BOOK = "cairo_book"
-    STARKNET_DOCS = "starknet_docs"
-    STARKNET_FOUNDRY = "starknet_foundry"
-    CAIRO_BY_EXAMPLE = "cairo_by_example"
-    OPENZEPPELIN_DOCS = "openzeppelin_docs"
-    CORELIB_DOCS = "corelib_docs"
-    SCARB_DOCS = "scarb_docs"
-```
-
-## Error Handling
-
-### Error Categories
-
-1. **Configuration Errors**: Missing API keys, invalid agent IDs
-2. **Database Errors**: Connection failures, query errors
-3. **LLM Provider Errors**: Rate limits, API failures
-4. **Validation Errors**: Invalid input parameters
-5. **Processing Errors**: Pipeline execution failures
-
-### Error Response Format
-
-```python
-@dataclass
-class ErrorResponse:
-    type: str  # "configuration_error", "database_error", etc.
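-    # One of the category names listed above (e.g. "validation_error",
-    # "processing_error"); names beyond the two shown are extrapolated.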
- message: str - details: Optional[Dict[str, Any]] = None - timestamp: datetime = field(default_factory=datetime.now) -``` - -## Testing Strategy - -### Unit Testing with DSPy - -**Testing DSPy Modules**: -```python -import pytest -import dspy -from unittest.mock import Mock, patch - -class TestQueryProcessor: - @pytest.fixture - def mock_lm(self): - """Configure DSPy with a mock LM for testing.""" - mock = Mock() - mock.return_value = dspy.Prediction( - search_terms=["cairo", "contract", "storage"], - resources=["cairo_book", "starknet_docs"] - ) - dspy.configure(lm=mock) - return mock - - def test_query_processing(self, mock_lm): - """Test query processor extracts correct search terms.""" - processor = QueryProcessor(retrieval_program) - result = processor( - query="How do I define storage in a Cairo contract?", - chat_history="" - ) - - assert result.is_contract_related == True - assert "cairo_book" in [r.value for r in result.resources] - assert len(result.transformed) > 0 - -class TestDocumentRetriever: - @pytest.mark.asyncio - async def test_document_ranking(self): - """Test document reranking by similarity.""" - # Mock vector store - mock_store = Mock() - mock_store.similarity_search.return_value = [ - Document(page_content="Cairo storage guide", metadata={"score": 0.9}), - Document(page_content="Irrelevant content", metadata={"score": 0.3}) - ] - - config = RagSearchConfig( - name="test", - vector_store=mock_store, - similarity_threshold=0.5 - ) - - retriever = DocumentRetriever(config) - # Test retrieval and ranking - # ... -``` - -**Testing with DSPy Assertions**: -```python -def test_generation_quality(): - """Test generation produces valid Cairo code.""" - # Create test examples - examples = [ - dspy.Example( - query="Write a simple Cairo contract", - context="Cairo contracts use #[contract] attribute...", - answer="#[contract]\nmod SimpleContract {\n ..." 
-        ).with_inputs("query", "context")
-    ]
-
-    # Use DSPy's evaluation tools
-    evaluator = dspy.Evaluate(
-        devset=examples,
-        metric=cairo_code_validity_metric
-    )
-
-    score = evaluator(generation_program)
-    assert score > 0.8  # 80% accuracy threshold
-```
-
-### Integration Testing
-
-**End-to-End Pipeline Test**:
-```python
-@pytest.mark.integration
-class TestRagPipeline:
-    @pytest.mark.asyncio
-    async def test_full_pipeline_flow(self):
-        """Test complete RAG pipeline execution."""
-        # Configure test environment
-        dspy.configure(lm=dspy.LM("openai/gpt-3.5-turbo", api_key="test"))
-
-        # Create pipeline with test config
-        config = RagSearchConfig(
-            name="test_agent",
-            vector_store=test_vector_store,
-            retrieval_program=retrieval_program,
-            generation_program=generation_program
-        )
-
-        pipeline = RagPipeline(config)
-
-        # Execute pipeline
-        events = []
-        async for event in pipeline.forward(
-            query="How to create a Cairo contract?",
-            chat_history=[]
-        ):
-            events.append(event)
-
-        # Verify event sequence
-        assert events[0].type == "sources"
-        assert any(e.type == "response" for e in events)
-        assert events[-1].type == "end"
-```
-
-### Performance Testing with DSPy
-
-**Optimization and Benchmarking**:
-```python
-class PerformanceTests:
-    def test_pipeline_optimization(self):
-        """Test and optimize pipeline performance."""
-        # Create training set for optimization
-        trainset = load_cairo_training_examples()
-
-        # Optimize with MIPROv2
-        optimizer = dspy.MIPROv2(
-            metric=cairo_accuracy_metric,
-            auto="light"  # Fast optimization for testing
-        )
-
-        # Measure optimization time
-        start_time = time.time()
-        optimized = optimizer.compile(
-            pipeline,
-            trainset=trainset[:50]  # Subset for testing
-        )
-        optimization_time = time.time() - start_time
-
-        assert optimization_time < 300  # Should complete within 5 minutes
-
-        # Benchmark optimized vs unoptimized
-        unopt_score = evaluate_pipeline(pipeline, testset)
-        opt_score = evaluate_pipeline(optimized, testset)
-
-        assert opt_score > unopt_score  # Optimization should improve performance
-
-    @pytest.mark.benchmark
-    def test_request_throughput(self, benchmark):
-        """Benchmark request processing throughput."""
-        pipeline = create_test_pipeline()
-
-        async def process_request():
-            async for _ in pipeline.forward(
-                query="Simple Cairo query",
-                chat_history=[]
-            ):
-                pass
-
-        # Run benchmark (asyncio.run needs a coroutine object, so wrap the call)
-        benchmark(lambda: asyncio.run(process_request()))
-
-        # Assert performance requirements
-        assert benchmark.stats.stats.mean < 2.0  # Average < 2 seconds
-```
-
-### Mock Strategies for DSPy
-
-```python
-class MockDSPyLM:
-    """Mock LM for testing without API calls."""
-
-    def __init__(self, responses: Dict[str, Any]):
-        self.responses = responses
-        self.call_count = 0
-
-    def __call__(self, prompt: str, **kwargs):
-        self.call_count += 1
-        # Return predetermined responses based on prompt content
-        for key, response in self.responses.items():
-            if key in prompt:
-                return dspy.Prediction(**response)
-        return dspy.Prediction(answer="Default response")
-
-# Usage in tests
-def test_with_mock_lm():
-    mock_lm = MockDSPyLM({
-        "storage": {"search_terms": ["storage", "variable"], "resources": ["cairo_book"]},
-        "contract": {"answer": "#[contract]\nmod Example {...}"}
-    })
-
-    dspy.configure(lm=mock_lm)
-    # Run tests...
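-
-    # Hypothetical smoke check (not part of the original design): a prompt
-    # that mentions "contract" should hit the second canned response.
-    contract_pred = mock_lm("Show me a contract example")
-    assert contract_pred.answer.startswith("#[contract]")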
-```
\ No newline at end of file
diff --git a/requirements.md b/requirements.md
deleted file mode 100644
index 668d12f7..00000000
--- a/requirements.md
+++ /dev/null
@@ -1,341 +0,0 @@
WHEN multiple agents are defined THEN the system SHALL support agent-specific retrieval and generation programs - -### Requirement 4: Vector Store Integration - -**User Story:** As a developer, I want the Python agents to integrate with the existing PostgreSQL vector store, so that document retrieval remains consistent. - -#### Acceptance Criteria - -1. WHEN performing similarity search THEN the system SHALL query the PostgreSQL vector store using the same table structure and indices -2. WHEN filtering by document sources THEN the system SHALL support filtering by DocumentSource enum values -3. WHEN computing embeddings THEN the system SHALL use the same embedding model (OpenAI text-embedding-3-large) for consistency -4. WHEN reranking documents THEN the system SHALL compute cosine similarity and filter by configurable thresholds -5. WHEN handling database errors THEN the system SHALL provide appropriate error handling and logging - -### Requirement 5: DSPy Framework Integration - -**User Story:** As an AI developer, I want the Python implementation to use the DSPy framework for structured AI programming, so that I can build modular and optimizable AI components instead of managing brittle prompt strings. - -#### Acceptance Criteria - -1. WHEN implementing AI components THEN the system SHALL use DSPy modules (Predict, ChainOfThought, ProgramOfThought) with structured signatures -2. WHEN defining signatures THEN the system SHALL use `dspy.Signature` classes with `InputField` and `OutputField` specifications: - ```python - class QueryTransformation(dspy.Signature): - """Transform a user query into search terms and identify relevant documentation sources.""" - chat_history = dspy.InputField(desc="Previous conversation context") - query = dspy.InputField(desc="User's Cairo programming question") - search_terms = dspy.OutputField(desc="List of search terms for retrieval") - resources = dspy.OutputField(desc="List of relevant documentation sources") - ``` -3. WHEN composing AI workflows THEN the system SHALL use `dspy.Module` base class and chain DSPy modules: - ```python - class RagPipeline(dspy.Module): - def __init__(self, config): - super().__init__() - self.query_processor = dspy.ChainOfThought(QueryTransformation) - self.document_retriever = DocumentRetriever(config) - self.answer_generator = dspy.ChainOfThought(AnswerGeneration) - - def forward(self, query, history): - # Chain modules together - processed = self.query_processor(query=query, chat_history=history) - docs = self.document_retriever(processed_query=processed, sources=processed.resources) - answer = self.answer_generator(query=query, context=docs, chat_history=history) - return answer - ``` -4. WHEN optimizing performance THEN the system SHALL support DSPy teleprompters (optimizers): - ```python - # Use MIPROv2 for automatic prompt optimization - optimizer = dspy.MIPROv2(metric=cairo_accuracy_metric, auto="medium") - optimized_pipeline = optimizer.compile( - program=rag_pipeline, - trainset=cairo_examples, - requires_permission_to_run=False - ) - - # Or use BootstrapFewShot for simpler optimization - optimizer = dspy.BootstrapFewShot(metric=cairo_accuracy_metric, max_bootstrapped_demos=4) - optimized_pipeline = optimizer.compile(rag_pipeline, trainset=cairo_examples) - ``` -5. 
WHEN saving/loading programs THEN the system SHALL use DSPy's serialization: - ```python - # Save optimized program with learned prompts and demonstrations - optimized_pipeline.save("optimized_cairo_rag.json") - - # Load for inference - pipeline = dspy.load("optimized_cairo_rag.json") - ``` - -### Requirement 6: Ax-to-DSPy Program Mapping - -**User Story:** As a system architect, I want each Ax Program from the TypeScript implementation to map 1-to-1 to a DSPy module, so that the AI workflow logic remains equivalent between implementations. - -#### Acceptance Criteria - -1. WHEN implementing QueryProcessorProgram THEN it SHALL map to a DSPy module using ChainOfThought: - ```python - class QueryProcessor(dspy.Module): - def __init__(self, retrieval_program): - super().__init__() - self.retrieval_program = retrieval_program - - def forward(self, chat_history: str, query: str) -> ProcessedQuery: - # Use the retrieval program (mapped from retrieval.program.ts) - result = self.retrieval_program(chat_history=chat_history, query=query) - - # Build ProcessedQuery matching TypeScript structure - return ProcessedQuery( - original=query, - transformed=result.search_terms, - is_contract_related=self._check_contract_related(query), - is_test_related=self._check_test_related(query), - resources=self._validate_resources(result.resources) - ) - ``` - -2. WHEN implementing DocumentRetrieverProgram THEN it SHALL map to a DSPy module maintaining the three-step process: - ```python - class DocumentRetriever(dspy.Module): - def __init__(self, config: RagSearchConfig): - super().__init__() - self.config = config - self.vector_store = config.vector_store - self.embedder = dspy.Embedder(model="text-embedding-3-large") - - async def forward(self, processed_query: ProcessedQuery, sources: List[DocumentSource]): - # Step 1: Fetch documents (maps to fetchDocuments) - docs = await self.vector_store.similarity_search( - query=processed_query.original, - k=self.config.max_source_count, - sources=sources - ) - - # Step 2: Rerank documents (maps to rerankDocuments) - query_embedding = await self.embedder.embed([processed_query.original]) - ranked_docs = self._rerank_by_similarity(docs, query_embedding[0]) - - # Step 3: Attach sources (maps to attachSources) - return self._attach_metadata(ranked_docs) - ``` - -3. WHEN implementing GenerationProgram THEN it SHALL use DSPy's ChainOfThought with reasoning: - ```python - class CairoGeneration(dspy.Signature): - """Generate Cairo smart contract code based on context and query.""" - chat_history = dspy.InputField(desc="Previous conversation context") - query = dspy.InputField(desc="User's Cairo programming question") - context = dspy.InputField(desc="Retrieved documentation and examples") - answer = dspy.OutputField(desc="Cairo code solution with explanation") - - # Maps to generation.program.ts - generation_program = dspy.ChainOfThought( - CairoGeneration, - rationale_field=dspy.OutputField( - prefix="Reasoning: Let me analyze the Cairo requirements step by step." - ) - ) - ``` - -4. 
WHEN implementing specialized Scarb programs THEN they SHALL use domain-specific signatures: - ```python - class ScarbRetrieval(dspy.Signature): - """Extract search terms for Scarb build tool queries.""" - chat_history = dspy.InputField(desc="optional", default="") - query = dspy.InputField() - search_terms = dspy.OutputField(desc="Scarb-specific search terms") - resources = dspy.OutputField(desc="Always includes 'scarb_docs'") - - class ScarbGeneration(dspy.Signature): - """Generate Scarb configuration and command guidance.""" - chat_history = dspy.InputField() - query = dspy.InputField() - context = dspy.InputField(desc="Scarb documentation context") - answer = dspy.OutputField(desc="Scarb commands, TOML configs, or troubleshooting") - ``` - -5. WHEN loading optimized configurations THEN the system SHALL support JSON demos: - ```python - # Load TypeScript-generated optimization data - if os.path.exists("demos/generation_demos.json"): - with open("demos/generation_demos.json") as f: - demos = json.load(f) - generation_program.demos = [dspy.Example(**demo) for demo in demos] - ``` - -### Requirement 7: LLM Provider Integration - -**User Story:** As a system integrator, I want the Python implementation to support the same LLM providers and models through DSPy's LM interface, so that response quality remains consistent. - -#### Acceptance Criteria - -1. WHEN configuring LLM providers THEN the system SHALL use DSPy's unified LM interface: - ```python - # Configure different providers - openai_lm = dspy.LM(model="openai/gpt-4o", api_key=config.openai_key) - anthropic_lm = dspy.LM(model="anthropic/claude-3-5-sonnet", api_key=config.anthropic_key) - gemini_lm = dspy.LM(model="google/gemini-1.5-pro", api_key=config.gemini_key) - - # Set default LM for all DSPy modules - dspy.configure(lm=openai_lm) - ``` - -2. WHEN implementing model routing THEN the system SHALL support provider selection: - ```python - class LLMRouter: - def __init__(self, config: Config): - self.providers = { - "openai": dspy.LM(model=config.openai_model, api_key=config.openai_key), - "anthropic": dspy.LM(model=config.anthropic_model, api_key=config.anthropic_key), - "gemini": dspy.LM(model=config.gemini_model, api_key=config.gemini_key) - } - self.default_provider = config.default_provider - - def get_lm(self, provider: Optional[str] = None) -> dspy.LM: - provider = provider or self.default_provider - return self.providers.get(provider, self.providers[self.default_provider]) - ``` - -3. WHEN streaming responses THEN the system SHALL use DSPy's streaming capabilities: - ```python - from dspy.utils import streamify - - async def stream_generation(pipeline: dspy.Module, query: str, history: List[Message]): - # Enable streaming for the pipeline - streaming_pipeline = streamify(pipeline) - - async for chunk in streaming_pipeline(query=query, history=history): - yield {"type": "response", "data": chunk} - ``` - -4. WHEN tracking usage THEN the system SHALL leverage DSPy's built-in tracking: - ```python - # DSPy automatically tracks usage for each LM call - response = pipeline(query=query, history=history) - - # Access usage information - usage_info = dspy.inspect_history(n=1) - tokens_used = usage_info[-1].get("usage", {}).get("total_tokens", 0) - - # Log usage for monitoring - logger.info(f"Tokens used: {tokens_used}") - ``` - -5. 
### Requirement 8: Cairo-Specific Intelligence

**User Story:** As a Cairo developer, I want the agents to provide accurate Cairo programming assistance, so that I can get relevant help for my coding tasks.

#### Acceptance Criteria

1. WHEN processing Cairo queries THEN the system SHALL identify contract-related and test-related queries for specialized handling
2. WHEN generating code THEN the system SHALL produce syntactically correct Cairo code following language conventions
3. WHEN using templates THEN the system SHALL apply contract and test templates to enhance context for specific query types
4. WHEN handling non-Cairo queries THEN the system SHALL respond with appropriate redirection messages
5. WHEN providing examples THEN the system SHALL include proper imports, interface definitions, and implementation patterns

### Requirement 9: Event-Driven Architecture

**User Story:** As a backend developer, I want the Python agents to maintain the same event-driven pattern, so that streaming responses work correctly.

#### Acceptance Criteria

1. WHEN processing requests THEN the system SHALL emit events asynchronously to allow for streaming responses
2. WHEN sources are retrieved THEN the system SHALL emit a 'sources' event before generating responses
3. WHEN generating responses THEN the system SHALL emit incremental 'response' events for streaming
4. WHEN processing completes THEN the system SHALL emit an 'end' event to signal completion
5. WHEN errors occur THEN the system SHALL emit 'error' events with descriptive error messages

### Requirement 10: Configuration Management

**User Story:** As a system administrator, I want the Python implementation to use the same configuration system, so that deployment and management remain consistent.

#### Acceptance Criteria

1. WHEN loading configuration THEN the system SHALL read from the same TOML configuration files
2. WHEN accessing API keys THEN the system SHALL support the same environment variable and configuration file structure
3. WHEN configuring providers THEN the system SHALL support the same provider selection and model mapping logic
4. WHEN setting parameters THEN the system SHALL support the same similarity thresholds, source counts, and other tunable parameters
5. WHEN handling missing configuration THEN the system SHALL provide appropriate defaults and error messages
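As an illustration of criteria 1-2, a minimal sketch of TOML loading with environment-variable precedence. The file name and key layout are placeholders, since the real schema is whatever the TypeScript backend already shares:

```python
import os
import tomllib  # standard library in Python 3.11+
from pathlib import Path

def load_config(path: str = "config.toml") -> dict:
    """Read the shared TOML config, letting environment variables win for secrets."""
    config = tomllib.loads(Path(path).read_text())

    # Environment variables take precedence over file values,
    # mirroring the TypeScript backend's behavior
    config.setdefault("providers", {})
    if api_key := os.getenv("OPENAI_API_KEY"):
        config["providers"].setdefault("openai", {})["api_key"] = api_key
    return config
```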
### Requirement 11: Logging and Observability

**User Story:** As a system operator, I want the Python implementation to provide the same logging and monitoring capabilities, so that I can troubleshoot issues effectively.

#### Acceptance Criteria

1. WHEN processing requests THEN the system SHALL log query processing steps with appropriate detail levels
2. WHEN tracking performance THEN the system SHALL log token usage, response times, and document retrieval metrics
3. WHEN errors occur THEN the system SHALL log detailed error information including stack traces and context
4. WHEN debugging THEN the system SHALL support debug-level logging for detailed pipeline execution traces
5. WHEN monitoring THEN the system SHALL provide metrics compatible with existing monitoring infrastructure

### Requirement 12: Testing and Quality Assurance

**User Story:** As a quality assurance engineer, I want comprehensive testing capabilities, so that I can ensure the Python port maintains the same quality and behavior.

#### Acceptance Criteria

1. WHEN running unit tests THEN the system SHALL provide test coverage for all major components and workflows
2. WHEN testing agent behavior THEN the system SHALL support mocking of LLM providers and vector stores
3. WHEN validating responses THEN the system SHALL include tests for Cairo code generation quality and accuracy
4. WHEN testing error handling THEN the system SHALL verify appropriate error responses for various failure scenarios
5. WHEN performing integration tests THEN the system SHALL validate end-to-end workflows with real or mock dependencies

From bfc4058605d6c085cdc8bfa69d2d9e68e01223a7 Mon Sep 17 00:00:00 2001
From: enitrat
Date: Sun, 27 Jul 2025 13:24:46 +0200
Subject: [PATCH 3/3] feat: better markdown splitter

---
 .../src/ingesters/CairoBookIngester.ts        | 141 +---
 .../src/ingesters/CoreLibDocsIngester.ts      | 126 ++-
 .../src/utils/RecursiveMarkdownSplitter.ts    | 749 ++++++++++++++++++
 ...cursiveMarkdownSplitter.finalChunk.test.ts | 169 ++++
 ...RecursiveMarkdownSplitter.minChars.test.ts | 135 ++++
 ...iveMarkdownSplitter.reconstruction.test.ts | 433 ++++++++++
 .../RecursiveMarkdownSplitter.test.ts         | 544 +++++++++++++
 .../cairo_coder/optimizers/mcp_optimizer.py   |   7 +-
 8 files changed, 2121 insertions(+), 183 deletions(-)
 create mode 100644 packages/ingester/src/utils/RecursiveMarkdownSplitter.ts
 create mode 100644 packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.finalChunk.test.ts
 create mode 100644 packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.minChars.test.ts
 create mode 100644 packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts
 create mode 100644 packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts

diff --git a/packages/ingester/src/ingesters/CairoBookIngester.ts b/packages/ingester/src/ingesters/CairoBookIngester.ts
index 188b2d12..ebf14e2f 100644
--- a/packages/ingester/src/ingesters/CairoBookIngester.ts
+++ b/packages/ingester/src/ingesters/CairoBookIngester.ts
@@ -10,11 +10,11 @@ import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore';
 import { logger } from '@cairo-coder/agents/utils/index';
 import * as fs from 'fs/promises';
 import * as path from 'path';
+import { calculateHash } from '../utils/contentUtils';
 import {
-  addSectionWithSizeLimit,
-  calculateHash,
-  createAnchor,
-} from '../utils/contentUtils';
+  RecursiveMarkdownSplitter,
+  SplitOptions,
+} from '../utils/RecursiveMarkdownSplitter';

 /**
  * Ingester for the Cairo Book documentation
@@ -63,109 +63,50 @@ export class CairoBookIngester extends MarkdownIngester {
   }

   /**
-   * Chunk the core library summary file by H1 headers
+   * Chunk the Cairo Book summary file using RecursiveMarkdownSplitter
    *
-   * This function takes the markdown content and splits it into sections
-   * based on H1 headers (# Header). Each section becomes a separate chunk
-   * with its content hashed for uniqueness.
+   * This function takes the markdown content and splits it using a recursive
+   * strategy that respects headers, code blocks, and maintains overlap between chunks.
* * @param text - The markdown content to chunk - * @returns Promise[]> - Array of document chunks, one per H1 section + * @returns Promise[]> - Array of document chunks */ async chunkSummaryFile(text: string): Promise[]> { - const content = text; - const sections: ParsedSection[] = []; - - // We can't use a simple global regex, as it will incorrectly match commented - // lines inside code blocks. Instead, we'll parse line-by-line to find - // "real" headers, while keeping track of whether we're inside a code block. - - const realHeaders: { title: string; startIndex: number }[] = []; - const lines = content.split('\n'); - let inCodeBlock = false; - let charIndex = 0; - - for (const line of lines) { - // Toggle the state if we encounter a code block fence - if (line.trim().startsWith('```')) { - inCodeBlock = !inCodeBlock; - } - - // A real H1 header is a line that starts with '# ' and is NOT in a code block. - // We use a specific regex to ensure it's a proper H1. - const h1Match = line.match(/^#{1,2}\s+(.+)$/); - if (!inCodeBlock && h1Match) { - realHeaders.push({ - title: h1Match[1].trim(), - startIndex: charIndex, - }); - } - - // Move the character index forward, accounting for the newline character - charIndex += line.length + 1; - } + // Configure the splitter with appropriate settings + const splitOptions: SplitOptions = { + maxChars: 2048, + minChars: 500, + overlap: 256, + headerLevels: [1, 2], // Split on H1 and H2 headers + preserveCodeBlocks: true, + idPrefix: 'cairo-book', + trim: true, + }; - // If no H1 headers were found, treat the entire content as one section. - if (realHeaders.length === 0) { - logger.debug( - 'No H1 headers found, creating single section from entire content', - ); - addSectionWithSizeLimit( - sections, - 'Core Library Documentation', - content.trim(), - 20000, - createAnchor('Core Library Documentation'), - ); - } else { - // Process each valid H1 header found - for (let i = 0; i < realHeaders.length; i++) { - const header = realHeaders[i]; - const headerTitle = header.title; - const headerStartIndex = header.startIndex; - - // Determine the end of this section (start of next header or end of content) - const nextHeaderIndex = - i < realHeaders.length - 1 - ? 
realHeaders[i + 1].startIndex - : content.length; - - // Extract section content from the start of the header line to before the next header - const sectionContent = content - .slice(headerStartIndex, nextHeaderIndex) - .trim(); - - logger.debug(`Adding section: ${headerTitle}`); - - addSectionWithSizeLimit( - sections, - headerTitle, - sectionContent, - 20000, - createAnchor(headerTitle), - ); - } - } + // Create the splitter and split the content + const splitter = new RecursiveMarkdownSplitter(splitOptions); + const chunks = splitter.splitMarkdownToChunks(text); - const localChunks: Document[] = []; - - // Create a document for each section - sections.forEach((section: ParsedSection, index: number) => { - const hash: string = calculateHash(section.content); - localChunks.push( - new Document({ - pageContent: section.content, - metadata: { - name: section.title, - title: section.title, - chunkNumber: index, - contentHash: hash, - uniqueId: `${section.title}-${index}`, - sourceLink: ``, - source: this.source, // Using placeholder for 'this.source' - }, - }), - ); + logger.info( + `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`, + ); + + // Convert chunks to Document format + const localChunks: Document[] = chunks.map((chunk) => { + const contentHash = calculateHash(chunk.content); + + return new Document({ + pageContent: chunk.content, + metadata: { + name: chunk.meta.title, + title: chunk.meta.title, + chunkNumber: chunk.meta.chunkNumber, // Already 0-based + contentHash: contentHash, + uniqueId: chunk.meta.uniqueId, + sourceLink: '', + source: this.source, + }, + }); }); return localChunks; diff --git a/packages/ingester/src/ingesters/CoreLibDocsIngester.ts b/packages/ingester/src/ingesters/CoreLibDocsIngester.ts index 7162acee..0f78c358 100644 --- a/packages/ingester/src/ingesters/CoreLibDocsIngester.ts +++ b/packages/ingester/src/ingesters/CoreLibDocsIngester.ts @@ -2,19 +2,15 @@ import * as fs from 'fs/promises'; import * as path from 'path'; import { BookConfig } from '../utils/types'; import { MarkdownIngester } from './MarkdownIngester'; -import { - BookChunk, - DocumentSource, - ParsedSection, -} from '@cairo-coder/agents/types/index'; +import { BookChunk, DocumentSource } from '@cairo-coder/agents/types/index'; import { Document } from '@langchain/core/documents'; import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore'; import { logger } from '@cairo-coder/agents/utils/index'; +import { calculateHash } from '../utils/contentUtils'; import { - addSectionWithSizeLimit, - calculateHash, - createAnchor, -} from '../utils/contentUtils'; + RecursiveMarkdownSplitter, + SplitOptions, +} from '../utils/RecursiveMarkdownSplitter'; /** * Ingester for the Cairo Core Library documentation @@ -63,84 +59,54 @@ export class CoreLibDocsIngester extends MarkdownIngester { } /** - * Chunk the core library summary file by H1 headers + * Chunk the core library summary file using RecursiveMarkdownSplitter * - * This function takes the markdown content and splits it into sections - * based on H1 headers (# Header). Each section becomes a separate chunk - * with its content hashed for uniqueness. + * This function takes the markdown content and splits it using a recursive + * strategy that respects headers, code blocks, and maintains overlap between chunks. 
* * @param text - The markdown content to chunk - * @returns Promise[]> - Array of document chunks, one per H1 section + * @returns Promise[]> - Array of document chunks */ async chunkCorelibSummaryFile(text: string): Promise[]> { - const content = text; - const sections: ParsedSection[] = []; - - // Regex to match H1 headers (# Header) - const headerRegex = /^(#{1})\s+(.+)$/gm; - const matches = Array.from(content.matchAll(headerRegex)); - - let lastSectionEndIndex = 0; - - // Process each H1 header found - for (let i = 0; i < matches.length; i++) { - const match = matches[i]; - const headerTitle = match[2].trim(); - const headerStartIndex = match.index!; - - // Determine the end of this section (start of next header or end of content) - const nextHeaderIndex = - i < matches.length - 1 ? matches[i + 1].index! : content.length; - - // Extract section content from after the header to before the next header - const sectionContent = content - .slice(headerStartIndex, nextHeaderIndex) - .trim(); - - logger.debug(`Adding section: ${headerTitle}`); - - addSectionWithSizeLimit( - sections, - headerTitle, - sectionContent, - 20000, - createAnchor(headerTitle), - ); - } + logger.info( + 'Using RecursiveMarkdownSplitter to chunk Core Library documentation', + ); - // If no H1 headers found, treat the entire content as one section - if (sections.length === 0) { - logger.debug( - 'No H1 headers found, creating single section from entire content', - ); - addSectionWithSizeLimit( - sections, - 'Core Library Documentation', - content, - 20000, - createAnchor('Core Library Documentation'), - ); - } + // Configure the splitter with appropriate settings + const splitOptions: SplitOptions = { + maxChars: 2048, + minChars: 500, + overlap: 256, + headerLevels: [1, 2], // Split on H1 and H2 headers + preserveCodeBlocks: true, + idPrefix: 'corelib', + trim: true, + }; - const localChunks: Document[] = []; - - // Create a document for each section - sections.forEach((section: ParsedSection, index: number) => { - const hash: string = calculateHash(section.content); - localChunks.push( - new Document({ - pageContent: section.content, - metadata: { - name: section.title, - title: section.title, - chunkNumber: index, - contentHash: hash, - uniqueId: `${section.title}-${index}`, - sourceLink: ``, - source: this.source, - }, - }), - ); + // Create the splitter and split the content + const splitter = new RecursiveMarkdownSplitter(splitOptions); + const chunks = splitter.splitMarkdownToChunks(text); + + logger.info( + `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`, + ); + + // Convert chunks to Document format + const localChunks: Document[] = chunks.map((chunk) => { + const contentHash = calculateHash(chunk.content); + + return new Document({ + pageContent: chunk.content, + metadata: { + name: chunk.meta.title, + title: chunk.meta.title, + chunkNumber: chunk.meta.chunkNumber, // Already 0-based + contentHash: contentHash, + uniqueId: chunk.meta.uniqueId, + sourceLink: '', + source: this.source, + }, + }); }); return localChunks; diff --git a/packages/ingester/src/utils/RecursiveMarkdownSplitter.ts b/packages/ingester/src/utils/RecursiveMarkdownSplitter.ts new file mode 100644 index 00000000..56856f59 --- /dev/null +++ b/packages/ingester/src/utils/RecursiveMarkdownSplitter.ts @@ -0,0 +1,749 @@ +import { logger } from '@cairo-coder/agents/utils/index'; + +// Public API interfaces +export interface SplitOptions { + /** Maximum characters per chunk (UTF-16 .length), not counting overlap. 
Default: 2048 */ + maxChars?: number; + /** Minimum characters per chunk. Chunks smaller than this will be merged with adjacent chunks. Default: 500 */ + minChars?: number; + /** Characters of backward overlap between consecutive chunks. Default: 256 */ + overlap?: number; + /** Which header levels are allowed as primary split points. Default: [1, 2] */ + headerLevels?: (1 | 2)[]; + /** If true, do not split inside fenced code blocks. Default: true */ + preserveCodeBlocks?: boolean; + /** Optional prefix for generated unique IDs */ + idPrefix?: string; + /** Whether to trim whitespace around chunks. Default: true */ + trim?: boolean; +} + +export interface ChunkMeta { + /** Title derived from the last seen header among the configured levels */ + title: string; + /** Index of this chunk for the given title (0-based) */ + chunkNumber: number; + /** Globally unique ID: `${slug(title)}-${chunkNumber}` (plus idPrefix if provided) */ + uniqueId: string; + /** Inclusive start & exclusive end character offsets in the original string */ + startChar: number; + endChar: number; + /** Full header path stack (e.g., ["Intro", "Goals"]) */ + headerPath: string[]; +} + +export interface Chunk { + content: string; + meta: ChunkMeta; +} + +// Internal data structures +interface HeaderToken { + level: number; // 1..6 + text: string; + start: number; // index in original string + end: number; +} + +interface CodeBlockToken { + start: number; + end: number; + fence: '```' | '~~~'; + infoString?: string; // e.g. "ts", "python" +} + +interface Segment { + start: number; + end: number; +} + +interface Tokens { + headers: HeaderToken[]; + codeBlocks: CodeBlockToken[]; +} + +export class RecursiveMarkdownSplitter { + private readonly options: Required; + + constructor(options: SplitOptions = {}) { + this.options = { + maxChars: options.maxChars ?? 2048, + minChars: options.minChars ?? 500, + overlap: options.overlap ?? 256, + headerLevels: options.headerLevels ?? [1, 2], + preserveCodeBlocks: options.preserveCodeBlocks ?? true, + idPrefix: options.idPrefix ?? '', + trim: options.trim ?? 
true, + }; + + // Validate options + if (this.options.maxChars <= 0) { + throw new Error( + `maxChars must be positive, got ${this.options.maxChars}`, + ); + } + if (this.options.minChars < 0) { + throw new Error( + `minChars must be non-negative, got ${this.options.minChars}`, + ); + } + if (this.options.overlap < 0) { + throw new Error( + `overlap must be non-negative, got ${this.options.overlap}`, + ); + } + if (this.options.overlap >= this.options.maxChars) { + throw new Error( + `Overlap (${this.options.overlap}) must be less than maxChars (${this.options.maxChars})`, + ); + } + if (this.options.minChars >= this.options.maxChars) { + throw new Error( + `minChars (${this.options.minChars}) must be less than maxChars (${this.options.maxChars})`, + ); + } + if (this.options.headerLevels.length === 0) { + throw new Error('headerLevels must contain at least one level'); + } + if (this.options.headerLevels.some((level) => level < 1 || level > 6)) { + throw new Error('headerLevels must contain values between 1 and 6'); + } + } + + /** + * Main entry point to split markdown into chunks + */ + public splitMarkdownToChunks(markdown: string): Chunk[] { + // Handle empty input + if (!markdown || markdown.trim().length === 0) { + return []; + } + + // Normalize line endings + const normalizedMarkdown = markdown.replace(/\r\n/g, '\n'); + + // Tokenize the markdown + const tokens = this.tokenize(normalizedMarkdown); + + // Recursively split into segments + const rootSegment: Segment = { start: 0, end: normalizedMarkdown.length }; + const segments = this.recursivelySplit( + rootSegment, + normalizedMarkdown, + tokens, + ); + + // Merge small segments to avoid tiny chunks + const mergedSegments = this.mergeSmallSegments( + segments, + normalizedMarkdown, + tokens.codeBlocks, + ); + + // Apply overlap and assemble chunks + const rawChunks = this.assembleChunksWithOverlap( + mergedSegments, + normalizedMarkdown, + tokens.codeBlocks, + ); + + // Attach metadata + return this.attachMetadata(rawChunks, normalizedMarkdown, tokens.headers); + } + + /** + * Tokenize markdown to extract headers and code blocks + */ + private tokenize(markdown: string): Tokens { + const headers: HeaderToken[] = []; + const codeBlocks: CodeBlockToken[] = []; + + // Find all headers + const headerRegex = /^(#{1,6})\s+(.+?)(?:\s*#*)?$/gm; + let match: RegExpExecArray | null; + + while ((match = headerRegex.exec(markdown)) !== null) { + const level = match[1].length; + const text = match[2].trim(); + const start = match.index; + const end = match.index + match[0].length; + + headers.push({ level, text, start, end }); + } + + // Find all code blocks + this.findCodeBlocks(markdown, codeBlocks); + + // Filter out headers that are inside code blocks + const filteredHeaders = headers.filter((header) => { + return !codeBlocks.some( + (block) => header.start >= block.start && header.end <= block.end, + ); + }); + + return { headers: filteredHeaders, codeBlocks }; + } + + /** + * Find all fenced code blocks in the markdown + */ + private findCodeBlocks(markdown: string, codeBlocks: CodeBlockToken[]): void { + const lines = markdown.split('\n'); + let inCodeBlock = false; + let currentBlock: Partial | null = null; + let charIndex = 0; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + const fenceMatch = line.match(/^(```+|~~~+)(.*)$/); + + if (fenceMatch) { + const fence = fenceMatch[1].substring(0, 3) as '```' | '~~~'; + + if (!inCodeBlock) { + // Starting a code block + inCodeBlock = true; + currentBlock = { + start: 
charIndex, + fence, + infoString: fenceMatch[2].trim() || undefined, + }; + } else if (currentBlock && line.startsWith(currentBlock.fence)) { + // Ending a code block + currentBlock.end = charIndex + line.length; + codeBlocks.push(currentBlock as CodeBlockToken); + inCodeBlock = false; + currentBlock = null; + } + } + + charIndex += line.length + 1; // +1 for newline + } + + // Handle unclosed code block + if (currentBlock && inCodeBlock) { + logger.warn( + 'Unclosed code block detected, treating remaining content as plain text', + ); + } + } + + /** + * Recursively split a segment into smaller segments + */ + private recursivelySplit( + segment: Segment, + markdown: string, + tokens: Tokens, + ): Segment[] { + const segmentText = markdown.slice(segment.start, segment.end); + + // Base case: segment is within size limit + if (segmentText.length <= this.options.maxChars) { + return [segment]; + } + + // Try to split by headers + const headerSplits = this.splitByHeaders(segment, markdown, tokens); + if (headerSplits.length > 1) { + return headerSplits.flatMap((s) => + this.recursivelySplit(s, markdown, tokens), + ); + } + + // Try to split by paragraphs + const paragraphSplits = this.splitByParagraphs( + segment, + markdown, + tokens.codeBlocks, + ); + if (paragraphSplits.length > 1) { + return paragraphSplits.flatMap((s) => + this.recursivelySplit(s, markdown, tokens), + ); + } + + // Try to split by lines + const lineSplits = this.splitByLines(segment, markdown, tokens.codeBlocks); + if (lineSplits.length > 1) { + return lineSplits.flatMap((s) => + this.recursivelySplit(s, markdown, tokens), + ); + } + + // Cannot split further - return as is (may exceed maxChars) + if (segmentText.length > this.options.maxChars) { + // Check if it's a single code block + const isCodeBlock = tokens.codeBlocks.some( + (block) => block.start <= segment.start && block.end >= segment.end, + ); + if (isCodeBlock) { + logger.warn( + `Code block exceeds maxChars (${segmentText.length} > ${this.options.maxChars})`, + ); + } else { + logger.warn( + `Segment exceeds maxChars and cannot be split further (${segmentText.length} > ${this.options.maxChars})`, + ); + } + } + + return [segment]; + } + + /** + * Try to split segment by headers + */ + private splitByHeaders( + segment: Segment, + markdown: string, + tokens: Tokens, + ): Segment[] { + // Find headers within this segment that are configured split levels + const segmentHeaders = tokens.headers.filter( + (h) => + h.start >= segment.start && + h.end <= segment.end && + this.options.headerLevels.includes(h.level as 1 | 2), + ); + + if (segmentHeaders.length === 0) { + return [segment]; + } + + // Sort by position + segmentHeaders.sort((a, b) => a.start - b.start); + + const segments: Segment[] = []; + + // Handle content before first header + if (segmentHeaders[0].start > segment.start) { + segments.push({ start: segment.start, end: segmentHeaders[0].start }); + } + + // Process each header + for (let i = 0; i < segmentHeaders.length; i++) { + const header = segmentHeaders[i]; + const nextHeader = + i + 1 < segmentHeaders.length ? segmentHeaders[i + 1] : null; + + // Determine where this header's section ends + const sectionEnd = nextHeader ? 
nextHeader.start : segment.end; + + // Create segment starting from this header + segments.push({ start: header.start, end: sectionEnd }); + } + + // Validate: ensure complete coverage with no gaps or overlaps + if (segments.length > 0) { + // Check first segment starts at segment beginning + if (segments[0].start !== segment.start) { + logger.error( + `First segment doesn't start at segment beginning: ${segments[0].start} vs ${segment.start}`, + ); + } + + // Check last segment ends at segment end + if (segments[segments.length - 1].end !== segment.end) { + logger.error( + `Last segment doesn't end at segment end: ${segments[segments.length - 1].end} vs ${segment.end}`, + ); + } + + // Check for gaps or overlaps between consecutive segments + for (let i = 1; i < segments.length; i++) { + if (segments[i].start !== segments[i - 1].end) { + logger.error( + `Gap or overlap detected between segments: ${segments[i - 1].end} to ${segments[i].start}`, + ); + } + } + } + + return segments.length > 1 ? segments : [segment]; + } + + /** + * Try to split segment by paragraphs (double newlines) + */ + private splitByParagraphs( + segment: Segment, + markdown: string, + codeBlocks: CodeBlockToken[], + ): Segment[] { + const segmentText = markdown.slice(segment.start, segment.end); + const segments: Segment[] = []; + + // Find paragraph boundaries (double newlines) + const paragraphRegex = /\n\n+/g; + let currentStart = 0; + let match: RegExpExecArray | null; + const splitPoints: number[] = []; + + // Collect all valid split points + while ((match = paragraphRegex.exec(segmentText)) !== null) { + const splitPoint = segment.start + match.index + match[0].length; + // Check if split point is inside a code block + if (!this.isInsideCodeBlock(splitPoint, codeBlocks)) { + splitPoints.push(match.index + match[0].length); + } + } + + // Create segments based on split points + for (const splitPoint of splitPoints) { + segments.push({ + start: segment.start + currentStart, + end: segment.start + splitPoint, + }); + currentStart = splitPoint; + } + + // Add final segment if there's remaining content + if (currentStart < segmentText.length) { + segments.push({ + start: segment.start + currentStart, + end: segment.end, + }); + } + + return segments.length > 1 ? segments : [segment]; + } + + /** + * Try to split segment by lines + */ + private splitByLines( + segment: Segment, + markdown: string, + codeBlocks: CodeBlockToken[], + ): Segment[] { + const segmentText = markdown.slice(segment.start, segment.end); + const lines = segmentText.split('\n'); + const segments: Segment[] = []; + + let currentStart = segment.start; + let currentLength = 0; + let lineStart = segment.start; + + for (let i = 0; i < lines.length; i++) { + const lineLength = lines[i].length + 1; // +1 for newline + + if ( + currentLength + lineLength > this.options.maxChars && + currentLength > 0 + ) { + // Check if we can split here + if (!this.isInsideCodeBlock(lineStart, codeBlocks)) { + segments.push({ + start: currentStart, + end: lineStart, + }); + currentStart = lineStart; + currentLength = lineLength; + } else { + currentLength += lineLength; + } + } else { + currentLength += lineLength; + } + + lineStart += lineLength; + } + + // Add final segment + if (currentStart < segment.end) { + segments.push({ + start: currentStart, + end: segment.end, + }); + } + + return segments.length > 1 ? 
segments : [segment]; + } + + /** + * Check if a position is inside a code block + */ + private isInsideCodeBlock( + position: number, + codeBlocks: CodeBlockToken[], + ): boolean { + return codeBlocks.some( + (block) => position >= block.start && position < block.end, + ); + } + + /** + * Merge segments that are too small with adjacent segments + */ + private mergeSmallSegments( + segments: Segment[], + markdown: string, + codeBlocks: CodeBlockToken[], + ): Segment[] { + if (segments.length <= 1) return segments; + + const mergedSegments: Segment[] = []; + let currentSegment: Segment | null = null; + + for (let i = 0; i < segments.length; i++) { + const segment = segments[i]; + const segmentLength = segment.end - segment.start; + const isLastSegment = i === segments.length - 1; + + if (currentSegment === null) { + currentSegment = { ...segment }; + } else { + const currentLength = currentSegment.end - currentSegment.start; + const combinedLength = + currentSegment.end - currentSegment.start + segmentLength; + + // Determine if we should merge + const shouldMerge = + // Either segment is too small + ((segmentLength < this.options.minChars || + currentLength < this.options.minChars) && + // And merging won't exceed maxChars + combinedLength <= this.options.maxChars) || + // OR this is the last segment and it's too small + (isLastSegment && segmentLength < this.options.minChars); + + if (shouldMerge) { + // Merge by extending current segment + currentSegment.end = segment.end; + } else { + // Don't merge - push current and start new + mergedSegments.push(currentSegment); + currentSegment = { ...segment }; + } + } + } + + // Don't forget the last segment + if (currentSegment !== null) { + // Special handling for final segment if it's still too small + const currentLength = currentSegment.end - currentSegment.start; + if (currentLength < this.options.minChars && mergedSegments.length > 0) { + // Try to merge with previous segment + const lastMerged = mergedSegments[mergedSegments.length - 1]; + const combinedLength = + lastMerged.end - lastMerged.start + currentLength; + + if (combinedLength <= this.options.maxChars * 1.5) { + // Allow some flexibility for the final merge to avoid tiny final chunks + lastMerged.end = currentSegment.end; + } else { + // Can't merge without significantly exceeding limits + mergedSegments.push(currentSegment); + } + } else { + mergedSegments.push(currentSegment); + } + } + + // Final pass: ensure no segment ends in the middle of a code block + const finalSegments: Segment[] = []; + for (const segment of mergedSegments) { + let adjustedEnd = segment.end; + + // Check if segment end is inside a code block + for (const block of codeBlocks) { + if (segment.end > block.start && segment.end < block.end) { + // Extend to include the entire code block + adjustedEnd = block.end; + break; + } + } + + finalSegments.push({ + start: segment.start, + end: adjustedEnd, + }); + } + + return finalSegments; + } + + /** + * Assemble chunks with overlap handling + */ + private assembleChunksWithOverlap( + segments: Segment[], + markdown: string, + codeBlocks: CodeBlockToken[], + ): Array<{ + content: string; + start: number; + end: number; + overlapStart?: number; + }> { + if (segments.length === 0) return []; + + const chunks: Array<{ + content: string; + start: number; + end: number; + overlapStart?: number; + }> = []; + + for (let i = 0; i < segments.length; i++) { + const segment = segments[i]; + let content = markdown.slice(segment.start, segment.end); + let chunkStart = 
segment.start; + + // For chunks after the first, prepend overlap from previous segment + if (i > 0 && this.options.overlap > 0) { + const prevSegment = segments[i - 1]; + const prevContent = markdown.slice(prevSegment.start, prevSegment.end); + + // Calculate how much overlap to take from the previous segment + const overlapLength = Math.min( + this.options.overlap, + prevContent.length, + ); + let overlapStart = prevContent.length - overlapLength; + + // Check if the overlap would start in the middle of a code block + const overlapAbsoluteStart = prevSegment.start + overlapStart; + for (const block of codeBlocks) { + if ( + overlapAbsoluteStart > block.start && + overlapAbsoluteStart < block.end + ) { + // Overlap would start inside a code block + if (block.end <= prevSegment.end) { + // The code block ends within the previous segment + // Start overlap after the code block to avoid duplication + const blockEndInSegment = block.end - prevSegment.start; + if (blockEndInSegment < prevContent.length) { + overlapStart = blockEndInSegment; + } + } + break; + } + } + + // Extract overlap text from the adjusted position + const overlapText = prevContent.slice(overlapStart); + + // Prepend overlap to current content + content = overlapText + content; + + // Track where the actual content starts (including overlap) + chunkStart = prevSegment.start + overlapStart; + } + + chunks.push({ + content: this.options.trim ? content.trim() : content, + start: chunkStart, // Now reflects the actual start including overlap + end: segment.end, + overlapStart: i > 0 ? segment.start : undefined, // Original segment start for reference + }); + } + + return chunks; + } + + /** + * Attach metadata to chunks + */ + private attachMetadata( + rawChunks: Array<{ content: string; start: number; end: number }>, + markdown: string, + headers: HeaderToken[], + ): Chunk[] { + const chunks: Chunk[] = []; + const titleCounts = new Map(); + + for (const rawChunk of rawChunks) { + // Find the last header before or within this chunk that's in our configured levels + let title = 'ROOT'; + let headerPath: string[] = []; + + // Build full header path from all headers up to the end of this chunk + const allHeadersBeforeEnd = headers.filter((h) => h.start < rawChunk.end); + const headerStack: { level: number; text: string }[] = []; + + for (const header of allHeadersBeforeEnd) { + // Pop headers from stack that are same or lower level + while ( + headerStack.length > 0 && + headerStack[headerStack.length - 1].level >= header.level + ) { + headerStack.pop(); + } + headerStack.push({ level: header.level, text: header.text }); + } + + headerPath = headerStack.map((h) => h.text); + + // Find title from configured levels - check headers within the chunk first + const headersInChunk = headers.filter( + (h) => + h.start >= rawChunk.start && + h.start < rawChunk.end && + this.options.headerLevels.includes(h.level as 1 | 2), + ); + + if (headersInChunk.length > 0) { + // Use the first configured header within the chunk + title = headersInChunk[0].text; + } else { + // Otherwise, use the last configured header before the chunk + for (let i = headerStack.length - 1; i >= 0; i--) { + if ( + this.options.headerLevels.includes(headerStack[i].level as 1 | 2) + ) { + title = headerStack[i].text; + break; + } + } + } + + // Track chunk numbers per title (0-based) + const count = titleCounts.get(title) || 0; + titleCounts.set(title, count + 1); + + // Generate unique ID using 0-based numbering + const slug = this.slugify(title); + const uniqueId = 
this.options.idPrefix + ? `${this.options.idPrefix}-${slug}-${count}` + : `${slug}-${count}`; + + chunks.push({ + content: rawChunk.content, + meta: { + title, + chunkNumber: count, + uniqueId, + startChar: rawChunk.start, + endChar: rawChunk.end, + headerPath, + }, + }); + } + + return chunks; + } + + /** + * Convert a string to a slug + */ + private slugify(text: string): string { + return text + .toLowerCase() + .replace(/[^\w\s-]/g, '') // Remove non-word characters + .replace(/\s+/g, '-') // Replace spaces with hyphens + .replace(/-+/g, '-') // Replace multiple hyphens with single + .replace(/^-+|-+$/g, ''); // Remove leading/trailing hyphens + } +} + +// Export the main function as well for convenience +export function splitMarkdownToChunks( + markdown: string, + opts?: SplitOptions, +): Chunk[] { + const splitter = new RecursiveMarkdownSplitter(opts); + return splitter.splitMarkdownToChunks(markdown); +} diff --git a/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.finalChunk.test.ts b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.finalChunk.test.ts new file mode 100644 index 00000000..c4249322 --- /dev/null +++ b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.finalChunk.test.ts @@ -0,0 +1,169 @@ +import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter - Final chunk handling', () => { + it('should deterministically handle final tiny chunks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 50, + overlap: 10, + headerLevels: [1, 2], + trim: true, + }); + + // Text that will create a tiny final chunk + const text = `# Section One +This is the first section with enough content to meet the minimum character requirement. + +# Section Two +This is the second section with enough content to meet the minimum character requirement. + +# Section Three +Tiny bit.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Debug output + console.log( + 'Chunks:', + chunks.map((c) => ({ + title: c.meta.title, + length: c.content.length, + preview: c.content.substring(0, 30).replace(/\n/g, '\\n'), + })), + ); + + // The final tiny chunk should be merged with the previous one + const lastChunk = chunks[chunks.length - 1]; + + // Verify the tiny content was handled appropriately + const hasTinyContent = chunks.some((c) => c.content.includes('Tiny bit')); + expect(hasTinyContent).toBe(true); + + // The tiny section should not be on its own + const tinyChunk = chunks.find((c) => c.meta.title === 'Section Three'); + if (tinyChunk) { + expect(tinyChunk.content.length).toBeGreaterThanOrEqual(50); // Should meet minChars + } + }); + + it('should handle multiple tiny segments at the end', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 40, + overlap: 0, + headerLevels: [1], + trim: true, + }); + + const text = `# Main Section +This is the main section with sufficient content to be a proper chunk. + +# Tiny 1 +Small. + +# Tiny 2 +Also small. 
+ +# Tiny 3 +Very small.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // All tiny sections should be merged together + expect(chunks.length).toBe(2); + + const lastChunk = chunks[chunks.length - 1]; + expect(lastChunk.content).toContain('Tiny 1'); + expect(lastChunk.content).toContain('Tiny 2'); + expect(lastChunk.content).toContain('Tiny 3'); + }); + + it('should not exceed maxChars significantly when merging final chunk', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 30, + overlap: 0, + headerLevels: [1], + trim: true, + }); + + const text = `# Section One +This section has exactly the right amount of content. + +# Section Two +This section also has exactly the right amount of content. + +# Tiny +End.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Check that tiny chunks are handled appropriately + const lastChunk = chunks[chunks.length - 1]; + + // If there's a tiny chunk, it should either be merged or meet minChars + if (lastChunk.meta.title === 'Tiny') { + expect(lastChunk.content.length).toBeGreaterThanOrEqual(30); + } + + // No chunk should be excessively large + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeLessThanOrEqual(75); // 1.5x maxChars + }); + }); + + it('should handle edge case where all segments are tiny', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 50, + overlap: 0, + headerLevels: [1], + trim: true, + }); + + const text = `# A +Short. + +# B +Brief. + +# C +Tiny.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // All should be merged into one chunk + expect(chunks.length).toBe(1); + expect(chunks[0].content).toContain('# A'); + expect(chunks[0].content).toContain('# B'); + expect(chunks[0].content).toContain('# C'); + }); + + it('should preserve code blocks when merging final chunks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 50, + overlap: 0, + preserveCodeBlocks: true, + trim: true, + }); + + const text = `# Section One +Content before code block. + +\`\`\`python +def hello(): + print("Hello") +\`\`\` + +# Tiny Section +End.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Code block should be preserved intact + const codeChunk = chunks.find((c) => c.content.includes('def hello()')); + expect(codeChunk).toBeDefined(); + expect(codeChunk!.content).toMatch(/```python[\s\S]*?```/); + }); +}); diff --git a/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.minChars.test.ts b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.minChars.test.ts new file mode 100644 index 00000000..a5b6578a --- /dev/null +++ b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.minChars.test.ts @@ -0,0 +1,135 @@ +import { RecursiveMarkdownSplitter } from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter - minChars functionality', () => { + it('should merge segments smaller than minChars', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 200, + minChars: 100, + overlap: 0, + headerLevels: [1, 2], + }); + + const text = `# Section 1 +Short content. + +# Section 2 +Also short. 
+ +# Section 3 +This is a bit longer content that might be closer to the minimum.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // With minChars=100, the short sections should be merged + expect(chunks.length).toBeLessThan(3); + + // All chunks should be at least minChars (except possibly the last one) + chunks.forEach((chunk, index) => { + if (index < chunks.length - 1) { + expect(chunk.content.length).toBeGreaterThanOrEqual(100); + } + }); + }); + + it('should not merge if it would exceed maxChars', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 50, + overlap: 0, + headerLevels: [1, 2], + }); + + const text = `# Section 1 +This section has exactly enough content to be close to the max limit when combined with another section. It's quite long. + +# Section 2 +This section is also substantial with a good amount of content that would exceed limits.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Should not merge if combined length would exceed maxChars significantly + // With the 1.5x flexibility for final chunks, they might merge if total < 150 chars + // Let's verify chunks are reasonably sized + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeLessThanOrEqual(150); // 1.5x maxChars + }); + + // If chunks are merged, ensure it's within reasonable bounds + if (chunks.length === 1) { + expect(chunks[0].content.length).toBeLessThanOrEqual(150); + } + }); + + it('should handle the problematic formatting example', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 500, + minChars: 200, + overlap: 0, + headerLevels: [1, 2], + preserveCodeBlocks: true, + }); + + const text = `## Formatting and Debugging + +The \`core::fmt\` module provides functionality for formatting values. + +### Debug Trait + +The \`Debug\` trait is used for debug formatting. + +\`\`\`cairo +pub trait Debug +\`\`\` + +#### \`fmt\` Function + +The \`fmt\` function within the \`Debug\` trait is responsible for formatting. + +### Display Trait + +The \`Display\` trait is used for standard formatting.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Should create fewer, more substantial chunks + expect(chunks.length).toBeLessThanOrEqual(2); + + // Each chunk should be meaningful in size + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeGreaterThan(100); + }); + }); + + it('should respect code block boundaries when merging', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 300, + minChars: 150, + overlap: 0, + headerLevels: [1, 2], + preserveCodeBlocks: true, + }); + + const text = `# Section 1 +Short intro. 
+ +\`\`\`cairo +// This is a long code block +fn example() -> felt252 { + let x = 42; + let y = x * 2; + return y; +} +\`\`\` + +# Section 2 +Another short section.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Verify code blocks are not split + chunks.forEach((chunk) => { + const codeBlockMatches = chunk.content.match(/```/g) || []; + expect(codeBlockMatches.length % 2).toBe(0); + }); + }); +}); diff --git a/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts new file mode 100644 index 00000000..b8be51f8 --- /dev/null +++ b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.reconstruction.test.ts @@ -0,0 +1,433 @@ +import { + RecursiveMarkdownSplitter, + SplitOptions, +} from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter - Reconstruction Tests', () => { + /** + * These tests verify that when we split a document and then concatenate + * the chunks (excluding overlaps), we get back the original content. + * This ensures our splitting logic doesn't lose or duplicate content. + */ + + function reconstructFromChunks( + chunks: Array<{ + content: string; + start: number; + end: number; + overlapStart?: number; + }>, + original: string, + ): string { + if (chunks.length === 0) return ''; + + let result = ''; + let lastEnd = 0; + + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + + if (i === 0) { + // First chunk - use entire content + result = original.substring(chunk.start, chunk.end); + lastEnd = chunk.end; + } else if (chunk.overlapStart !== undefined) { + // Subsequent chunks with overlap - append only the non-overlapped portion + result += original.substring(chunk.overlapStart, chunk.end); + lastEnd = chunk.end; + } else { + // No overlap tracking - shouldn't happen but handle gracefully + result += original.substring(lastEnd, chunk.end); + lastEnd = chunk.end; + } + } + + return result; + } + + describe('Header splitting reconstruction', () => { + it('should reconstruct document with single header', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + headerLevels: [1], + trim: false, // Important for exact reconstruction + }); + + const original = `# Header One +This is the first section with some content. + +More content in the first section.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + // Extract the raw chunks before metadata attachment + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with multiple headers at same level', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 15, + headerLevels: [1], + trim: false, + }); + + const original = `# First Section +Content for the first section goes here. + +# Second Section +Content for the second section goes here. 
+ +# Third Section +Content for the third section goes here.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + // Extract raw chunks + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with nested headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 80, + minChars: 0, + overlap: 20, + headerLevels: [1, 2], + trim: false, + }); + + const original = `# Main Section +Introduction to the main section. + +## Subsection 1 +Details about subsection 1. + +## Subsection 2 +Details about subsection 2. + +# Another Main Section +Content for another main section.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with headers at start', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 40, + minChars: 0, + overlap: 10, + headerLevels: [1], + trim: false, + }); + + const original = `# Header at Start +Content immediately after header. + +More content here.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with content before first header', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + headerLevels: [1], + trim: false, + }); + + const original = `Some preamble text before any headers. + +# First Header +Content under first header. 
+ +# Second Header +Content under second header.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with consecutive headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 10, + headerLevels: [1, 2], + trim: false, + }); + + const original = `# Main Header +## Subheader 1 +## Subheader 2 +Content after headers. + +## Subheader 3 +More content.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + }); + + describe('Code block reconstruction', () => { + it('should reconstruct document with code blocks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 60, + minChars: 0, + overlap: 15, + preserveCodeBlocks: true, + trim: false, + }); + + const original = `# Section with Code +Some text before code. 
+ +\`\`\`python +def hello(): + print("Hello, World!") +\`\`\` + +Text after code block.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + + it('should reconstruct document with large code block', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + preserveCodeBlocks: true, + trim: false, + }); + + const original = `# Code Example +Here's a large code block: + +\`\`\`javascript +// This is a large code block that exceeds maxChars +function complexFunction() { + const result = performCalculation(); + return result; +} +\`\`\` + +Text after the code.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + }); + + describe('Complex document reconstruction', () => { + it('should reconstruct a complex markdown document', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 20, + overlap: 25, + headerLevels: [1, 2], + preserveCodeBlocks: true, + trim: false, + }); + + const original = `# Cairo Programming Guide + +Welcome to the Cairo programming guide. This document covers the basics. + +## Getting Started + +To get started with Cairo, you need to understand the fundamentals. + +### Installation + +First, install the Cairo compiler: + +\`\`\`bash +curl -L https://github.com/starkware-libs/cairo/releases/download/v2.0.0/cairo-lang-2.0.0.tar.gz | tar xz +cd cairo-lang-2.0.0 +./install.sh +\`\`\` + +### Your First Program + +Here's a simple Cairo program: + +\`\`\`cairo +fn main() { + let x = 1; + let y = 2; + assert(x + y == 3, 'Math is broken!'); +} +\`\`\` + +## Advanced Topics + +Once you understand the basics, you can explore advanced features. + +### Memory Management + +Cairo uses a unique memory model based on field elements. + +### Smart Contracts + +You can write smart contracts in Cairo for StarkNet. 
+ +## Conclusion + +Cairo is a powerful language for writing provable programs.`; + + const chunks = splitter.splitMarkdownToChunks(original); + + const rawChunks = (splitter as any).assembleChunksWithOverlap( + (splitter as any).mergeSmallSegments( + (splitter as any).recursivelySplit( + { start: 0, end: original.length }, + original, + (splitter as any).tokenize(original), + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ), + original, + (splitter as any).tokenize(original).codeBlocks, + ); + + const reconstructed = reconstructFromChunks(rawChunks, original); + expect(reconstructed).toBe(original); + }); + }); +}); diff --git a/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts new file mode 100644 index 00000000..67d93e68 --- /dev/null +++ b/packages/ingester/src/utils/__tests__/RecursiveMarkdownSplitter.test.ts @@ -0,0 +1,544 @@ +import { + RecursiveMarkdownSplitter, + SplitOptions, + Chunk, +} from '../RecursiveMarkdownSplitter'; + +describe('RecursiveMarkdownSplitter', () => { + describe('Basic functionality', () => { + it('should handle empty input', () => { + const splitter = new RecursiveMarkdownSplitter(); + expect(splitter.splitMarkdownToChunks('')).toEqual([]); + expect(splitter.splitMarkdownToChunks(' ')).toEqual([]); + }); + + it('should handle single small chunk', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + const text = 'This is a small chunk of text.'; + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].content).toBe(text); + expect(chunks[0].meta.title).toBe('ROOT'); + expect(chunks[0].meta.chunkNumber).toBe(0); + }); + + it('should throw error when overlap >= maxChars', () => { + expect(() => { + new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 100, + }); + }).toThrow('Overlap (100) must be less than maxChars (100)'); + }); + }); + + describe('Header detection and splitting', () => { + it('should split on H1 headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + headerLevels: [1], + }); + + const text = `# First Section +This is the first section content. + +# Second Section +This is the second section content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Headers split the content, so we should have chunks for each section + const firstSectionChunk = chunks.find( + (c) => c.meta.title === 'First Section', + ); + const secondSectionChunk = chunks.find( + (c) => c.meta.title === 'Second Section', + ); + + expect(firstSectionChunk).toBeDefined(); + expect(secondSectionChunk).toBeDefined(); + }); + + it('should split on both H1 and H2 headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + headerLevels: [1, 2], + }); + + const text = `# Main Section +Some intro text. + +## Subsection 1 +First subsection. 
+ +## Subsection 2 +Second subsection.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThanOrEqual(3); + expect(chunks[0].meta.title).toBe('Main Section'); + expect(chunks.find((c) => c.meta.title === 'Subsection 1')).toBeDefined(); + expect(chunks.find((c) => c.meta.title === 'Subsection 2')).toBeDefined(); + }); + + it('should ignore headers inside code blocks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 200, + minChars: 0, + overlap: 0, + }); + + const text = `# Real Header +Some content. + +\`\`\`markdown +# This is not a real header +It's inside a code block +\`\`\` + +More content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].meta.title).toBe('Real Header'); + expect(chunks[0].content).toContain('# This is not a real header'); + }); + + it('should handle headers with trailing hashes', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + const text = '## Header with trailing hashes ##\nContent here.'; + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks[0].meta.title).toBe('Header with trailing hashes'); + }); + }); + + describe('Code block handling', () => { + it('should not split inside code blocks', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + preserveCodeBlocks: true, + }); + + const text = `Some text before. + +\`\`\`python +def long_function(): + # This is a long code block that exceeds maxChars + print("This should not be split") + return "Even though it's longer than 50 chars" +\`\`\` + +Some text after.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Verify code block is kept intact + const codeBlockChunk = chunks.find((c) => + c.content.includes('def long_function()'), + ); + expect(codeBlockChunk).toBeDefined(); + expect(codeBlockChunk!.content).toContain('```python'); + expect(codeBlockChunk!.content).toContain('```'); + }); + + it('should handle tilde code fences', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 200, + minChars: 0, + overlap: 20, + }); + + const text = `Text before. + +~~~javascript +const code = "This uses tilde fences"; +~~~ + +Text after.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].content).toContain('~~~javascript'); + expect(chunks[0].content).toContain( + 'const code = "This uses tilde fences"', + ); + }); + + it('should handle nested code fences correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 300, + minChars: 0, + overlap: 30, + }); + + const text = `\`\`\`markdown +Example with nested fences: +\`\`\`python +print("nested") +\`\`\` +End of example +\`\`\``; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].content).toContain('Example with nested fences'); + }); + }); + + describe('Overlap handling', () => { + it('should apply backward overlap correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 10, + headerLevels: [1], + }); + + const text = `# Section 1 +This is the first section with some content. 

# Section 2
This is the second section with more content.`;

      const chunks = splitter.splitMarkdownToChunks(text);

      expect(chunks.length).toBeGreaterThanOrEqual(2);

      // Check that second chunk contains overlap from first
      if (chunks.length >= 2) {
        // Overlapped text is carried at the beginning of the second chunk
        const overlap = 10; // mirrors the overlap passed to the splitter

        // Calculate expected overlap position
        const firstChunkEndIndex = chunks[0].meta.endChar;
        const secondChunkStartIndex = chunks[1].meta.startChar;

        // The second chunk should start at most `overlap` characters
        // before the end of the first chunk
        expect(firstChunkEndIndex - secondChunkStartIndex).toBeLessThanOrEqual(
          overlap,
        );
      }
    });

    it('should extend overlap to include entire code block', () => {
      const splitter = new RecursiveMarkdownSplitter({
        maxChars: 100,
        minChars: 0,
        overlap: 20,
        preserveCodeBlocks: true,
      });

      const text = `First part of content here.

\`\`\`
code block content
\`\`\`

Second part starts here and continues with more text.`;

      const chunks = splitter.splitMarkdownToChunks(text);

      // If there are multiple chunks, verify code block handling
      if (chunks.length > 1) {
        const codeBlockInFirst = chunks[0].content.includes('```');
        const codeBlockInSecond = chunks[1].content.includes('```');

        // Code block should be complete in whichever chunk it appears
        if (codeBlockInFirst) {
          expect(chunks[0].content).toMatch(/```[\s\S]*?```/);
        }
        if (codeBlockInSecond) {
          expect(chunks[1].content).toMatch(/```[\s\S]*?```/);
        }
      }
    });
  });

  describe('Metadata generation', () => {
    it('should generate correct unique IDs', () => {
      const splitter = new RecursiveMarkdownSplitter({
        maxChars: 50,
        minChars: 0,
        overlap: 5,
        idPrefix: 'test',
      });

      const text = `# My Section
This is content for the first section

# My Section
This is content for the second section with the same title`;

      const chunks = splitter.splitMarkdownToChunks(text);

      // Find all chunks with title "My Section"
      const mySectionChunks = chunks.filter(
        (c) => c.meta.title === 'My Section',
      );

      // Should have at least 2 chunks with this title
      expect(mySectionChunks.length).toBeGreaterThanOrEqual(2);

      // Check that they have different unique IDs with incrementing numbers
      const uniqueIds = mySectionChunks.map((c) => c.meta.uniqueId);
      expect(uniqueIds).toContain('test-my-section-0');
      expect(uniqueIds).toContain('test-my-section-1');
    });

    it('should track header paths correctly', () => {
      const splitter = new RecursiveMarkdownSplitter({
        maxChars: 50,
        minChars: 0,
        overlap: 10,
      });

      const text = `# Chapter 1
Intro to chapter one with some text

## Section 1.1
Content in section one point one

### Subsection 1.1.1
More content in the subsection

## Section 1.2
Other content in section one point two`;

      const chunks = splitter.splitMarkdownToChunks(text);

      // This should create multiple chunks due to the smaller maxChars
      expect(chunks.length).toBeGreaterThan(1);

      // Find chunks based on their unique content
      const section11Chunk = chunks.find((c) =>
        c.content.includes('section one point one'),
      );
      const subsectionChunk = chunks.find((c) =>
        c.content.includes('More content in the subsection'),
      );
      const section12Chunk = chunks.find((c) =>
        c.content.includes('section one point two'),
      );

      // Check that chunks have appropriate header paths
      if (section11Chunk) {
        expect(section11Chunk.meta.headerPath).toContain('Chapter 1');
+ // Title should be Section 1.1 since that's the header for this content + expect(section11Chunk.meta.title).toBe('Section 1.1'); + } + + if (subsectionChunk) { + expect(subsectionChunk.meta.headerPath).toContain('Chapter 1'); + // The subsection content should have appropriate headers in path + expect( + subsectionChunk.meta.headerPath.some( + (h) => h === 'Section 1.1' || h === 'Subsection 1.1.1', + ), + ).toBe(true); + } + + if (section12Chunk) { + expect(section12Chunk.meta.headerPath).toContain('Chapter 1'); + expect(section12Chunk.meta.title).toBe('Section 1.2'); + } + }); + + it('should handle chunk numbering per title', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 30, + minChars: 0, + overlap: 0, + }); + + const text = `# Long Section +This is a very long section that will definitely need to be split into multiple chunks because it exceeds our maximum character limit.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + const longSectionChunks = chunks.filter( + (c) => c.meta.title === 'Long Section', + ); + expect(longSectionChunks.length).toBeGreaterThan(1); + + // Check sequential numbering + longSectionChunks.forEach((chunk, index) => { + expect(chunk.meta.chunkNumber).toBe(index); + }); + }); + + it('should slugify titles correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 2048, + minChars: 0, + overlap: 256, + }); + + const text = `# Title with Special@#$ Characters!!! +Content`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks[0].meta.uniqueId).toBe('title-with-special-characters-0'); + }); + }); + + describe('Splitting strategies', () => { + it('should fall back to paragraph splitting', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + }); + + const text = `First paragraph with some content here. + +Second paragraph with more content here. + +Third paragraph with even more content.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThanOrEqual(3); + }); + + it('should fall back to line splitting for very long lines', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + }); + + // Create multiple lines that are each long but don't have paragraph breaks + const longLine = + 'Line one that is quite long and exceeds our limit\n' + + 'Line two that is also very long and exceeds limit\n' + + 'Line three with even more text to ensure splitting'; + + const chunks = splitter.splitMarkdownToChunks(longLine); + + expect(chunks.length).toBeGreaterThan(1); + }); + }); + + describe('Edge cases', () => { + it('should handle documents with no headers', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = + 'Just plain text without any headers. 
' + + 'This should still be chunked properly.'; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.every((c) => c.meta.title === 'ROOT')).toBe(true); + }); + + it('should handle consecutive headers with no content', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = `# Header 1 +# Header 2 +# Header 3 +Some content here.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + // Should produce valid chunks even with empty sections + expect(chunks.length).toBeGreaterThan(0); + chunks.forEach((chunk) => { + expect(chunk.content.length).toBeGreaterThan(0); + }); + }); + + it('should handle Windows line endings', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = '# Header\r\nContent with\r\nWindows line endings.'; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks).toHaveLength(1); + expect(chunks[0].meta.title).toBe('Header'); + expect(chunks[0].content).not.toContain('\r'); + }); + + it('should handle unclosed code blocks gracefully', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 100, + minChars: 0, + overlap: 10, + }); + + const text = `# Section +Some content. + +\`\`\`python +This code block is never closed +and continues to the end`; + + const chunks = splitter.splitMarkdownToChunks(text); + + expect(chunks.length).toBeGreaterThan(0); + // Should still produce valid output + }); + }); + + describe('Character offset tracking', () => { + it('should track start and end character positions correctly', () => { + const splitter = new RecursiveMarkdownSplitter({ + maxChars: 50, + minChars: 0, + overlap: 0, + }); + + const text = `# Section 1 +Short content. + +# Section 2 +More content here.`; + + const chunks = splitter.splitMarkdownToChunks(text); + + chunks.forEach((chunk) => { + expect(chunk.meta.startChar).toBeGreaterThanOrEqual(0); + expect(chunk.meta.endChar).toBeGreaterThan(chunk.meta.startChar); + expect( + chunk.meta.endChar - chunk.meta.startChar, + ).toBeGreaterThanOrEqual(chunk.content.length); + }); + }); + }); +}); diff --git a/python/src/cairo_coder/optimizers/mcp_optimizer.py b/python/src/cairo_coder/optimizers/mcp_optimizer.py index bb4ed581..28e9dc57 100644 --- a/python/src/cairo_coder/optimizers/mcp_optimizer.py +++ b/python/src/cairo_coder/optimizers/mcp_optimizer.py @@ -158,11 +158,12 @@ def forward(self, example, pred, trace=None): result = parallel(batches) resources_notes = [pred.resource_note for pred in result] - [pred.reasoning for pred in result] + reasonings = [pred.reasoning for pred in result] score = sum(resources_notes) / len(resources_notes) if len(resources_notes) != 0 else 0 - # for (note, reason) in zip(resources_notes, reasonings, strict=False): - # print(f"Note: {note}, reason: {reason}") + print(example.query) + for (note, reason) in zip(resources_notes, reasonings, strict=False): + print(f"Note: {note}, reason: {reason}") return score if trace is None else score >= self.threshold return (RetrievalF1,)
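
Note: the `RetrievalF1.forward` hunk above averages the per-document `resource_note` scores into a single metric value and, when DSPy supplies a `trace` (i.e. during an optimizer run), compares that average against `self.threshold` to yield a boolean verdict. A minimal sketch of that aggregation, with a hypothetical `Judgment` container standing in for the DSPy prediction objects:

```python
from dataclasses import dataclass


@dataclass
class Judgment:
    resource_note: float  # per-document relevance score produced by the judge
    reasoning: str        # judge's explanation, printed for debugging above


def aggregate(judgments: list[Judgment], threshold: float, trace=None):
    notes = [j.resource_note for j in judgments]
    # An empty result set scores 0 instead of raising ZeroDivisionError.
    score = sum(notes) / len(notes) if notes else 0
    # With a trace present, DSPy metrics must return a pass/fail boolean.
    return score if trace is None else score >= threshold
```

Guarding the empty case means a query that retrieved no scorable resources simply scores 0, matching the `len(resources_notes) != 0` check in the diff.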