From 2bc0e73fd215595f51257b1b2dcd32d7564fdcd2 Mon Sep 17 00:00:00 2001 From: fastsoab Date: Mon, 1 Jun 2026 20:00:29 +0200 Subject: [PATCH 1/5] =?UTF-8?q?feat:=20rewrite=20core=20engine=20=E2=80=94?= =?UTF-8?q?=20local-first=20embeddings,=20SQLite=20store,=20symbol-aware?= =?UTF-8?q?=20chunking,=20hybrid=20retrieval?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the file-averaged, OpenAI-only, append-only POC engine with a chunk-level, incremental, hybrid-search engine: - EmbeddingProvider abstraction (fastembed local default, OpenAI opt-in, fake for tests); embedding dim comes from the provider, not a hardcoded constant. - SQLite is the source of truth (files/chunks/vectors/FTS5); FAISS is a rebuildable cache. - Pluggable vector index: exact Flat, auto-switching to IVF past a scale threshold. - Symbol-aware chunking: Python via ast, JS/TS/Go/Rust/Java via tree-sitter, line-window fallback. - Incremental indexing with content hashing and delete-before-add (fixes the duplicate/stale vector bug from the old watchdog monitor). - Hybrid dense + BM25 retrieval fused with Reciprocal Rank Fusion. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- coderag/__init__.py | 29 ++- coderag/api.py | 155 ++++++++++++++ coderag/chunking/__init__.py | 46 +++++ coderag/chunking/base.py | 139 +++++++++++++ coderag/chunking/languages.py | 57 ++++++ coderag/chunking/python_ast.py | 47 +++++ coderag/chunking/treesitter.py | 148 ++++++++++++++ coderag/config.py | 184 ++++++++++++++--- coderag/embeddings.py | 80 -------- coderag/embeddings/__init__.py | 66 ++++++ coderag/embeddings/fake_provider.py | 46 +++++ coderag/embeddings/fastembed_provider.py | 68 +++++++ coderag/embeddings/openai_provider.py | 76 +++++++ coderag/index.py | 179 ---------------- coderag/indexer.py | 195 ++++++++++++++++++ coderag/llm.py | 70 +++++++ coderag/monitor.py | 104 ---------- coderag/retrieval/__init__.py | 1 + coderag/retrieval/fusion.py | 26 +++ coderag/retrieval/search.py | 80 ++++++++ coderag/search.py | 76 ------- coderag/store/__init__.py | 1 + coderag/store/schema.py | 69 +++++++ coderag/store/sqlite_store.py | 249 +++++++++++++++++++++++ coderag/store/vector_index.py | 183 +++++++++++++++++ coderag/types.py | 81 ++++++++ coderag/watch.py | 93 +++++++++ 27 files changed, 2079 insertions(+), 469 deletions(-) create mode 100644 coderag/api.py create mode 100644 coderag/chunking/__init__.py create mode 100644 coderag/chunking/base.py create mode 100644 coderag/chunking/languages.py create mode 100644 coderag/chunking/python_ast.py create mode 100644 coderag/chunking/treesitter.py delete mode 100644 coderag/embeddings.py create mode 100644 coderag/embeddings/__init__.py create mode 100644 coderag/embeddings/fake_provider.py create mode 100644 coderag/embeddings/fastembed_provider.py create mode 100644 coderag/embeddings/openai_provider.py delete mode 100644 coderag/index.py create mode 100644 coderag/indexer.py create mode 100644 coderag/llm.py delete mode 100644 coderag/monitor.py create mode 100644 coderag/retrieval/__init__.py create mode 100644 coderag/retrieval/fusion.py create mode 
def __getattr__(name: str) -> object:
    """Resolve lazily exported package attributes (module-level ``__getattr__``).

    Only ``CodeRAG`` is exported this way. It is imported on first access rather
    than at package import so that ``import coderag`` stays light (no
    faiss/fastembed at import).

    Raises:
        AttributeError: for any other attribute name.
    """
    if name != "CodeRAG":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from coderag.api import CodeRAG

    return CodeRAG
class CodeRAG:
    """High-level entry point for indexing and searching a codebase.

    The collaborators (embedding provider, SQLite store, vector index, indexer and
    searcher) are built lazily on first access, so constructing a ``CodeRAG`` is
    cheap. The object can be used as a context manager; leaving the ``with`` block
    calls :meth:`close`.
    """

    def __init__(self, config: Optional[Config] = None) -> None:
        """Store ``config`` (or build one from the environment when omitted)."""
        self.config = config or Config.from_env()
        self._provider: Optional["EmbeddingProvider"] = None
        self._store: Optional["SQLiteStore"] = None
        self._vectors: Optional["FaissVectorIndex"] = None
        self._indexer: Optional["Indexer"] = None
        self._searcher: Optional["HybridSearcher"] = None

    def __enter__(self) -> "CodeRAG":
        """Return ``self`` so the facade can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Release the store on exit; exceptions are never suppressed."""
        self.close()

    # --- lazily constructed collaborators ---

    @property
    def provider(self) -> "EmbeddingProvider":
        """The embedding provider selected by ``config``, created on first use."""
        if self._provider is None:
            from coderag.embeddings import get_provider

            self._provider = get_provider(self.config)
        return self._provider

    @property
    def store(self) -> "SQLiteStore":
        """The SQLite store, created (and bootstrapped) on first use."""
        if self._store is None:
            from coderag.store.sqlite_store import SQLiteStore

            self.config.store_dir.mkdir(parents=True, exist_ok=True)
            self._store = SQLiteStore(self.config.db_path)
            self._store.bootstrap(self.provider.dim, self.provider.model_id)
        return self._store

    @property
    def vectors(self) -> "FaissVectorIndex":
        """The vector index, opened on first use."""
        if self._vectors is None:
            from coderag.store.vector_index import FaissVectorIndex

            self._vectors = FaissVectorIndex.open(self.config, self.provider.dim)
            # FAISS is a rebuildable cache; reconcile with the source of truth on open.
            self._vectors.ensure_consistent(self.store)
        return self._vectors

    @property
    def indexer(self) -> "Indexer":
        """The indexer wired to the current provider, store and vector index."""
        if self._indexer is None:
            from coderag.indexer import Indexer

            self._indexer = Indexer(
                self.config, self.provider, self.store, self.vectors
            )
        return self._indexer

    @property
    def searcher(self) -> "HybridSearcher":
        """The hybrid searcher wired to the current provider, store and vector index."""
        if self._searcher is None:
            from coderag.retrieval.search import HybridSearcher

            self._searcher = HybridSearcher(
                self.config, self.provider, self.store, self.vectors
            )
        return self._searcher

    # --- public operations ---

    def index(
        self, path: Optional[Union[str, Path]] = None, *, full: bool = False
    ) -> IndexStats:
        """Incrementally index ``path`` (defaults to the configured watched dir).

        Only files whose content hash changed are re-embedded. Pass ``full=True`` to
        force a clean rebuild.
        """
        target = Path(path).expanduser() if path else self.config.watched_dir
        return self.indexer.index(target, full=full)

    def search(self, query: str, top_k: Optional[int] = None) -> List[SearchHit]:
        """Hybrid (dense + lexical) search over the indexed codebase.

        A falsy ``top_k`` (``None`` or ``0``) falls back to ``config.top_k``.
        """
        return self.searcher.search(query, top_k or self.config.top_k)

    def get_file(
        self,
        path: Union[str, Path],
        start_line: Optional[int] = None,
        end_line: Optional[int] = None,
    ) -> str:
        """Return the contents of an indexed file, optionally a 1-based line range.

        ``path`` is interpreted relative to ``config.watched_dir``.

        Raises:
            ValueError: if the resolved path lies outside the watched directory.
        """
        full = (self.config.watched_dir / Path(path)).resolve()
        root = self.config.watched_dir.resolve()
        # Both sides are resolved first, so ``..`` segments and symlinks cannot be
        # used to read files outside the indexed root.
        if root not in full.parents and full != root:
            raise ValueError(f"Path escapes the indexed root: {path}")
        text = full.read_text(encoding="utf-8", errors="replace")
        if start_line is None and end_line is None:
            return text
        lines = text.splitlines()
        lo = max(0, (start_line or 1) - 1)
        hi = min(len(lines), end_line or len(lines))
        return "\n".join(lines[lo:hi])

    def delete_path(self, path: Union[str, Path]) -> int:
        """Forget a file that was removed from disk. Returns chunks removed.

        Paths that do not resolve to a location under the watched directory are
        ignored and yield ``0``.
        """
        root = self.config.watched_dir.resolve()
        try:
            rel = Path(path).resolve().relative_to(root).as_posix()
        except ValueError:
            return 0
        removed = self.store.delete_file(rel)
        if removed:
            self.vectors.remove(removed)
            self.vectors.save()
        return len(removed)

    def status(self) -> dict:
        """Index statistics and provenance."""
        stats = self.store.stats()
        return {
            "provider": self.config.provider,
            "model": self.provider.model_id,
            "embedding_dim": self.provider.dim,
            "index_type": self.vectors.kind,
            "store_dir": str(self.config.store_dir),
            "watched_dir": str(self.config.watched_dir),
            "total_files": stats.total_files,
            "total_chunks": stats.total_chunks,
            "vectors": self.vectors.ntotal,
        }

    def close(self) -> None:
        """Close the store and drop every collaborator that was built around it.

        The indexer and searcher are constructed with a reference to the store, so
        they are discarded together with it; otherwise a later call would pair a
        freshly opened store with collaborators still holding the closed one.
        Calling ``close`` more than once is harmless.
        """
        if self._store is not None:
            self._store.close()
            self._store = None
        self._indexer = None
        self._searcher = None
def chunk_file(text: str, language: str, config: Config) -> List[Chunk]:
    """Split one source file into chunks, preferring symbol boundaries.

    Python goes through the ``ast`` extractor and the tree-sitter languages go
    through the tree-sitter extractor. Any other language, or any exception raised
    while extracting symbols, leaves the span list empty before the text is handed
    to ``base.build_chunks``. Blank or whitespace-only input yields no chunks.
    """
    if text.strip() == "":
        return []

    try:
        if language == languages.PYTHON:
            from coderag.chunking import python_ast

            found = python_ast.extract_spans(text)
        elif language in languages.TREE_SITTER_LANGUAGES:
            from coderag.chunking import treesitter

            found = treesitter.extract_spans(text, language)
        else:
            found = []
    except Exception as exc:  # SyntaxError, tree-sitter issues, etc.
        logger.debug(
            "Symbol extraction failed for %s (%s); using windows.", language, exc
        )
        found = []

    return base.build_chunks(text, language, found, config)
@dataclass
class SymbolSpan:
    """A named code region reported by a language extractor.

    ``start_line`` and ``end_line`` are 1-based and inclusive.
    """

    symbol: str
    kind: str  # "function" | "class" | "method"
    start_line: int
    end_line: int


def _window(
    lines: Sequence[str],
    offset: int,
    language: str,
    config: Config,
    symbol: Optional[str] = None,
    kind: str = "window",
) -> List[Chunk]:
    """Cut ``lines`` into sliding windows and return them as chunks.

    ``lines`` is a contiguous block whose first element is file line ``offset``
    (1-based). Windows are ``config.window_lines`` long and advance by
    ``window_lines - window_overlap`` (never less than one line). Windows that
    contain only whitespace are dropped.
    """
    size = config.window_lines
    stride = max(1, size - config.window_overlap)
    total = len(lines)

    out: List[Chunk] = []
    for first in range(0, total, stride):
        piece = lines[first : first + size]
        if "".join(piece).strip():  # skip whitespace-only windows
            out.append(
                Chunk(
                    text="\n".join(piece),
                    start_line=offset + first,
                    end_line=offset + first + len(piece) - 1,
                    language=language,
                    symbol=symbol,
                    kind=kind,
                )
            )
        # This window already reaches the end of the block: stop here.
        if first + size >= total:
            break
    return out


def _emit_span(
    lines: Sequence[str], span: SymbolSpan, language: str, config: Config
) -> List[Chunk]:
    """Turn one span into chunks.

    A span of at most ``config.max_chunk_lines`` lines becomes a single chunk;
    a longer one is windowed while keeping the span's symbol and kind. A span
    whose lines are all whitespace produces nothing.
    """
    body = lines[span.start_line - 1 : span.end_line]
    if not "".join(body).strip():
        return []

    if len(body) > config.max_chunk_lines:
        return _window(
            body, span.start_line, language, config, symbol=span.symbol, kind=span.kind
        )

    whole = Chunk(
        text="\n".join(body),
        start_line=span.start_line,
        end_line=span.end_line,
        language=language,
        symbol=span.symbol,
        kind=span.kind,
    )
    return [whole]


def build_chunks(
    text: str, language: str, spans: Sequence[SymbolSpan], config: Config
) -> List[Chunk]:
    """Build non-overlapping chunks from symbol spans plus windowed gaps.

    Every line is assigned to the smallest span that contains it. Each maximal run
    of consecutive lines with the same owner becomes one span chunk (or several
    windows if oversized), and runs owned by no span are covered by plain windows.
    The result is ordered by ``start_line``.
    """
    lines = text.split("\n")
    n = len(lines)
    if n == 0 or not text.strip():
        return []

    if not spans:
        return _window(lines, 1, language, config)

    # Paint spans from largest to smallest so nested (smaller) spans overwrite the
    # spans that enclose them. Index 0 of ``owner`` is unused: lines are 1-based.
    owner: List[Optional[int]] = [None] * (n + 1)
    by_size_desc = sorted(
        range(len(spans)),
        key=lambda i: -(spans[i].end_line - spans[i].start_line),
    )
    for idx in by_size_desc:
        span = spans[idx]
        first = max(1, span.start_line)
        last = min(n, span.end_line)
        for ln in range(first, last + 1):
            owner[ln] = idx

    chunks: List[Chunk] = []
    run_start = 1
    for ln in range(1, n + 1):
        # Keep extending the current run while the next line has the same owner.
        if ln < n and owner[ln + 1] == owner[ln]:
            continue
        idx = owner[ln]
        if idx is None:
            gap = lines[run_start - 1 : ln]
            chunks.extend(_window(gap, run_start, language, config))
        else:
            src = spans[idx]
            piece = SymbolSpan(src.symbol, src.kind, run_start, ln)
            chunks.extend(_emit_span(lines, piece, language, config))
        run_start = ln + 1

    return sorted(chunks, key=lambda c: c.start_line)


def window_only(text: str, language: str, config: Config) -> List[Chunk]:
    """Chunk a file that has no symbol parser using line windows only."""
    return build_chunks(text, language, [], config)
# Maps a lower-cased file suffix to the language label used for chunking.
EXTENSION_TO_LANGUAGE = {
    ".py": "python",
    ".pyi": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    # Indexed with the fallback chunker:
    ".c": "c",
    ".h": "c",
    ".cpp": "cpp",
    ".cc": "cpp",
    ".hpp": "cpp",
    ".cs": "csharp",
    ".rb": "ruby",
    ".php": "php",
    ".kt": "kotlin",
    ".swift": "swift",
    ".scala": "scala",
    ".sh": "shell",
    ".bash": "shell",
    ".sql": "sql",
    ".md": "markdown",
    ".rst": "rst",
    ".txt": "text",
    ".toml": "toml",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".json": "json",
    ".cfg": "ini",
    ".ini": "ini",
}


def detect_language(path: str | Path) -> Optional[str]:
    """Look up the language of ``path`` from its file suffix.

    The suffix is compared case-insensitively. Returns ``None`` when the suffix is
    not in ``EXTENSION_TO_LANGUAGE``, meaning the file should not be indexed.
    """
    suffix = Path(path).suffix.lower()
    return EXTENSION_TO_LANGUAGE.get(suffix)
# Node kinds whose children can include further statements. Definitions nested
# under them (e.g. a ``def`` inside ``if``/``try``/``with``) must still be found.
_STATEMENT_CONTAINERS = (ast.stmt, ast.ExceptHandler) + (
    (ast.match_case,) if hasattr(ast, "match_case") else ()
)


def extract_spans(text: str) -> List[SymbolSpan]:
    """Return a span for every function, method and class defined in ``text``.

    Names are dotted paths through the enclosing definitions (``Outer.inner``).
    A ``def`` is a ``"method"`` only when it sits in a class body; a function
    nested inside another function is a ``"function"``. Definitions placed under
    compound statements (``if``, ``try``/``except``, ``with``, loops, ``match``)
    are discovered as well and keep the prefix of their enclosing definition.

    Raises:
        SyntaxError: if ``text`` does not parse; the caller falls back to windows.
    """
    tree = ast.parse(text)  # may raise SyntaxError -> caller falls back to windows
    spans: List[SymbolSpan] = []

    def visit(node: ast.AST, prefix: str, in_class: bool) -> None:
        for child in ast.iter_child_nodes(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                name = f"{prefix}{child.name}"
                kind = "method" if in_class else "function"
                spans.append(SymbolSpan(name, kind, _start(child), _end(child)))
                # Nested functions are captured too (as functions, not methods).
                visit(child, f"{name}.", False)
            elif isinstance(child, ast.ClassDef):
                name = f"{prefix}{child.name}"
                spans.append(SymbolSpan(name, "class", _start(child), _end(child)))
                visit(child, f"{name}.", True)
            elif isinstance(child, _STATEMENT_CONTAINERS):
                # Not a definition itself, but may hold definitions in its body.
                visit(child, prefix, in_class)

    visit(tree, "", False)
    return spans


def _start(node: ast.AST) -> int:
    """Return the first line of ``node``, counting any decorators above it."""
    # Include decorators in the span so a decorated def reads as a unit.
    lines = [getattr(node, "lineno", 1)]
    for dec in getattr(node, "decorator_list", []) or []:
        lines.append(dec.lineno)
    return min(lines)


def _end(node: ast.AST) -> int:
    """Return the last line of ``node`` (``lineno`` when ``end_lineno`` is unset)."""
    end = getattr(node, "end_lineno", None) or getattr(node, "lineno", 1)
    return cast(int, end)
+""" + +from __future__ import annotations + +import logging +from functools import lru_cache +from typing import Callable, List, Set + +from coderag.chunking.base import SymbolSpan + +logger = logging.getLogger(__name__) + + +def _load(module: str, fn: str = "language") -> Callable: + def loader(): + import importlib + + import tree_sitter as ts + + mod = importlib.import_module(module) + return ts.Language(getattr(mod, fn)()) + + return loader + + +# language -> callable returning a tree_sitter.Language +_LANGUAGE_LOADERS = { + "javascript": _load("tree_sitter_javascript"), + "typescript": _load("tree_sitter_typescript", "language_typescript"), + "tsx": _load("tree_sitter_typescript", "language_tsx"), + "go": _load("tree_sitter_go"), + "rust": _load("tree_sitter_rust"), + "java": _load("tree_sitter_java"), +} + +# Node types worth indexing as their own chunk, per language. +DEF_NODE_TYPES = { + "javascript": { + "function_declaration", + "generator_function_declaration", + "method_definition", + "class_declaration", + }, + "typescript": { + "function_declaration", + "generator_function_declaration", + "method_definition", + "class_declaration", + "interface_declaration", + "type_alias_declaration", + "enum_declaration", + }, + "tsx": { + "function_declaration", + "generator_function_declaration", + "method_definition", + "class_declaration", + "interface_declaration", + "type_alias_declaration", + "enum_declaration", + }, + "go": {"function_declaration", "method_declaration", "type_declaration"}, + "rust": { + "function_item", + "impl_item", + "struct_item", + "enum_item", + "trait_item", + "mod_item", + }, + "java": { + "class_declaration", + "interface_declaration", + "enum_declaration", + "method_declaration", + "constructor_declaration", + }, +} + +_NAME_FIELDS = ("name", "type") + + +@lru_cache(maxsize=16) +def _parser(language: str): + import tree_sitter as ts + + return ts.Parser(_LANGUAGE_LOADERS[language]()) + + +def _kind(node_type: str) -> str: + if 
"class" in node_type or "struct" in node_type or "type" in node_type: + return "class" + if "method" in node_type or "constructor" in node_type: + return "method" + return "function" + + +def _name(node, source: bytes) -> str | None: + for field in _NAME_FIELDS: + child = node.child_by_field_name(field) + if child is not None: + return source[child.start_byte : child.end_byte].decode( + "utf-8", errors="replace" + ) + # Some constructs (e.g. Go `type X struct`) nest the name in a child node; do a + # shallow breadth-first scan for the first identifier-like token. + queue = list(node.children) + seen = 0 + while queue and seen < 16: + child = queue.pop(0) + seen += 1 + if child.type.endswith("identifier"): + return source[child.start_byte : child.end_byte].decode( + "utf-8", errors="replace" + ) + queue.extend(child.children) + return None + + +def extract_spans(text: str, language: str) -> List[SymbolSpan]: + if language not in _LANGUAGE_LOADERS: + return [] + types: Set[str] = DEF_NODE_TYPES.get(language, set()) + source = text.encode("utf-8") + tree = _parser(language).parse(source) + + spans: List[SymbolSpan] = [] + stack = [tree.root_node] + while stack: + node = stack.pop() + if node.type in types: + spans.append( + SymbolSpan( + symbol=_name(node, source) or node.type, + kind=_kind(node.type), + start_line=node.start_point[0] + 1, + end_line=node.end_point[0] + 1, + ) + ) + stack.extend(reversed(node.children)) + + return spans diff --git a/coderag/config.py b/coderag/config.py index 424e7f3..b0dabcb 100644 --- a/coderag/config.py +++ b/coderag/config.py @@ -1,38 +1,164 @@ +"""Typed, injectable configuration for CodeRAG. + +The whole app reads configuration from a single immutable :class:`Config` object that +is built once (usually via :meth:`Config.from_env`) and passed down explicitly. Nothing +deep in the call stack reaches for ``os.environ`` — that keeps the engine testable and +free of import-time side effects. 
# Languages that ship with symbol-aware chunking in v1.0. Anything not listed (or that
# fails to parse) falls back to the line-window chunker.
DEFAULT_LANGUAGES: Tuple[str, ...] = (
    "python",
    "javascript",
    "typescript",
    "tsx",
    "go",
    "rust",
    "java",
)

# Directories/globs never worth indexing. Note we deliberately do NOT ignore ``tests`` —
# people search their tests too.
DEFAULT_IGNORE_GLOBS: Tuple[str, ...] = (
    ".git/*",
    ".hg/*",
    ".svn/*",
    "node_modules/*",
    ".venv/*",
    "venv/*",
    "env/*",
    "__pycache__/*",
    "*.egg-info/*",
    "build/*",
    "dist/*",
    ".mypy_cache/*",
    ".pytest_cache/*",
    ".coderag/*",
)


def _env_str(key: str, default: str) -> str:
    """Return the env var ``key`` unchanged, or ``default`` if unset or blank."""
    raw = os.getenv(key)
    if raw is None or not raw.strip():
        return default
    return raw


def _env_int(key: str, default: int) -> int:
    """Return the env var ``key`` as an int; unset, blank or invalid -> ``default``."""
    raw = (os.getenv(key) or "").strip()
    if not raw:
        return default
    try:
        parsed = int(raw)
    except ValueError:
        return default
    return parsed


def _env_float(key: str, default: float) -> float:
    """Return the env var ``key`` as a float; unset, blank or invalid -> ``default``."""
    raw = (os.getenv(key) or "").strip()
    if not raw:
        return default
    try:
        parsed = float(raw)
    except ValueError:
        return default
    return parsed


def _env_path(key: str, default: Path) -> Path:
    """Return the env var ``key`` as a user-expanded path, or ``default`` if unset or blank."""
    raw = os.getenv(key)
    if raw is not None and raw.strip():
        return Path(raw).expanduser()
    return default


@dataclass(frozen=True)
class Config:
    """Immutable settings for one indexing/search session.

    Build it directly, or with :meth:`from_env` to pick values up from the
    environment. Use :meth:`with_overrides` to derive a modified copy.
    """

    # --- Embedding provider ---
    provider: str = "fastembed"  # "fastembed" | "openai" | "fake"
    model: str = "BAAI/bge-small-en-v1.5"
    openai_model: str = "text-embedding-3-small"
    openai_api_key: str | None = None
    cache_dir: Path = field(default_factory=lambda: Path.home() / ".cache" / "coderag")

    # --- Locations ---
    watched_dir: Path = field(default_factory=Path.cwd)
    store_dir: Path = field(default_factory=lambda: Path.cwd() / ".coderag")

    # --- What to index ---
    languages: Tuple[str, ...] = DEFAULT_LANGUAGES
    ignore_globs: Tuple[str, ...] = DEFAULT_IGNORE_GLOBS
    max_file_bytes: int = 1_000_000  # skip files larger than this
    max_chunk_lines: int = 200  # split oversized symbols into windows above this
    window_lines: int = 60  # fallback line-window size
    window_overlap: int = 10

    # --- Vector index ---
    index_type: str = "auto"  # "auto" | "flat" | "ivf"
    ivf_threshold: int = 50_000  # switch flat->ivf above this many vectors
    ivf_nlist: int = 0  # 0 => derived from corpus size
    ivf_nprobe: int = 16

    # --- Retrieval ---
    top_k: int = 8
    fetch_k: int = 50  # candidates pulled from each retriever before fusion
    rrf_k: int = 60
    dense_weight: float = 1.0
    lexical_weight: float = 1.0

    # --- Indexing throughput ---
    embed_batch_size: int = 64
    index_workers: int = 4

    # --- Optional LLM answer surface ---
    chat_model: str = "gpt-4o-mini"

    @property
    def db_path(self) -> Path:
        """Location of the SQLite database inside ``store_dir``."""
        return self.store_dir / "coderag.db"

    @property
    def faiss_path(self) -> Path:
        """Location of the FAISS index file inside ``store_dir``."""
        return self.store_dir / "index.faiss"

    def with_overrides(self, **kwargs: object) -> "Config":
        """Return a new ``Config`` with the named fields replaced; ``self`` is untouched."""
        return replace(self, **kwargs)  # type: ignore[arg-type]

    @classmethod
    def from_env(cls, **overrides: object) -> "Config":
        """Build a ``Config`` from the environment / ``.env`` file.

        ``load_dotenv()`` runs first, then each supported variable is read with the
        field default as fallback. Explicit ``overrides`` are applied last.
        """
        load_dotenv()
        home_cache = Path.home() / ".cache" / "coderag"
        settings = {
            "provider": _env_str("CODERAG_PROVIDER", cls.provider),
            "model": _env_str("CODERAG_MODEL", cls.model),
            "openai_model": _env_str("CODERAG_OPENAI_MODEL", cls.openai_model),
            "openai_api_key": os.getenv("OPENAI_API_KEY"),
            "cache_dir": _env_path("CODERAG_CACHE_DIR", home_cache),
            "watched_dir": _env_path("CODERAG_WATCHED_DIR", Path.cwd()),
            "store_dir": _env_path("CODERAG_STORE_DIR", Path.cwd() / ".coderag"),
            "index_type": _env_str("CODERAG_INDEX_TYPE", cls.index_type),
            "ivf_threshold": _env_int("CODERAG_IVF_THRESHOLD", cls.ivf_threshold),
            "top_k": _env_int("CODERAG_TOP_K", cls.top_k),
            "fetch_k": _env_int("CODERAG_FETCH_K", cls.fetch_k),
            "rrf_k": _env_int("CODERAG_RRF_K", cls.rrf_k),
            "dense_weight": _env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight),
            "lexical_weight": _env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight),
            "embed_batch_size": _env_int("CODERAG_EMBED_BATCH", cls.embed_batch_size),
            "index_workers": _env_int("CODERAG_WORKERS", cls.index_workers),
            "chat_model": _env_str("CODERAG_CHAT_MODEL", cls.chat_model),
        }
        base = cls(**settings)
        return base.with_overrides(**overrides) if overrides else base
[text] - return [text[i : i + max_chars] for i in range(0, len(text), max_chars)] - - -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=0.5, max=8), - reraise=True, -) -def _embed_batch(inputs: List[str]) -> np.ndarray: - """Call OpenAI embeddings with basic retry/backoff. Returns shape (n, d).""" - if client is None: - raise RuntimeError("OpenAI client not initialized") - response = client.embeddings.create( - model=OPENAI_EMBEDDING_MODEL, - input=inputs, - timeout=30, - ) - arr = np.array([d.embedding for d in response.data], dtype="float32") - return arr - - -def generate_embeddings(text: str) -> Optional[np.ndarray]: - """Generate embeddings using OpenAI's embedding API. - - Args: - text: The input text to generate embeddings for - - Returns: - numpy array of embeddings or None if generation fails - """ - if not client: - logger.error("OpenAI client not initialized") - return None - - if not text or not text.strip(): - logger.warning("Empty text provided for embedding generation") - return None - - try: - logger.debug(f"Generating embeddings for text of length: {len(text)}") - - chunks = _chunk_text(text, max_chars=4000) - vecs = _embed_batch(chunks) # shape (n, d) - - # Average chunk embeddings for a stable single vector - avg = np.mean(vecs, axis=0, dtype=np.float32).reshape(1, -1) - logger.debug(f"Successfully generated embeddings with shape: {avg.shape}") - return avg - - except Exception as e: - logger.error(f"Failed to generate embeddings: {e}") - return None diff --git a/coderag/embeddings/__init__.py b/coderag/embeddings/__init__.py new file mode 100644 index 0000000..4c3678a --- /dev/null +++ b/coderag/embeddings/__init__.py @@ -0,0 +1,66 @@ +"""Pluggable embedding providers. + +A provider turns text into L2-comparable float32 vectors. The crucial contract change +from the old CodeRAG: providers embed a *list of already-chunked texts* and return one +vector per text — there is no file-level averaging anywhere. 
@runtime_checkable
class EmbeddingProvider(Protocol):
    """Structural interface shared by all embedding backends."""

    name: str

    @property
    def model_id(self) -> str:
        """Stable identifier of the underlying model (stored with each chunk)."""

    @property
    def dim(self) -> int:
        """Number of components in each embedding vector."""

    def embed_documents(self, texts: Sequence[str]) -> np.ndarray:
        """Embed code chunks; the result is a ``(len(texts), dim)`` float32 array."""

    def embed_query(self, text: str) -> np.ndarray:
        """Embed one query; the result is a ``(dim,)`` float32 array."""


def get_provider(config: Config) -> EmbeddingProvider:
    """Build the embedding provider selected by ``config.provider``.

    The name is matched case-insensitively. Each backend module is imported only
    inside its own branch, so the heavy backends (fastembed/openai) are not
    imported unless they are selected.

    Raises:
        ValueError: if ``config.provider`` names no known backend.
    """
    requested = config.provider.lower()

    if requested == "fastembed":
        from coderag.embeddings.fastembed_provider import FastEmbedProvider

        return FastEmbedProvider(config.model, cache_dir=config.cache_dir)

    if requested == "openai":
        from coderag.embeddings.openai_provider import OpenAIEmbeddingProvider

        return OpenAIEmbeddingProvider(
            model=config.openai_model, api_key=config.openai_api_key
        )

    if requested == "fake":
        from coderag.embeddings.fake_provider import FakeEmbeddingProvider

        return FakeEmbeddingProvider()

    raise ValueError(
        f"Unknown embedding provider {config.provider!r}. "
        "Expected one of: fastembed, openai, fake."
    )
class FakeEmbeddingProvider:
    """Offline embedding backend that derives vectors from a hash of the text.

    Identical text always maps to the identical vector, so tests can assert
    retrieval behaviour without a model download or network access.
    """

    name = "fake"

    def __init__(self, dim: int = 16) -> None:
        self._dim = dim

    @property
    def model_id(self) -> str:
        """Return ``"fake-<dim>"`` so the dimension is part of the model identity."""
        return f"fake-{self._dim}"

    @property
    def dim(self) -> int:
        """Return the configured vector length."""
        return self._dim

    def _vector(self, text: str) -> np.ndarray:
        """Return the deterministic unit-length float32 vector for *text*."""
        # The first 8 bytes of the SHA-256 digest seed the generator.
        seed_bytes = hashlib.sha256(text.encode("utf-8")).digest()[:8]
        generator = np.random.default_rng(int.from_bytes(seed_bytes, "little"))
        sample = generator.standard_normal(self._dim).astype("float32")
        length = float(np.linalg.norm(sample))
        if length > 0:
            # Normalise in place; a zero vector is returned untouched.
            np.divide(sample, length, out=sample)
        return sample

    def embed_documents(self, texts: Sequence[str]) -> np.ndarray:
        """Return a ``(len(texts), dim)`` float32 matrix, one row per text."""
        if not texts:
            return np.zeros((0, self._dim), dtype="float32")
        rows = [self._vector(entry) for entry in texts]
        return np.vstack(rows).astype("float32")

    def embed_query(self, text: str) -> np.ndarray:
        """Return the ``(dim,)`` vector for a query; same mapping as documents."""
        return self._vector(text)
The model is loaded lazily on first use so that +``coderag --help``, ``status``, and any code path that doesn't actually embed stays fast +and never triggers a model download. +""" + +from __future__ import annotations + +import logging +from functools import cached_property +from pathlib import Path +from typing import Optional, Sequence + +import numpy as np + +logger = logging.getLogger(__name__) + +DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" + + +class FastEmbedProvider: + name = "fastembed" + + def __init__(self, model: str = DEFAULT_MODEL, cache_dir: Optional[Path] = None): + self._model_name = model + self._cache_dir = str(cache_dir) if cache_dir else None + self._dim = self._lookup_dim(model) + + @staticmethod + def _lookup_dim(model: str) -> Optional[int]: + try: + from fastembed import TextEmbedding + + for entry in TextEmbedding.list_supported_models(): + if entry.get("model") == model: + return int(entry["dim"]) + except Exception: # pragma: no cover - metadata lookup best-effort + pass + return None + + @cached_property + def _model(self): + from fastembed import TextEmbedding + + logger.info("Loading fastembed model %s ...", self._model_name) + return TextEmbedding(self._model_name, cache_dir=self._cache_dir) + + @property + def model_id(self) -> str: + return self._model_name + + @property + def dim(self) -> int: + if self._dim is None: + # Fall back to probing the loaded model. 
# Known output dimensions; anything else is learned from the first response.
_KNOWN_DIMS = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
}

# Maximum number of texts sent in one embeddings request.
_BATCH = 128


class OpenAIEmbeddingProvider:
    """Opt-in embedding backend that calls the OpenAI embeddings endpoint.

    Each text is embedded independently and requests are sent in batches of
    ``_BATCH``. The client is created lazily on first use, and the dimension
    is taken from ``_KNOWN_DIMS`` or learned from the first response.
    """

    name = "openai"

    def __init__(self, model: str, api_key: Optional[str] = None):
        self._model = model
        self._api_key = api_key
        self._dim = _KNOWN_DIMS.get(model)

    @cached_property
    def _client(self):
        """Build the OpenAI client on first access.

        Raises:
            RuntimeError: if no API key was supplied.
        """
        # Check the key before importing the SDK so a configuration error is
        # reported as such even when the import itself would be slow or fail.
        if not self._api_key:
            raise RuntimeError(
                "OpenAI provider requires an API key (set OPENAI_API_KEY)."
            )
        from openai import OpenAI

        return OpenAI(api_key=self._api_key)

    @property
    def model_id(self) -> str:
        """Return the OpenAI model name passed to the constructor."""
        return self._model

    @property
    def dim(self) -> int:
        """Return the embedding dimension, probing the API once if unknown."""
        if self._dim is None:
            self._dim = int(self.embed_query("probe").shape[0])
        return self._dim

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=0.5, max=8),
        reraise=True,
    )
    def _request(self, client, inputs: List[str]) -> np.ndarray:
        """Send one embeddings request (retried with exponential backoff)."""
        resp = client.embeddings.create(
            model=self._model, input=inputs, timeout=30
        )
        # Order rows by their declared index so row i always belongs to
        # inputs[i]; the sort is stable, so a missing index keeps API order.
        ordered = sorted(resp.data, key=lambda d: getattr(d, "index", 0))
        return np.array([d.embedding for d in ordered], dtype="float32")

    def _embed_batch(self, inputs: List[str]) -> np.ndarray:
        """Embed one batch. Returns a ``(len(inputs), dim)`` float32 array.

        Raises:
            RuntimeError: immediately (without retries) if no API key is set.
        """
        # Resolve the client outside the retried call: a missing key is a
        # configuration error and must not be retried with backoff.
        client = self._client
        vectors = self._request(client, inputs)
        if self._dim is None and vectors.ndim == 2:
            # Learn the dimension from real traffic to avoid a separate probe.
            self._dim = int(vectors.shape[1])
        return vectors

    def embed_documents(self, texts: Sequence[str]) -> np.ndarray:
        """Embed *texts* in batches. Returns a ``(len(texts), dim)`` float32 array.

        For empty input no request is made. If the dimension is not yet known
        the empty result has width 1 rather than triggering a network probe.
        """
        if not texts:
            return np.zeros((0, self._dim if self._dim else 1), dtype="float32")
        out: List[np.ndarray] = [
            self._embed_batch(list(texts[start : start + _BATCH]))
            for start in range(0, len(texts), _BATCH)
        ]
        return np.vstack(out).astype("float32")

    def embed_query(self, text: str) -> np.ndarray:
        """Embed a single query. Returns a ``(dim,)`` float32 array."""
        return self._embed_batch([text])[0]
metadata file - metadata_file = "metadata.npy" - if os.path.exists(metadata_file): - os.remove(metadata_file) - logger.info(f"Deleted metadata file: {metadata_file}") - - # Reinitialize the FAISS index and metadata - index = faiss.IndexFlatIP(EMBEDDING_DIM) - metadata = [] - logger.info("FAISS index and metadata cleared and reinitialized") - - except Exception as e: - logger.error(f"Error clearing index: {str(e)}") - raise - - -def add_to_index( - embeddings: np.ndarray, full_content: str, filename: str, filepath: str -) -> None: - """Add embeddings and metadata to the FAISS index. - - Args: - embeddings: The embedding vectors to add - full_content: The original file content - filename: Name of the file - filepath: Full path to the file - """ - - try: - if embeddings is None or embeddings.size == 0: - logger.warning(f"Empty embeddings provided for {filename}") - return - - if embeddings.shape[1] != index.d: - raise ValueError( - f"Embedding dimension {embeddings.shape[1]} does not match " - f"FAISS index dimension {index.d}" - ) - - # Convert absolute filepath to relative path - try: - relative_filepath = os.path.relpath(filepath, WATCHED_DIR) - except ValueError: - logger.warning( - f"Could not create relative path for {filepath}, using " - f"absolute path" - ) - relative_filepath = filepath - - # Normalize for cosine similarity (IndexFlatIP) - vecs = embeddings.astype("float32", copy=True) - vecs = _l2_normalize(vecs) - index.add(vecs) - metadata.append( - { - # Store only a snippet to keep metadata small - "content": (full_content[:3000] if full_content else ""), - "filename": filename, - "filepath": relative_filepath, - } - ) - - logger.debug(f"Added {filename} to index (total entries: {index.ntotal})") - - except Exception as e: - logger.error(f"Error adding {filename} to index: {str(e)}") - raise - - -def save_index() -> None: - """Save the FAISS index and metadata to disk.""" - try: - faiss.write_index(index, FAISS_INDEX_FILE) - with open("metadata.npy", 
"wb") as f: - np.save(f, np.array(metadata, dtype=object)) - logger.debug(f"Index saved with {index.ntotal} entries") - except Exception as e: - logger.error(f"Error saving index: {str(e)}") - raise - - -def load_index() -> Optional[faiss.Index]: - """Load the FAISS index and metadata from disk. - - Returns: - The loaded FAISS index or None if loading fails - """ - global index, metadata - - try: - if not os.path.exists(FAISS_INDEX_FILE): - logger.warning(f"FAISS index file not found: {FAISS_INDEX_FILE}") - return None - - if not os.path.exists("metadata.npy"): - logger.warning("Metadata file not found: metadata.npy") - return None - - index = faiss.read_index(FAISS_INDEX_FILE) - with open("metadata.npy", "rb") as f: - metadata = np.load(f, allow_pickle=True).tolist() - - logger.info(f"Loaded index with {index.ntotal} entries") - return index - - except Exception as e: - logger.error(f"Error loading index: {str(e)}") - return None - - -def get_metadata() -> List[Dict[str, Any]]: - """Get the current metadata list. - - Returns: - List of metadata dictionaries - """ - return metadata - - -def retrieve_vectors(n=5): - n = min(n, index.ntotal) - vectors = np.zeros((n, EMBEDDING_DIM), dtype=np.float32) - for i in range(n): - vectors[i] = index.reconstruct(i) - return vectors - - -def inspect_metadata(n: int = 5) -> None: - """Print metadata information for debugging purposes. - - Args: - n: Number of entries to inspect - """ - try: - metadata_list = get_metadata() - logger.info(f"Inspecting the first {n} metadata entries:") - for i, data in enumerate(metadata_list[:n]): - logger.info(f"Entry {i}:") - logger.info(f" Filename: {data['filename']}") - logger.info(f" Filepath: {data['filepath']}") - logger.info( - f" Content: {data['content'][:100]}..." 
@dataclass
class _Work:
    """One file whose content changed and must be (re)indexed."""

    rel: str  # path relative to the watched root, POSIX separators
    language: str
    text: str  # decoded content; "" when the file is no longer indexable
    content_hash: str  # SHA-256 hex digest of the raw bytes
    mtime: float


class Indexer:
    """Incrementally sync files on disk into the SQLite store and vector index.

    A changed file's old chunks are removed from both the store and the
    vector index before its new chunks are added.
    """

    def __init__(
        self,
        config: Config,
        provider: EmbeddingProvider,
        store: SQLiteStore,
        vectors: FaissVectorIndex,
    ) -> None:
        self.config = config
        self.provider = provider
        self.store = store
        self.vectors = vectors
        # Globs of the form "name/*" with a single path component name a whole
        # directory; collect them so the walk can skip those directories.
        self._ignore_dirs = {
            g[:-2]
            for g in config.ignore_globs
            if g.endswith("/*") and "/" not in g[:-2]
        }

    # --- public ---

    def index(
        self,
        target: Optional[Path] = None,
        *,
        full: bool = False,
        progress: bool = False,
    ) -> IndexStats:
        """Index *target* (default: the watched root) and return run statistics.

        Args:
            target: File or directory to index; defaults to ``config.watched_dir``.
            full: Clear all stored files/chunks before indexing.
            progress: Show a tqdm progress bar when tqdm is importable.

        Returns:
            Counters for this run plus the store's final totals.
        """
        root = self.config.watched_dir.resolve()
        target = (target or self.config.watched_dir).resolve()
        prune = target == root  # only a full-root pass removes vanished files

        stats = IndexStats()
        if full:
            self._reset()

        # 1. Discover candidates and detect what actually changed (cheap hash check).
        walked: set[str] = set()
        work: List[_Work] = []
        for abs_path, rel, language in self._walk(target, root):
            walked.add(rel)
            item = self._maybe_work(abs_path, rel, language)
            if item is None:
                stats.files_skipped += 1
            else:
                work.append(item)

        # 2. (Re)index changed files: embed, remove old chunks, add new ones.
        iterator: Iterator[_Work] = iter(work)
        if progress and work:
            try:
                from tqdm import tqdm

                iterator = tqdm(work, desc="Indexing", unit="file")
            except Exception:  # pragma: no cover
                pass
        for item in iterator:
            added, removed = self._index_file(item)
            stats.chunks_added += added
            stats.chunks_removed += removed
            stats.files_indexed += 1

        # 3. Prune files that disappeared from disk (full-root passes only).
        if prune:
            for rel in set(self.store.all_file_paths()) - walked:
                removed_ids = self.store.delete_file(rel)
                self.vectors.remove(removed_ids)
                stats.files_removed += 1
                stats.chunks_removed += len(removed_ids)

        # 4. Persist FAISS (rebuilding to IVF if we crossed the scale threshold).
        if not self.vectors.maybe_upgrade(self.store):
            self.vectors.save()

        final = self.store.stats()
        stats.total_files = final.total_files
        stats.total_chunks = final.total_chunks
        return stats

    # --- internals ---

    def _reset(self) -> None:
        """Delete every stored file and rebuild the vector index from the store."""
        for rel in list(self.store.all_file_paths()):
            self.store.delete_file(rel)
        self.vectors.rebuild_from_store(self.store)  # -> empty

    def _maybe_work(self, abs_path: Path, rel: str, language: str) -> Optional[_Work]:
        """Return a work item if *abs_path* needs (re)indexing, else ``None``.

        ``None`` means skip: the file is unreadable, unchanged, or not
        indexable and was never indexed. A previously indexed file that has
        become empty or larger than ``config.max_file_bytes`` yields a work
        item with empty ``text`` so that its stale chunks are removed.
        """
        try:
            data = abs_path.read_bytes()
            # stat() can fail too if the file vanishes after the read.
            mtime = abs_path.stat().st_mtime
        except OSError as exc:
            logger.warning("Cannot read %s: %s", abs_path, exc)
            return None

        indexable = len(data) <= self.config.max_file_bytes and bool(data.strip())
        existing = self.store.get_file(rel)
        if not indexable and existing is None:
            return None  # nothing stored and nothing to store

        content_hash = hashlib.sha256(data).hexdigest()
        if existing is not None and existing["content_hash"] == content_hash:
            return None  # unchanged -> no embedding cost

        text = data.decode("utf-8", errors="replace") if indexable else ""
        return _Work(rel, language, text, content_hash, mtime)

    def _index_file(self, item: _Work) -> Tuple[int, int]:
        """Replace a file's chunks. Returns ``(chunks_added, chunks_removed)``."""
        # Chunk and embed BEFORE touching the store. If embedding raises, the
        # old chunks and old content hash stay intact and the file is retried
        # on the next run instead of being recorded as up to date with no chunks.
        chunks = chunk_file(item.text, item.language, self.config) if item.text else []
        vectors = (
            self.provider.embed_documents([c.text for c in chunks]) if chunks else None
        )

        removed = 0
        existing = self.store.get_file(item.rel)
        if existing is not None:
            old_ids = self.store.delete_chunks_for_file(int(existing["id"]))
            self.vectors.remove(old_ids)
            removed = len(old_ids)

        file_id = self.store.upsert_file(
            item.rel, item.language, item.content_hash, item.mtime
        )
        if not chunks:
            return 0, removed

        new_ids = self.store.add_chunks(
            file_id, chunks, vectors, self.provider.model_id
        )
        self.vectors.add(np.array(new_ids, dtype="int64"), vectors)
        return len(new_ids), removed

    def _walk(self, target: Path, root: Path) -> Iterator[Tuple[Path, str, str]]:
        """Yield ``(abs_path, rel, language)`` for each indexable file under *target*."""
        if target.is_file():
            rel = self._rel(target, root)
            language = detect_language(target)
            if rel and language and not self._ignored(rel):
                yield target, rel, language
            return

        for dirpath, dirnames, filenames in os.walk(target):
            # prune ignored directories in place for speed
            dirnames[:] = [d for d in dirnames if d not in self._ignore_dirs]
            for name in filenames:
                abs_path = Path(dirpath) / name
                rel = self._rel(abs_path, root)
                if not rel or self._ignored(rel):
                    continue
                language = detect_language(name)
                if language:
                    yield abs_path, rel, language

    @staticmethod
    def _rel(abs_path: Path, root: Path) -> Optional[str]:
        """Return *abs_path* relative to *root* in POSIX form, or ``None`` if outside."""
        try:
            return abs_path.resolve().relative_to(root).as_posix()
        except ValueError:
            return None

    def _ignored(self, rel: str) -> bool:
        """Return True if *rel* lies in an ignored directory or matches an ignore glob."""
        parts = rel.split("/")
        if self._ignore_dirs.intersection(parts):
            return True
        return any(fnmatch.fnmatch(rel, g) for g in self.config.ignore_globs)
def build_context(hits: List[SearchHit], max_chars: int = 8000) -> str:
    """Render *hits* as fenced code blocks, never exceeding *max_chars*.

    Hits are taken in order until the next one would not fit. The blank-line
    separators between blocks count toward the budget. If the very first hit
    is too large on its own, its text is truncated to fit so the caller never
    receives an empty context while hits exist.

    Args:
        hits: Search hits, best first.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The blocks joined by a blank line. Empty when *hits* is empty or the
        budget is too small to hold even one block header.
    """
    separator = "\n\n"
    closing = "\n```"
    blocks: List[str] = []
    used = 0
    for hit in hits:
        header = f"# {hit.location}" + (f" ({hit.symbol})" if hit.symbol else "")
        opening = f"{header}\n```{hit.language}\n"
        block = f"{opening}{hit.text}{closing}"
        cost = len(block) + (len(separator) if blocks else 0)
        if used + cost > max_chars:
            if not blocks:
                # Keep as much of the top hit as the budget allows, with the
                # fence still closed.
                room = max_chars - len(opening) - len(closing)
                if room > 0:
                    blocks.append(f"{opening}{hit.text[:room]}{closing}")
            break
        blocks.append(block)
        used += cost
    return separator.join(blocks)


def stream_answer(cr: "CodeRAG", query: str, top_k: int | None = None) -> Iterator[str]:
    """Yield answer tokens for *query*, grounded in retrieved chunks.

    This is a generator, so nothing runs until iteration starts. If the search
    returns no hits, a single fixed message is yielded and no API key is needed.

    Args:
        cr: Facade providing ``search`` and ``config``.
        query: The user's question.
        top_k: Number of hits to retrieve; falls back to ``cr.config.top_k``
            when ``None`` or 0.

    Raises:
        RuntimeError: if hits were found but no OpenAI key is configured.
    """
    hits = cr.search(query, top_k or cr.config.top_k)
    if not hits:
        yield "No relevant code was found in the index for that query."
        return

    api_key = cr.config.openai_api_key
    if not api_key:
        raise RuntimeError(
            "LLM answers require an OpenAI API key (set OPENAI_API_KEY). "
            "Retrieved chunks are still available without it."
        )

    # Imported lazily so the module can be imported without the SDK installed.
    from openai import OpenAI

    client = OpenAI(api_key=api_key)
    context = build_context(hits)
    user = f"Question: {query}\n\nRetrieved code context:\n{context}"
    stream = client.chat.completions.create(
        model=cr.config.chat_model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        stream=True,
    )
    for part in stream:
        # Skip chunks that carry no choices or no text.
        if part.choices and part.choices[0].delta.content:
            yield part.choices[0].delta.content
IGNORE_PATHS list. - - Args: - path: File or directory path to check - - Returns: - True if path should be ignored, False otherwise - """ - try: - for ignore_path in IGNORE_PATHS: - if path.startswith(ignore_path): - return True - return False - except Exception as e: - logger.error(f"Error checking ignore path for {path}: {str(e)}") - return True # Err on the side of caution - - -class CodeChangeHandler(FileSystemEventHandler): - """Handle file system events for code changes.""" - - def on_modified(self, event): - """Handle file modification events.""" - try: - if event.is_directory or should_ignore_path(event.src_path): - return - - if event.src_path.endswith(".py"): - logger.info(f"Detected change in file: {event.src_path}") - - # Read file content with error handling - try: - with open(event.src_path, "r", encoding="utf-8") as f: - full_content = f.read() - except (IOError, UnicodeDecodeError) as e: - logger.error(f"Error reading file {event.src_path}: {str(e)}") - return - - # Generate embeddings - embeddings = generate_embeddings(full_content) - if embeddings is not None and embeddings.size > 0: - filename = os.path.basename(event.src_path) - try: - add_to_index(embeddings, full_content, filename, event.src_path) - save_index() - logger.info(f"Updated FAISS index for file: {event.src_path}") - except Exception as e: - logger.error( - f"Error updating index for {event.src_path}: {str(e)}" - ) - else: - logger.warning( - f"Failed to generate embeddings for {event.src_path}" - ) - - except Exception as e: - logger.error(f"Unexpected error handling file event: {str(e)}") - - -def start_monitoring() -> None: - """Start monitoring the directory for file changes.""" - try: - if not os.path.exists(WATCHED_DIR): - logger.error(f"Watched directory does not exist: {WATCHED_DIR}") - return - - event_handler = CodeChangeHandler() - observer = Observer() - observer.schedule(event_handler, path=WATCHED_DIR, recursive=True) - observer.start() - logger.info(f"Started 
def reciprocal_rank_fusion(
    ranked_lists: Sequence[Sequence[int]],
    k: int = 60,
    weights: Optional[Sequence[float]] = None,
) -> List[Tuple[int, float]]:
    """Fuse ranked id-lists. Returns ``(id, score)`` sorted best-first.

    Each id scores ``weight / (k + rank)`` per list it appears in, with
    1-based ``rank``; contributions are summed across lists. Ties are broken
    by ascending id so the output is deterministic.

    Args:
        ranked_lists: Id-lists, each ordered best-first.
        k: Non-negative damping constant; larger values flatten the advantage
            of top ranks.
        weights: One multiplier per list; defaults to 1.0 for every list.

    Raises:
        ValueError: if ``k`` is negative or ``weights`` does not have exactly
            one entry per ranked list.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if weights is None:
        weights = [1.0] * len(ranked_lists)
    elif len(weights) != len(ranked_lists):
        # zip() would silently drop the unmatched lists or weights.
        raise ValueError(
            f"expected {len(ranked_lists)} weights, got {len(weights)}"
        )

    scores: dict[int, float] = defaultdict(float)
    for ranked, weight in zip(ranked_lists, weights):
        for rank, item_id in enumerate(ranked, start=1):
            scores[item_id] += weight * (1.0 / (k + rank))
    return sorted(scores.items(), key=lambda kv: (-kv[1], kv[0]))
+ fused = reciprocal_rank_fusion( + [dense_ranked, lexical_ranked], + k=self.config.rrf_k, + weights=[self.config.dense_weight, self.config.lexical_weight], + )[:top_k] + if not fused: + return [] + + ids = [cid for cid, _ in fused] + rows = self.store.hydrate(ids) + + hits: List[SearchHit] = [] + for cid, score in fused: + row = rows.get(cid) + if row is None: + continue + hits.append( + SearchHit( + chunk_id=cid, + path=row["path"], + symbol=row["symbol"], + kind=row["kind"], + language=row["language"], + start_line=int(row["start_line"]), + end_line=int(row["end_line"]), + text=row["text"], + score=float(score), + similarity=similarity.get(cid, 0.0), + ) + ) + return hits diff --git a/coderag/search.py b/coderag/search.py deleted file mode 100644 index ea15a76..0000000 --- a/coderag/search.py +++ /dev/null @@ -1,76 +0,0 @@ -import logging -from typing import Any, Dict, List - -import faiss - -from coderag.embeddings import generate_embeddings -from coderag.index import get_metadata, load_index - -logger = logging.getLogger(__name__) - - -def search_code(query: str, k: int = 5) -> List[Dict[str, Any]]: - """Search the FAISS index using a text query. 
- - Args: - query: The search query text - k: Number of results to return (default: 5) - - Returns: - List of search results with filename, filepath, content, and distance - """ - try: - if not query or not query.strip(): - logger.warning("Empty query provided") - return [] - - # Load the FAISS index - index = load_index() - if index is None: - logger.error("Failed to load FAISS index") - return [] - - if index.ntotal == 0: - logger.warning("FAISS index is empty") - return [] - - # Generate embedding for the query - query_embedding = generate_embeddings(query) - if query_embedding is None: - logger.error("Failed to generate query embedding") - return [] - # Normalize for cosine similarity (IndexFlatIP) - faiss.normalize_L2(query_embedding) - - # Perform the search in FAISS - k = min(k, index.ntotal) # Don't search for more items than exist - distances, indices = index.search(query_embedding, k) - - results = [] - metadata = get_metadata() - - for i, idx in enumerate(indices[0]): # Iterate over the search results - if 0 <= idx < len(metadata): # Ensure the index is within bounds - file_data = metadata[idx] - results.append( - { - "filename": file_data["filename"], - "filepath": file_data["filepath"], - "content": file_data["content"], - "distance": float(distances[0][i]), # Convert to Python float - } - ) - else: - logger.warning( - f"Index {idx} is out of bounds for metadata with length " - f"{len(metadata)}" - ) - - logger.debug( - f"Search returned {len(results)} results for query: " f"'{query[:50]}...'" - ) - return results - - except Exception as e: - logger.error(f"Error during code search: {str(e)}") - return [] diff --git a/coderag/store/__init__.py b/coderag/store/__init__.py new file mode 100644 index 0000000..5dc00ca --- /dev/null +++ b/coderag/store/__init__.py @@ -0,0 +1 @@ +"""Persistent storage: SQLite as the source of truth, FAISS as a rebuildable cache.""" diff --git a/coderag/store/schema.py b/coderag/store/schema.py new file mode 100644 index 
0000000..6581c28 --- /dev/null +++ b/coderag/store/schema.py @@ -0,0 +1,69 @@ +"""SQLite schema for the CodeRAG store. + +Design notes: +- ``chunks.id`` IS the FAISS id. It is ``AUTOINCREMENT`` so ids are *never reused*, which + is what keeps a stale FAISS cache from resurrecting deleted content under a recycled id. +- ``chunks_fts`` is an external-content FTS5 table (no duplicated text) kept in sync by + triggers, giving us BM25 lexical search for free alongside dense vectors. +- ``files.content_hash`` drives incremental indexing; ``meta`` records the embedding + model/dim so a provider switch can trigger a rebuild instead of crashing. +""" + +from __future__ import annotations + +SCHEMA_VERSION = 1 + +DDL = """ +CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + path TEXT NOT NULL UNIQUE, + language TEXT NOT NULL, + content_hash TEXT NOT NULL, + mtime REAL, + indexed_at REAL NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_files_path ON files(path); + +CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, + symbol TEXT, + kind TEXT NOT NULL DEFAULT 'window', + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + language TEXT NOT NULL, + text TEXT NOT NULL, + vector BLOB NOT NULL, + embed_model TEXT NOT NULL, + created_at REAL NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id); + +CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5( + text, + symbol, + content='chunks', + content_rowid='id', + tokenize='unicode61 remove_diacritics 2' +); + +CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN + INSERT INTO chunks_fts(rowid, text, symbol) VALUES (new.id, new.text, new.symbol); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, text, symbol) + VALUES('delete', old.id, old.text, old.symbol); +END; + +CREATE TRIGGER IF NOT EXISTS 
chunks_au AFTER UPDATE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, text, symbol) + VALUES('delete', old.id, old.text, old.symbol); + INSERT INTO chunks_fts(rowid, text, symbol) VALUES (new.id, new.text, new.symbol); +END; + +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT +); +""" diff --git a/coderag/store/sqlite_store.py b/coderag/store/sqlite_store.py new file mode 100644 index 0000000..b881349 --- /dev/null +++ b/coderag/store/sqlite_store.py @@ -0,0 +1,249 @@ +"""SQLite-backed source of truth for files, chunks, vectors, and lexical search.""" + +from __future__ import annotations + +import logging +import re +import sqlite3 +import threading +import time +from pathlib import Path +from typing import Dict, Iterator, List, Optional, Sequence, Tuple + +import numpy as np + +from coderag.store.schema import DDL, SCHEMA_VERSION +from coderag.types import Chunk, IndexStats + +logger = logging.getLogger(__name__) + +# Strip FTS5 operators so a raw code query (e.g. ``foo::bar*``) can't raise a syntax error. +_FTS_TOKEN = re.compile(r"[A-Za-z0-9_]+") + + +def _sanitize_fts(query: str) -> str: + """Turn an arbitrary query into a safe FTS5 MATCH expression (token OR token).""" + tokens = _FTS_TOKEN.findall(query) + if not tokens: + return "" + # Quote each token (defuses operators) and OR them for recall on identifiers. + return " OR ".join(f'"{t}"' for t in tokens) + + +class SQLiteStore: + """Thread-safe store. 
Reads are concurrent under WAL; writes serialize on a lock.""" + + def __init__(self, db_path: Path) -> None: + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._lock = threading.RLock() + self._conn = sqlite3.connect( + str(self.db_path), check_same_thread=False, isolation_level=None + ) + self._conn.row_factory = sqlite3.Row + self._conn.execute("PRAGMA journal_mode=WAL") + self._conn.execute("PRAGMA foreign_keys=ON") + self._conn.execute("PRAGMA synchronous=NORMAL") + + # --- lifecycle --- + + def bootstrap(self, embed_dim: int, embed_model: str) -> bool: + """Create schema and reconcile provenance. + + Returns True if a full rebuild is required because the embedding model/dimension + changed since the store was last written (in which case existing chunks/files are + cleared so a reindex repopulates cleanly). + """ + with self._lock: + self._conn.executescript(DDL) + self._set_meta("schema_version", str(SCHEMA_VERSION)) + prev_dim = self._get_meta("embed_dim") + prev_model = self._get_meta("embed_model") + rebuild = False + if prev_dim is not None and ( + int(prev_dim) != embed_dim or prev_model != embed_model + ): + logger.warning( + "Embedding model changed (%s/%s -> %s/%s); clearing index for " + "rebuild.", + prev_model, + prev_dim, + embed_model, + embed_dim, + ) + self._conn.execute("DELETE FROM chunks") + self._conn.execute("DELETE FROM files") + rebuild = True + self._set_meta("embed_dim", str(embed_dim)) + self._set_meta("embed_model", embed_model) + return rebuild + + def close(self) -> None: + with self._lock: + self._conn.close() + + # --- meta --- + + def _get_meta(self, key: str) -> Optional[str]: + row = self._conn.execute( + "SELECT value FROM meta WHERE key = ?", (key,) + ).fetchone() + return row["value"] if row else None + + def _set_meta(self, key: str, value: str) -> None: + self._conn.execute( + "INSERT INTO meta(key, value) VALUES(?, ?) 
" + "ON CONFLICT(key) DO UPDATE SET value = excluded.value", + (key, value), + ) + + # --- file records --- + + def get_file(self, path: str) -> Optional[sqlite3.Row]: + return self._conn.execute( + "SELECT * FROM files WHERE path = ?", (path,) + ).fetchone() + + def all_file_paths(self) -> List[str]: + rows = self._conn.execute("SELECT path FROM files").fetchall() + return [r["path"] for r in rows] + + def upsert_file( + self, path: str, language: str, content_hash: str, mtime: float + ) -> int: + with self._lock: + now = time.time() + self._conn.execute( + "INSERT INTO files(path, language, content_hash, mtime, indexed_at) " + "VALUES(?, ?, ?, ?, ?) " + "ON CONFLICT(path) DO UPDATE SET " + " language=excluded.language, content_hash=excluded.content_hash, " + " mtime=excluded.mtime, indexed_at=excluded.indexed_at", + (path, language, content_hash, mtime, now), + ) + row = self._conn.execute( + "SELECT id FROM files WHERE path = ?", (path,) + ).fetchone() + return int(row["id"]) + + # --- chunk records --- + + def chunk_ids_for_file(self, file_id: int) -> List[int]: + rows = self._conn.execute( + "SELECT id FROM chunks WHERE file_id = ?", (file_id,) + ).fetchall() + return [int(r["id"]) for r in rows] + + def delete_file(self, path: str) -> List[int]: + """Delete a file and its chunks. 
Returns the removed chunk ids (FAISS ids).""" + with self._lock: + row = self._conn.execute( + "SELECT id FROM files WHERE path = ?", (path,) + ).fetchone() + if row is None: + return [] + file_id = int(row["id"]) + ids = self.chunk_ids_for_file(file_id) + self._conn.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,)) + self._conn.execute("DELETE FROM files WHERE id = ?", (file_id,)) + return ids + + def delete_chunks_for_file(self, file_id: int) -> List[int]: + with self._lock: + ids = self.chunk_ids_for_file(file_id) + self._conn.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,)) + return ids + + def add_chunks( + self, + file_id: int, + chunks: Sequence[Chunk], + vectors: np.ndarray, + embed_model: str, + ) -> List[int]: + """Insert chunks with their vectors. Returns the assigned chunk ids in order.""" + if len(chunks) != len(vectors): + raise ValueError("chunks and vectors length mismatch") + ids: List[int] = [] + now = time.time() + with self._lock: + for chunk, vec in zip(chunks, vectors): + blob = np.asarray(vec, dtype="float32").tobytes() + cur = self._conn.execute( + "INSERT INTO chunks(file_id, symbol, kind, start_line, end_line, " + "language, text, vector, embed_model, created_at) " + "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + file_id, + chunk.symbol, + chunk.kind, + chunk.start_line, + chunk.end_line, + chunk.language, + chunk.text, + blob, + embed_model, + now, + ), + ) + ids.append(int(cur.lastrowid or 0)) + return ids + + # --- retrieval support --- + + def fts_search(self, query: str, limit: int) -> List[Tuple[int, float]]: + """Lexical search via FTS5 BM25. Returns ``(chunk_id, bm25)`` best-first.""" + match = _sanitize_fts(query) + if not match: + return [] + try: + rows = self._conn.execute( + "SELECT rowid, bm25(chunks_fts) AS score FROM chunks_fts " + "WHERE chunks_fts MATCH ? 
ORDER BY score LIMIT ?", + (match, limit), + ).fetchall() + except sqlite3.OperationalError as exc: # pragma: no cover - defensive + logger.warning("FTS query failed (%s); degrading to dense-only.", exc) + return [] + return [(int(r["rowid"]), float(r["score"])) for r in rows] + + def hydrate(self, chunk_ids: Sequence[int]) -> Dict[int, sqlite3.Row]: + """Fetch chunk + file rows for the given ids in one query.""" + if not chunk_ids: + return {} + placeholders = ",".join("?" for _ in chunk_ids) + rows = self._conn.execute( + "SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, " + " c.text, f.path AS path " + "FROM chunks c JOIN files f ON f.id = c.file_id " + f"WHERE c.id IN ({placeholders})", + tuple(chunk_ids), + ).fetchall() + return {int(r["id"]): r for r in rows} + + def iter_vectors( + self, batch: int = 1000 + ) -> Iterator[Tuple[np.ndarray, np.ndarray]]: + """Yield ``(ids, vectors)`` batches for rebuilding the FAISS index.""" + cur = self._conn.execute("SELECT id, vector FROM chunks ORDER BY id") + while True: + rows = cur.fetchmany(batch) + if not rows: + break + ids = np.array([int(r["id"]) for r in rows], dtype="int64") + vecs = np.vstack( + [np.frombuffer(r["vector"], dtype="float32") for r in rows] + ) + yield ids, vecs + + # --- stats --- + + def stats(self) -> IndexStats: + files = self._conn.execute("SELECT COUNT(*) AS n FROM files").fetchone()["n"] + chunks = self._conn.execute("SELECT COUNT(*) AS n FROM chunks").fetchone()["n"] + return IndexStats(total_files=int(files), total_chunks=int(chunks)) + + def total_chunks(self) -> int: + return int( + self._conn.execute("SELECT COUNT(*) AS n FROM chunks").fetchone()["n"] + ) diff --git a/coderag/store/vector_index.py b/coderag/store/vector_index.py new file mode 100644 index 0000000..6e6584c --- /dev/null +++ b/coderag/store/vector_index.py @@ -0,0 +1,183 @@ +"""FAISS vector index — a rebuildable cache over the vectors stored in SQLite. 
+ +Two backends behind one interface, selected by corpus size: +- **flat** (``IndexIDMap2(IndexFlatIP)``): exact cosine, ideal for small/medium repos. +- **ivf** (``IndexIVFFlat``): approximate, stays fast at 100k+ vectors. + +Both support ``add_with_ids`` and ``remove_ids``, so incremental indexing (delete a file's +old chunks, add the new ones) works identically regardless of backend. Because every vector +also lives in SQLite, the on-disk ``.faiss`` file is disposable and can be rebuilt at any +time (``rebuild_from_store``). +""" + +from __future__ import annotations + +import logging +import math +from pathlib import Path +from typing import TYPE_CHECKING, Tuple + +import faiss +import numpy as np + +from coderag.config import Config + +if TYPE_CHECKING: + from coderag.store.sqlite_store import SQLiteStore + +logger = logging.getLogger(__name__) + + +def _normalized(vectors: np.ndarray) -> np.ndarray: + """Return an L2-normalized float32 copy (cosine similarity via inner product).""" + mat = np.ascontiguousarray(vectors, dtype="float32") + if mat.size: + mat = mat.copy() + faiss.normalize_L2(mat) + return mat + + +def _derive_nlist(n: int, configured: int) -> int: + if configured > 0: + return max(1, min(configured, n)) + return max(1, min(int(4 * math.sqrt(n)), max(1, n // 39))) + + +class FaissVectorIndex: + def __init__(self, index: faiss.Index, kind: str, config: Config, dim: int) -> None: + self._index = index + self.kind = kind + self.config = config + self.dim = dim + + # --- construction / persistence --- + + @classmethod + def _empty_flat(cls, dim: int) -> faiss.Index: + return faiss.IndexIDMap2(faiss.IndexFlatIP(dim)) + + @classmethod + def open(cls, config: Config, dim: int) -> "FaissVectorIndex": + path = config.faiss_path + meta_path = Path(str(path) + ".kind") + if path.exists() and meta_path.exists(): + try: + index = faiss.read_index(str(path)) + kind = meta_path.read_text().strip() or "flat" + if kind == "ivf": + index.nprobe = 
config.ivf_nprobe + return cls(index, kind, config, dim) + except Exception as exc: # pragma: no cover - corrupt cache + logger.warning("Failed to load FAISS index (%s); starting empty.", exc) + return cls(cls._empty_flat(dim), "flat", config, dim) + + def save(self) -> None: + path = self.config.faiss_path + path.parent.mkdir(parents=True, exist_ok=True) + faiss.write_index(self._index, str(path)) + Path(str(path) + ".kind").write_text(self.kind) + + # --- properties --- + + @property + def ntotal(self) -> int: + return int(self._index.ntotal) + + # --- mutations --- + + def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: + if len(ids) == 0: + return + vecs = _normalized(vectors) + id_arr = np.ascontiguousarray(ids, dtype="int64") + self._index.add_with_ids(vecs, id_arr) + + def remove(self, ids) -> int: + ids = list(ids) + if not ids: + return 0 + selector = faiss.IDSelectorBatch(np.asarray(ids, dtype="int64")) + return int(self._index.remove_ids(selector)) + + def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """Return ``(ids, scores)`` for the top-k, with FAISS ``-1`` padding stripped.""" + if self.ntotal == 0: + return np.empty(0, dtype="int64"), np.empty(0, dtype="float32") + q = _normalized(np.asarray(query, dtype="float32").reshape(1, -1)) + k = min(k, self.ntotal) + scores, ids = self._index.search(q, k) + ids_row, scores_row = ids[0], scores[0] + mask = ids_row != -1 + return ids_row[mask].astype("int64"), scores_row[mask].astype("float32") + + # --- rebuild / consistency --- + + def _choose_kind(self, n: int) -> str: + if self.config.index_type == "flat": + return "flat" + if self.config.index_type == "ivf": + return "ivf" if n > 0 else "flat" + # auto + return "ivf" if n > self.config.ivf_threshold else "flat" + + def _build_ivf(self, ids: np.ndarray, vecs: np.ndarray) -> faiss.Index: + nlist = _derive_nlist(len(ids), self.config.ivf_nlist) + quantizer = faiss.IndexFlatIP(self.dim) + index = faiss.IndexIVFFlat( + 
quantizer, self.dim, nlist, faiss.METRIC_INNER_PRODUCT + ) + index.train(vecs) + index.add_with_ids(vecs, ids) + index.nprobe = self.config.ivf_nprobe + logger.info("Built IVF index: %d vectors, nlist=%d", len(ids), nlist) + return index + + def rebuild_from_store(self, store: "SQLiteStore") -> None: + """Discard the current index and rebuild it from the SQLite vectors.""" + n = store.total_chunks() + kind = self._choose_kind(n) + if n == 0: + self._index = self._empty_flat(self.dim) + self.kind = "flat" + self.save() + return + + if kind == "ivf": + # IVF needs all training vectors up front. + all_ids, all_vecs = [], [] + for ids, vecs in store.iter_vectors(): + all_ids.append(ids) + all_vecs.append(_normalized(vecs)) + ids = np.concatenate(all_ids) + vecs = np.vstack(all_vecs) + self._index = self._build_ivf(ids, vecs) + self.kind = "ivf" + else: + index = self._empty_flat(self.dim) + for ids, vecs in store.iter_vectors(): + index.add_with_ids(_normalized(vecs), np.ascontiguousarray(ids)) + self._index = index + self.kind = "flat" + logger.info("Built flat index: %d vectors", n) + self.save() + + def ensure_consistent(self, store: "SQLiteStore") -> None: + """Rebuild from SQLite if the cached vector count disagrees with the store.""" + if self.ntotal != store.total_chunks(): + logger.info( + "FAISS cache out of sync (%d vs %d chunks); rebuilding.", + self.ntotal, + store.total_chunks(), + ) + self.rebuild_from_store(store) + + def maybe_upgrade(self, store: "SQLiteStore") -> bool: + """Switch flat->ivf when an auto index grows past the threshold. 
Returns True + if a rebuild happened.""" + if self.config.index_type != "auto" or self.kind == "ivf": + return False + if store.total_chunks() > self.config.ivf_threshold: + logger.info("Corpus exceeded IVF threshold; upgrading flat -> ivf.") + self.rebuild_from_store(store) + return True + return False diff --git a/coderag/types.py b/coderag/types.py new file mode 100644 index 0000000..5fc8b0a --- /dev/null +++ b/coderag/types.py @@ -0,0 +1,81 @@ +"""Shared data types used across chunking, storage, retrieval, and the public API.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class Chunk: + """A unit of indexed code — usually a function/class/method, or a line window.""" + + text: str + start_line: int # 1-based, inclusive + end_line: int # 1-based, inclusive + language: str + symbol: Optional[str] = None # qualified name, e.g. "ClassName.method" + kind: str = "window" # "function" | "class" | "method" | "window" + + @property + def line_count(self) -> int: + return self.end_line - self.start_line + 1 + + +@dataclass +class SearchHit: + """A retrieval result, hydrated from the store.""" + + chunk_id: int + path: str + symbol: Optional[str] + kind: str + language: str + start_line: int + end_line: int + text: str + score: float # fused (RRF) score — relative ranking signal + similarity: float # raw cosine similarity in [0, 1] for display + + @property + def location(self) -> str: + return f"{self.path}:{self.start_line}" + + def as_dict(self) -> Dict[str, Any]: + return { + "chunk_id": self.chunk_id, + "path": self.path, + "symbol": self.symbol, + "kind": self.kind, + "language": self.language, + "start_line": self.start_line, + "end_line": self.end_line, + "text": self.text, + "score": self.score, + "similarity": self.similarity, + "location": self.location, + } + + +@dataclass +class IndexStats: + """Summary of an indexing run or the current index state.""" + + files_indexed: 
int = 0 + files_skipped: int = 0 + files_removed: int = 0 + chunks_added: int = 0 + chunks_removed: int = 0 + total_files: int = 0 + total_chunks: int = 0 + + def as_dict(self) -> Dict[str, Any]: + return { + "files_indexed": self.files_indexed, + "files_skipped": self.files_skipped, + "files_removed": self.files_removed, + "chunks_added": self.chunks_added, + "chunks_removed": self.chunks_removed, + "total_files": self.total_files, + "total_chunks": self.total_chunks, + } diff --git a/coderag/watch.py b/coderag/watch.py new file mode 100644 index 0000000..41e9104 --- /dev/null +++ b/coderag/watch.py @@ -0,0 +1,93 @@ +"""Debounced filesystem watcher that keeps the index live as files change. + +Replaces the old ``monitor.py``. Two important differences: events are *debounced* (editors +fire several writes per save) and each flushed path is re-hashed by the indexer, so an +unchanged file costs nothing and a changed file is updated without duplicating vectors. +""" + +from __future__ import annotations + +import logging +import threading +import time +from pathlib import Path +from typing import TYPE_CHECKING, Set + +from watchdog.events import FileSystemEventHandler +from watchdog.observers import Observer + +from coderag.chunking.languages import detect_language + +if TYPE_CHECKING: + from coderag.api import CodeRAG + +logger = logging.getLogger(__name__) + + +class _Handler(FileSystemEventHandler): + def __init__(self, pending: Set[str], lock: threading.Lock) -> None: + self._pending = pending + self._lock = lock + + def _note(self, path: str) -> None: + if path and detect_language(path): + with self._lock: + self._pending.add(path) + + def on_modified(self, event): + if not event.is_directory: + self._note(event.src_path) + + def on_created(self, event): + if not event.is_directory: + self._note(event.src_path) + + def on_deleted(self, event): + if not event.is_directory: + self._note(event.src_path) + + def on_moved(self, event): + if not event.is_directory: + 
self._note(event.src_path) + self._note(getattr(event, "dest_path", "")) + + +def watch(cr: "CodeRAG", debounce: float = 0.5) -> None: + """Block, keeping ``cr``'s index in sync with its watched directory until Ctrl-C.""" + root = cr.config.watched_dir + pending: Set[str] = set() + lock = threading.Lock() + handler = _Handler(pending, lock) + observer = Observer() + observer.schedule(handler, str(root), recursive=True) + observer.start() + logger.info("Watching %s for changes (Ctrl-C to stop)...", root) + + try: + while True: + time.sleep(debounce) + with lock: + batch = set(pending) + pending.clear() + for raw in batch: + _apply(cr, raw) + except KeyboardInterrupt: + logger.info("Stopping watcher...") + finally: + observer.stop() + observer.join() + + +def _apply(cr: "CodeRAG", raw: str) -> None: + path = Path(raw) + try: + if path.exists(): + stats = cr.index(path) + if stats.files_indexed: + logger.info("Reindexed %s (+%d chunks)", raw, stats.chunks_added) + else: + removed = cr.delete_path(path) + if removed: + logger.info("Removed %s (-%d chunks)", raw, removed) + except Exception as exc: # pragma: no cover - defensive, keep the loop alive + logger.error("Failed to process %s: %s", raw, exc) From 23336d69385f167855c70f41702a25896a6883eb Mon Sep 17 00:00:00 2001 From: fastsoab Date: Mon, 1 Jun 2026 20:00:37 +0200 Subject: [PATCH 2/5] =?UTF-8?q?feat:=20add=20standalone=20surfaces=20?= =?UTF-8?q?=E2=80=94=20CLI,=20HTTP/REST=20API,=20and=20Streamlit=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All four surfaces are thin adapters over the CodeRAG facade: - coderag CLI: index / search / watch / serve / ui / status (entry point 'coderag'). - FastAPI server (coderag serve): GET /search /status /file, POST /index ([server] extra). - Streamlit UI (coderag ui): streamed answers, file:line citations, scores, reindex ([ui] extra). Removes the old main.py / app.py / prompt_flow.py / cli.py / scripts entry points. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- app.py | 169 ----------------------- coderag/cli.py | 56 -------- coderag/surfaces/__init__.py | 1 + coderag/surfaces/cli.py | 216 ++++++++++++++++++++++++++++++ coderag/surfaces/http_api.py | 74 ++++++++++ coderag/surfaces/streamlit_app.py | 95 +++++++++++++ main.py | 131 ------------------ prompt_flow.py | 123 ----------------- scripts/initialize_index.py | 10 -- scripts/run_monitor.py | 4 - 10 files changed, 386 insertions(+), 493 deletions(-) delete mode 100644 app.py delete mode 100644 coderag/cli.py create mode 100644 coderag/surfaces/__init__.py create mode 100644 coderag/surfaces/cli.py create mode 100644 coderag/surfaces/http_api.py create mode 100644 coderag/surfaces/streamlit_app.py delete mode 100644 main.py delete mode 100644 prompt_flow.py delete mode 100644 scripts/initialize_index.py delete mode 100644 scripts/run_monitor.py diff --git a/app.py b/app.py deleted file mode 100644 index ee7bec8..0000000 --- a/app.py +++ /dev/null @@ -1,169 +0,0 @@ -import logging -from typing import Optional as _Optional - -import streamlit as st -from openai import OpenAI - -from coderag.config import OPENAI_API_KEY -from prompt_flow import execute_rag_flow - -# Configure logging for Streamlit -# Use force=True to ensure Streamlit's default handlers don't suppress ours -logging.basicConfig(level=logging.INFO, force=True) -logger = logging.getLogger(__name__) - -# Initialize the OpenAI client with error handling -client: _Optional[OpenAI] -try: - if OPENAI_API_KEY: - client = OpenAI(api_key=OPENAI_API_KEY) - logger.info("OpenAI client initialized successfully") - else: - client = None - logger.error("OpenAI API key not found") -except Exception as e: - client = None - logger.error(f"Failed to initialize OpenAI client: {e}") - -# Set page config -st.set_page_config( - page_title="CodeRAG: Your Coding Assistant", page_icon="🤖", layout="wide" -) - -st.title("🤖 CodeRAG: Your Coding Assistant") -st.markdown("*AI-powered code 
retrieval and assistance using RAG technology*") - -# Initialize session state -if "messages" not in st.session_state: - st.session_state.messages = [] -if "conversation_context" not in st.session_state: - st.session_state.conversation_context = [] - -# Sidebar with controls -with st.sidebar: - st.header("Controls") - - if st.button("🗑️ Clear Conversation", type="secondary"): - st.session_state.messages = [] - st.session_state.conversation_context = [] - st.rerun() - - # Status indicators - st.header("Status") - if client: - st.success("✅ OpenAI Connected") - else: - st.error("❌ OpenAI Not Connected") - st.error("Please check your API key in .env file") - - # Conversation stats - if st.session_state.messages: - st.info(f"💬 {len(st.session_state.messages)} messages in conversation") - -# Display chat history with improved formatting -for message in st.session_state.messages: - with st.chat_message(message["role"]): - if message["role"] == "assistant" and "error" in message["content"].lower(): - st.error(message["content"]) - else: - st.markdown(message["content"]) - -# Chat input with validation -if not client: - st.warning( - "⚠️ OpenAI client not available. Please configure your API key to use " - "the assistant." 
- ) - st.stop() - -if prompt := st.chat_input("What is your coding question?", disabled=not client): - # Validate input - if not prompt.strip(): - st.warning("Please enter a valid question.") - st.stop() - - # Add user message - st.session_state.messages.append({"role": "user", "content": prompt}) - # Add to conversation context for better continuity - st.session_state.conversation_context.append(f"User: {prompt}") - - with st.chat_message("user"): - st.markdown(prompt) - - with st.chat_message("assistant"): - message_placeholder = st.empty() - - # Show loading indicator - with st.spinner("🔍 Searching codebase and generating response..."): - try: - # Execute RAG flow with error handling - response = execute_rag_flow(prompt) - - # Check if response indicates an error - if ( - response.startswith("Error:") - or "error occurred" in response.lower() - ): - message_placeholder.error(response) - else: - message_placeholder.markdown(response) - - full_response = response - - except Exception as e: - error_message = f"Unexpected error: {str(e)}" - logger.error(f"Streamlit error: {error_message}") - message_placeholder.error(error_message) - full_response = error_message - - # Add assistant response to session - st.session_state.messages.append( - {"role": "assistant", "content": full_response} - ) - # Add to conversation context - st.session_state.conversation_context.append( - f"Assistant: {full_response[:200]}..." 
- ) # Truncate for context - - # Keep conversation context manageable (last 10 exchanges) - if len(st.session_state.conversation_context) > 20: - st.session_state.conversation_context = ( - st.session_state.conversation_context[-20:] - ) - -# Footer with helpful information -if not st.session_state.messages: - st.markdown("---") - st.markdown("### 💡 Tips for better results:") - st.markdown( - """ - - Ask specific questions about your code - - Mention file names or functions you're interested in - - Request explanations, improvements, or debugging help - - Ask about code patterns or best practices - """ - ) - - st.markdown("### 🚀 Example queries:") - col1, col2 = st.columns(2) - with col1: - if st.button("📝 Explain the indexing process"): - st.session_state.messages.append( - { - "role": "user", - "content": "Explain how the FAISS indexing works in this codebase", - } - ) - st.rerun() - with col2: - if st.button("🐛 Help debug search issues"): - st.session_state.messages.append( - { - "role": "user", - "content": ( - "How can I debug issues with code search not returning " - "results?" - ), - } - ) - st.rerun() diff --git a/coderag/cli.py b/coderag/cli.py deleted file mode 100644 index d33dd79..0000000 --- a/coderag/cli.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Minimal command-line interface for querying an existing CodeRAG index.""" - -import argparse -import logging -import textwrap -from typing import List - -from coderag.search import search_code - - -def _format_result(result: dict, index: int) -> str: - snippet = textwrap.shorten( - result.get("content", "").replace("\n", " "), width=200, placeholder="..." - ) - return ( - f"{index}. {result.get('filename')} ({result.get('filepath')})\n" - f" similarity={result.get('distance', 0.0):.3f}\n" - f" {snippet}" - ) - - -def main(argv: List[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Query a local CodeRAG FAISS index without the Streamlit UI." 
- ) - parser.add_argument("query", help="Text to search for in the indexed codebase.") - parser.add_argument( - "-k", - type=int, - default=5, - help="Maximum number of matches to display (defaults to 5).", - ) - parser.add_argument( - "--log-level", - default="WARNING", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="Logging verbosity for debugging issues.", - ) - - args = parser.parse_args(argv) - - logging.basicConfig(level=getattr(logging, args.log_level)) - - results = search_code(args.query, k=args.k) - if not results: - print("No results found; ensure the FAISS index exists and contains data.") - return 1 - - for idx, item in enumerate(results, start=1): - print(_format_result(item, idx)) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/coderag/surfaces/__init__.py b/coderag/surfaces/__init__.py new file mode 100644 index 0000000..dfa6107 --- /dev/null +++ b/coderag/surfaces/__init__.py @@ -0,0 +1 @@ +"""User-facing surfaces: CLI, HTTP server, and Streamlit UI — all thin over the facade.""" diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py new file mode 100644 index 0000000..9553c72 --- /dev/null +++ b/coderag/surfaces/cli.py @@ -0,0 +1,216 @@ +"""The ``coderag`` command — index, search, watch, serve, ui, status. + +Every subcommand is a thin adapter over :class:`coderag.api.CodeRAG`. 
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import sys +import textwrap +from pathlib import Path +from typing import List, Optional + +from coderag import __version__ +from coderag.api import CodeRAG +from coderag.config import Config + + +def _build_config(args: argparse.Namespace) -> Config: + overrides: dict = {} + if getattr(args, "watched_dir", None): + overrides["watched_dir"] = Path(args.watched_dir).expanduser() + if getattr(args, "store_dir", None): + overrides["store_dir"] = Path(args.store_dir).expanduser() + if getattr(args, "provider", None): + overrides["provider"] = args.provider + if getattr(args, "model", None): + overrides["model"] = args.model + return Config.from_env(**overrides) + + +# --- commands --- + + +def cmd_index(args: argparse.Namespace) -> int: + cr = CodeRAG(_build_config(args)) + stats = cr.indexer.index( + Path(args.path).expanduser() if args.path else None, + full=args.full, + progress=not args.quiet, + ) + print( + f"Indexed {stats.files_indexed} file(s), skipped {stats.files_skipped}, " + f"removed {stats.files_removed}. " + f"Total: {stats.total_files} files / {stats.total_chunks} chunks." + ) + return 0 + + +def cmd_search(args: argparse.Namespace) -> int: + cr = CodeRAG(_build_config(args)) + hits = cr.search(args.query, top_k=args.k) + if args.json: + print(json.dumps([h.as_dict() for h in hits], indent=2)) + return 0 if hits else 1 + if not hits: + print("No results. Has the codebase been indexed? Try: coderag index") + return 1 + for i, h in enumerate(hits, 1): + label = f" ({h.symbol})" if h.symbol else "" + snippet = textwrap.shorten( + h.text.replace("\n", " "), width=160, placeholder=" …" + ) + print(f"{i}. 
{h.location}{label} [{h.kind}, sim={h.similarity:.2f}]") + print(f" {snippet}") + if args.answer: + _print_answer(cr, args.query, args.k) + return 0 + + +def _print_answer(cr: CodeRAG, query: str, k: int) -> None: + from coderag.llm import stream_answer + + print("\n--- Answer ---") + try: + for token in stream_answer(cr, query, k): + sys.stdout.write(token) + sys.stdout.flush() + print() + except RuntimeError as exc: + print(f"(LLM answer unavailable: {exc})") + + +def cmd_status(args: argparse.Namespace) -> int: + cr = CodeRAG(_build_config(args)) + print(json.dumps(cr.status(), indent=2)) + return 0 + + +def cmd_watch(args: argparse.Namespace) -> int: + from coderag.watch import watch + + cr = CodeRAG(_build_config(args)) + print(f"Indexing {cr.config.watched_dir} before watching...") + cr.indexer.index(progress=not args.quiet) + watch(cr) + return 0 + + +def cmd_serve(args: argparse.Namespace) -> int: + try: + from coderag.surfaces.http_api import run_server + except ImportError: + print( + "The HTTP server needs extra deps. Install with: pip install 'coderag[server]'" + ) + return 1 + cr = CodeRAG(_build_config(args)) + run_server(cr, host=args.host, port=args.port) + return 0 + + +def cmd_ui(args: argparse.Namespace) -> int: + import subprocess + + app = Path(__file__).with_name("streamlit_app.py") + try: + return subprocess.call( + ["streamlit", "run", str(app), "--", *_passthrough(args)] + ) + except FileNotFoundError: + print("Streamlit is not installed. 
Install with: pip install 'coderag[ui]'") + return 1 + + +def _passthrough(args: argparse.Namespace) -> List[str]: + out: List[str] = [] + if getattr(args, "watched_dir", None): + out += ["--watched-dir", str(args.watched_dir)] + if getattr(args, "store_dir", None): + out += ["--store-dir", str(args.store_dir)] + return out + + +# --- parser --- + + +def _add_common(p: argparse.ArgumentParser) -> None: + p.add_argument("--watched-dir", help="Codebase root to index/search.") + p.add_argument( + "--store-dir", help="Where the index/database live (default ./.coderag)." + ) + p.add_argument("--provider", help="Embedding provider: fastembed | openai | fake.") + p.add_argument("--model", help="Embedding model name.") + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="coderag", + description="Standalone, local-first semantic code-search engine.", + ) + parser.add_argument("--version", action="version", version=f"coderag {__version__}") + sub = parser.add_subparsers(dest="command", required=True) + + p_index = sub.add_parser( + "index", help="Index (or incrementally update) a codebase." + ) + p_index.add_argument( + "path", nargs="?", help="Path to index (defaults to watched dir)." + ) + p_index.add_argument("--full", action="store_true", help="Force a clean rebuild.") + p_index.add_argument("--quiet", action="store_true", help="Hide the progress bar.") + _add_common(p_index) + p_index.set_defaults(func=cmd_index) + + p_search = sub.add_parser("search", help="Search the indexed codebase.") + p_search.add_argument("query", help="What to search for.") + p_search.add_argument( + "-k", type=int, default=8, help="Number of results (default 8)." 
+ ) + p_search.add_argument("--json", action="store_true", help="Emit JSON.") + p_search.add_argument( + "--answer", + action="store_true", + help="Also stream an LLM answer (needs OpenAI key).", + ) + _add_common(p_search) + p_search.set_defaults(func=cmd_search) + + p_status = sub.add_parser("status", help="Show index statistics.") + _add_common(p_status) + p_status.set_defaults(func=cmd_status) + + p_watch = sub.add_parser( + "watch", help="Index, then keep the index live on changes." + ) + p_watch.add_argument("--quiet", action="store_true") + _add_common(p_watch) + p_watch.set_defaults(func=cmd_watch) + + p_serve = sub.add_parser("serve", help="Run the HTTP/REST API server.") + p_serve.add_argument("--host", default="127.0.0.1") + p_serve.add_argument("--port", type=int, default=8000) + _add_common(p_serve) + p_serve.set_defaults(func=cmd_serve) + + p_ui = sub.add_parser("ui", help="Launch the Streamlit web UI.") + _add_common(p_ui) + p_ui.set_defaults(func=cmd_ui) + + return parser + + +def main(argv: Optional[List[str]] = None) -> int: + logging.basicConfig( + level=logging.INFO, format="%(levelname)s %(name)s: %(message)s" + ) + parser = build_parser() + args = parser.parse_args(argv) + return int(args.func(args)) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/coderag/surfaces/http_api.py b/coderag/surfaces/http_api.py new file mode 100644 index 0000000..dc99880 --- /dev/null +++ b/coderag/surfaces/http_api.py @@ -0,0 +1,74 @@ +"""Self-hostable HTTP/REST API over a CodeRAG instance (optional ``[server]`` extra). + +Lets custom apps, remote frontends, or a shared team deployment query a big codebase over +the network. Endpoints: ``GET /search``, ``POST /index``, ``GET /status``, ``GET /file``. + +Note: this module intentionally does NOT use ``from __future__ import annotations`` — FastAPI +must see the real Pydantic model classes (not stringized annotations) to bind request bodies. 
+""" + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from coderag.api import CodeRAG + + +def create_app(cr: "CodeRAG"): + from fastapi import FastAPI, HTTPException, Query + from fastapi.middleware.cors import CORSMiddleware + from pydantic import BaseModel + + app = FastAPI( + title="CodeRAG", + version="1.0.0", + description="Semantic code-search engine.", + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], + ) + + class IndexRequest(BaseModel): + path: Optional[str] = None + full: bool = False + + @app.get("/status") + def status() -> dict: + return cr.status() + + @app.get("/search") + def search( + q: str = Query(..., description="Search query"), + k: int = Query(8, ge=1, le=100), + ) -> dict: + hits = cr.search(q, top_k=k) + return {"query": q, "count": len(hits), "results": [h.as_dict() for h in hits]} + + @app.post("/index") + def index(req: IndexRequest) -> dict: + stats = cr.index(req.path, full=req.full) + return stats.as_dict() + + @app.get("/file") + def get_file( + path: str = Query(...), + start_line: Optional[int] = Query(None, ge=1), + end_line: Optional[int] = Query(None, ge=1), + ) -> dict: + try: + content = cr.get_file(path, start_line, end_line) + except (ValueError, FileNotFoundError) as exc: + raise HTTPException(status_code=404, detail=str(exc)) + return {"path": path, "content": content} + + return app + + +def run_server(cr: "CodeRAG", host: str = "127.0.0.1", port: int = 8000) -> None: + import uvicorn + + # Warm the index/provider so the first request isn't slow. + cr.status() + uvicorn.run(create_app(cr), host=host, port=port) diff --git a/coderag/surfaces/streamlit_app.py b/coderag/surfaces/streamlit_app.py new file mode 100644 index 0000000..1d97a37 --- /dev/null +++ b/coderag/surfaces/streamlit_app.py @@ -0,0 +1,95 @@ +"""Streamlit UI for CodeRAG (optional ``[ui]`` extra). 
+ +Search box + retrieved chunks shown with ``path:line`` citations and similarity scores, an +optional streamed LLM answer, and a sidebar with index status and a reindex button. Launch +via ``coderag ui`` (which runs ``streamlit run`` on this file). +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +import streamlit as st + +from coderag.api import CodeRAG +from coderag.config import Config + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--watched-dir") + parser.add_argument("--store-dir") + args, _ = parser.parse_known_args() + return args + + +@st.cache_resource +def get_engine(watched_dir: str | None, store_dir: str | None) -> CodeRAG: + overrides: dict = {} + if watched_dir: + overrides["watched_dir"] = Path(watched_dir) + if store_dir: + overrides["store_dir"] = Path(store_dir) + return CodeRAG(Config.from_env(**overrides)) + + +def main() -> None: + args = _parse_args() + st.set_page_config(page_title="CodeRAG", page_icon="🔎", layout="wide") + st.title("🔎 CodeRAG") + st.caption("Local-first semantic search over your codebase.") + + cr = get_engine(args.watched_dir, args.store_dir) + + with st.sidebar: + st.header("Index") + try: + status = cr.status() + st.metric("Files", status["total_files"]) + st.metric("Chunks", status["total_chunks"]) + st.write(f"**Model:** `{status['model']}`") + st.write(f"**Index:** `{status['index_type']}`") + st.write(f"**Root:** `{status['watched_dir']}`") + except Exception as exc: # noqa: BLE001 + st.error(f"Could not read index: {exc}") + if st.button("🔄 Reindex"): + with st.spinner("Reindexing..."): + stats = cr.index() + st.success( + f"+{stats.files_indexed} files, {stats.total_chunks} chunks total." + ) + want_answer = st.toggle("Generate LLM answer", value=False) + top_k = st.slider("Results", min_value=1, max_value=20, value=8) + + query = st.text_input("Search", placeholder="e.g. 
where is retry/backoff handled?") + if not query: + return + + hits = cr.search(query, top_k=top_k) + if not hits: + st.warning( + "No results. Have you indexed this codebase? Use the Reindex button." + ) + return + + if want_answer: + from coderag.llm import stream_answer + + st.subheader("Answer") + try: + st.write_stream(stream_answer(cr, query, top_k)) + except RuntimeError as exc: + st.info(f"LLM answer unavailable: {exc}") + + st.subheader(f"{len(hits)} results") + for hit in hits: + title = f"`{hit.location}`" + if hit.symbol: + title += f" — **{hit.symbol}** ({hit.kind})" + title += f" · sim {hit.similarity:.2f}" + with st.expander(title, expanded=False): + st.code(hit.text, language=hit.language) + + +main() diff --git a/main.py b/main.py deleted file mode 100644 index 7061f1c..0000000 --- a/main.py +++ /dev/null @@ -1,131 +0,0 @@ -import logging -import os -import warnings - -from coderag.config import WATCHED_DIR -from coderag.embeddings import generate_embeddings -from coderag.index import add_to_index, clear_index, save_index -from coderag.monitor import should_ignore_path, start_monitoring - -# Configure comprehensive logging in the entrypoint only -handlers: list[logging.Handler] = [logging.StreamHandler()] -try: - # Enable file logging only if environment allows it - if os.getenv("CODERAG_ENABLE_FILE_LOGS", "1") == "1": - handlers.append(logging.FileHandler("coderag.log", encoding="utf-8")) -except Exception: - # Ignore file handler failures (e.g., read-only FS) - pass - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=handlers, - force=True, -) - -logger = logging.getLogger(__name__) - -# Suppress transformers warnings -warnings.filterwarnings( - "ignore", category=FutureWarning, module="transformers.tokenization_utils_base" -) - - -def full_reindex() -> int: - """Perform a full reindex of the entire codebase. 
- - Returns: - Number of files successfully processed - """ - logger.info("Starting full reindexing of the codebase...") - - if not os.path.exists(WATCHED_DIR): - logger.error(f"Watched directory does not exist: {WATCHED_DIR}") - return 0 - - files_processed = 0 - files_failed = 0 - - try: - for root, _, files in os.walk(WATCHED_DIR): - if should_ignore_path(root): - logger.debug(f"Ignoring directory: {root}") - continue - - for file in files: - filepath = os.path.join(root, file) - if should_ignore_path(filepath): - logger.debug(f"Ignoring file: {filepath}") - continue - - if file.endswith(".py"): - logger.debug(f"Processing file: {filepath}") - try: - with open(filepath, "r", encoding="utf-8") as f: - full_content = f.read() - - if not full_content.strip(): - logger.debug(f"Skipping empty file: {filepath}") - continue - - embeddings = generate_embeddings(full_content) - if embeddings is not None: - add_to_index(embeddings, full_content, file, filepath) - files_processed += 1 - else: - logger.warning( - f"Failed to generate embeddings for {filepath}" - ) - files_failed += 1 - - except (IOError, UnicodeDecodeError) as e: - logger.error(f"Error reading file {filepath}: {str(e)}") - files_failed += 1 - except Exception as e: - logger.error( - f"Unexpected error processing file {filepath}: {str(e)}" - ) - files_failed += 1 - - save_index() - logger.info( - f"Full reindexing completed. 
{files_processed} files processed, " - f"{files_failed} files failed" - ) - return files_processed - - except Exception as e: - logger.error(f"Critical error during reindexing: {str(e)}") - return files_processed - - -def main() -> None: - """Main entry point for the CodeRAG indexing system.""" - try: - logger.info("Starting CodeRAG indexing system") - - # Completely clear the FAISS index and metadata - logger.info("Clearing existing index...") - clear_index() - - # Perform a full reindex of the codebase - logger.info("Starting full reindex...") - processed_files = full_reindex() - - if processed_files == 0: - logger.warning("No files were processed during indexing") - else: - logger.info("Indexing complete. Starting file monitoring...") - # Start monitoring the directory for changes - start_monitoring() - - except KeyboardInterrupt: - logger.info("Received interrupt signal, shutting down gracefully") - except Exception as e: - logger.error(f"Critical error in main: {str(e)}") - raise - - -if __name__ == "__main__": - main() diff --git a/prompt_flow.py b/prompt_flow.py deleted file mode 100644 index b43d129..0000000 --- a/prompt_flow.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -from typing import Optional as _Optional - -from openai import OpenAI - -from coderag.config import OPENAI_API_KEY, OPENAI_CHAT_MODEL -from coderag.search import search_code - -logger = logging.getLogger(__name__) - -# Initialize OpenAI client with error handling -client: _Optional[OpenAI] -try: - if not OPENAI_API_KEY: - raise ValueError("OpenAI API key not found") - client = OpenAI(api_key=OPENAI_API_KEY) - logger.info(f"OpenAI client initialized with chat model: {OPENAI_CHAT_MODEL}") -except Exception as e: - logger.error(f"Failed to initialize OpenAI client: {e}") - client = None - -SYSTEM_PROMPT = ( - "You are an expert coding assistant. Your task is to help users with their " - "question. 
Use the retrieved code context to inform your responses, but feel " - "free to suggest better solutions if appropriate." -) - -PRE_PROMPT = ( - "Based on the user's query and the following code context, provide a helpful " - "response. If improvements can be made, suggest them with explanations.\n\n" - "User Query: {query}\n\n" - "Retrieved Code Context:\n{code_context}\n\nYour response:" -) - - -def execute_rag_flow(user_query: str) -> str: - """Execute the RAG flow for answering user queries. - - Args: - user_query: The user's question or request - - Returns: - AI-generated response based on code context - """ - try: - if not client: - logger.error("OpenAI client not initialized") - return ( - "Error: AI service is not available. Please check your " - "OpenAI API key." - ) - - if not user_query or not user_query.strip(): - logger.warning("Empty query received") - return "Please provide a question or request." - - logger.info(f"Processing query: '{user_query[:50]}...'") - - # Perform code search - search_results = search_code(user_query) - - if not search_results: - logger.info("No relevant code found for query") - return ( - "No relevant code found for your query. The codebase might not be " - "indexed yet or your query might be too specific." - ) - - logger.debug(f"Found {len(search_results)} search results") - - # Prepare code context with error handling - try: - code_context = "\n\n".join( - [ - ( - f"File: {result['filename']}\n" - f"Path: {result['filepath']}\n" - # Cosine similarity (IndexFlatIP returns inner product) - f"Similarity: {max(0.0, min(1.0, result['distance'])):.3f}\n" - f"{result['content']}" - ) - for result in search_results[:3] # Limit to top 3 results - ] - ) - except (KeyError, TypeError) as e: - logger.error(f"Error preparing code context: {e}") - return "Error processing search results. Please try again." 
- - # Construct the full prompt - full_prompt = PRE_PROMPT.format(query=user_query, code_context=code_context) - - # Generate response using OpenAI with error handling - try: - logger.debug("Sending request to OpenAI") - # Rough heuristic: keep total under ~7000 tokens - est_prompt_tokens = max(1, len(full_prompt) // 4) - max_completion = max(256, min(2000, 7000 - est_prompt_tokens)) - response = client.chat.completions.create( - model=OPENAI_CHAT_MODEL, - messages=[ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": full_prompt}, - ], - temperature=0.3, - max_tokens=max_completion, - timeout=60, - ) - - if not response.choices or not response.choices[0].message.content: - logger.error("Empty response from OpenAI") - return "Error: Received empty response from AI service." - - result = response.choices[0].message.content.strip() - logger.info("Successfully generated response") - return result - - except Exception as e: - logger.error(f"OpenAI API error: {str(e)}") - return "Error communicating with AI service. Please try again later." - - except Exception as e: - logger.error(f"Unexpected error in RAG flow: {str(e)}") - return "An unexpected error occurred. Please try again." 
diff --git a/scripts/initialize_index.py b/scripts/initialize_index.py deleted file mode 100644 index 03ee424..0000000 --- a/scripts/initialize_index.py +++ /dev/null @@ -1,10 +0,0 @@ -from coderag.index import save_index - - -def initialize_index(): - save_index() - print("FAISS index initialized and saved.") - - -if __name__ == "__main__": - initialize_index() diff --git a/scripts/run_monitor.py b/scripts/run_monitor.py deleted file mode 100644 index 9e0dcd5..0000000 --- a/scripts/run_monitor.py +++ /dev/null @@ -1,4 +0,0 @@ -from coderag.monitor import start_monitoring - -if __name__ == "__main__": - start_monitoring() From 06556547aba79dc13e875fdefbab58a91c949115 Mon Sep 17 00:00:00 2001 From: fastsoab Date: Mon, 1 Jun 2026 20:00:45 +0200 Subject: [PATCH 3/5] test: add offline, deterministic test suite (54 tests) Covers config/providers, SQLite store + Flat/IVF vector index, chunking across languages, incremental indexing + no-duplicate invariant, RRF + hybrid search, and the CLI/HTTP/watcher surfaces. Default run is fully offline via a deterministic fake embedder; the real fastembed model is exercised only under -m integration. Removes the old smoke tests. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/conftest.py | 39 ++++++++ tests/test_chunking.py | 121 ++++++++++++++++++++++++ tests/test_config_and_providers.py | 74 +++++++++++++++ tests/test_faiss.py | 53 ----------- tests/test_index.py | 42 --------- tests/test_indexer.py | 104 +++++++++++++++++++++ tests/test_real_providers.py | 60 ++++++++++++ tests/test_retrieval.py | 81 ++++++++++++++++ tests/test_search.py | 57 ------------ tests/test_store.py | 145 +++++++++++++++++++++++++++++ tests/test_surfaces.py | 130 ++++++++++++++++++++++++++ 11 files changed, 754 insertions(+), 152 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_chunking.py create mode 100644 tests/test_config_and_providers.py delete mode 100644 tests/test_faiss.py delete mode 100644 tests/test_index.py create mode 100644 tests/test_indexer.py create mode 100644 tests/test_real_providers.py create mode 100644 tests/test_retrieval.py delete mode 100644 tests/test_search.py create mode 100644 tests/test_store.py create mode 100644 tests/test_surfaces.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..addc903 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,39 @@ +"""Shared pytest fixtures. Everything here is offline and deterministic. + +The default embedding provider for tests is the ``fake`` provider, so the suite never +downloads a model or touches the network. Real backends are exercised only by tests +marked ``@pytest.mark.integration`` (deselected in CI). 
+""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from coderag.config import Config + + +@pytest.fixture +def config(tmp_path: Path) -> Config: + """A fake-provider config rooted at an isolated tmp dir.""" + return Config( + provider="fake", + watched_dir=tmp_path / "repo", + store_dir=tmp_path / "store", + ivf_threshold=20, # tiny so IVF-path tests don't need huge corpora + ) + + +@pytest.fixture +def repo(tmp_path: Path) -> Path: + """An empty repo directory under tmp.""" + d = tmp_path / "repo" + d.mkdir(parents=True, exist_ok=True) + return d + + +def write(path: Path, content: str) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + return path diff --git a/tests/test_chunking.py b/tests/test_chunking.py new file mode 100644 index 0000000..b5c254a --- /dev/null +++ b/tests/test_chunking.py @@ -0,0 +1,121 @@ +"""P2 tests: symbol-aware chunking for Python + tree-sitter languages, plus fallbacks.""" + +from __future__ import annotations + +from coderag.chunking import chunk_file +from coderag.chunking.languages import detect_language +from coderag.config import Config + +CFG = Config(provider="fake", window_lines=10, window_overlap=2, max_chunk_lines=50) + + +def _symbols(chunks): + return {c.symbol for c in chunks if c.symbol} + + +def test_detect_language(): + assert detect_language("a.py") == "python" + assert detect_language("a.tsx") == "tsx" + assert detect_language("a.rs") == "rust" + assert detect_language("a.unknownext") is None + + +def test_python_functions_and_methods(): + src = ( + "import os\n" + "\n" + "def top_level():\n" + " return 1\n" + "\n" + "class Greeter:\n" + ' """A greeter."""\n' + " def hello(self):\n" + " return 'hi'\n" + "\n" + " def bye(self):\n" + " return 'bye'\n" + ) + chunks = chunk_file(src, "python", CFG) + syms = _symbols(chunks) + assert "top_level" in syms + assert "Greeter" in syms + assert "Greeter.hello" in syms + assert 
"Greeter.bye" in syms + # method chunk should contain its body and nothing from the sibling method + hello = next(c for c in chunks if c.symbol == "Greeter.hello") + assert "return 'hi'" in hello.text + assert "return 'bye'" not in hello.text + + +def test_python_decorator_included_in_span(): + src = "@property\ndef name(self):\n return self._n\n" + chunks = chunk_file(src, "python", CFG) + fn = next(c for c in chunks if c.symbol == "name") + assert fn.start_line == 1 + assert "@property" in fn.text + + +def test_python_syntax_error_falls_back_to_windows(): + src = "def broken(:\n this is not python\n" * 5 + chunks = chunk_file(src, "python", CFG) + assert chunks # did not crash + assert all(c.kind == "window" for c in chunks) + + +def test_non_overlapping_coverage(): + src = "\n".join(f"x{i} = {i}" for i in range(40)) + chunks = chunk_file(src, "python", CFG) + # windows may overlap by design, but symbol chunks must not duplicate lines wildly + assert chunks + assert all(c.start_line <= c.end_line for c in chunks) + + +def test_oversized_symbol_is_split(): + body = "\n".join(f" a{i} = {i}" for i in range(120)) + src = f"def huge():\n{body}\n" + chunks = chunk_file(src, "python", CFG) + huge = [c for c in chunks if c.symbol == "huge"] + assert len(huge) > 1 # split into multiple windows + + +def test_javascript_symbols(): + src = ( + "function add(a, b) {\n return a + b;\n}\n\n" + "class Counter {\n inc() { this.n++; }\n}\n" + ) + chunks = chunk_file(src, "javascript", CFG) + syms = _symbols(chunks) + assert "add" in syms + assert "Counter" in syms + assert "inc" in syms + + +def test_go_symbols(): + src = ( + "package main\n\n" + "func Add(a, b int) int {\n\treturn a + b\n}\n\n" + "type Point struct {\n\tX int\n}\n" + ) + chunks = chunk_file(src, "go", CFG) + syms = _symbols(chunks) + assert "Add" in syms + assert "Point" in syms + + +def test_rust_symbols(): + src = 'fn main() {\n println!("hi");\n}\n\nstruct Foo {\n x: i32,\n}\n' + chunks = chunk_file(src, 
"rust", CFG) + syms = _symbols(chunks) + assert "main" in syms + assert "Foo" in syms + + +def test_unknown_language_uses_windows(): + src = "\n".join(f"line {i}" for i in range(30)) + chunks = chunk_file(src, "text", CFG) + assert chunks + assert all(c.kind == "window" for c in chunks) + + +def test_empty_file_yields_nothing(): + assert chunk_file(" \n \n", "python", CFG) == [] diff --git a/tests/test_config_and_providers.py b/tests/test_config_and_providers.py new file mode 100644 index 0000000..3bf1ca0 --- /dev/null +++ b/tests/test_config_and_providers.py @@ -0,0 +1,74 @@ +"""P0 scaffolding tests: Config behaviour and the embedding provider abstraction.""" + +from __future__ import annotations + +import numpy as np + +from coderag.config import Config +from coderag.embeddings import EmbeddingProvider, get_provider + + +def test_config_defaults_and_derived_paths(tmp_path): + cfg = Config(store_dir=tmp_path / ".coderag") + assert cfg.provider == "fastembed" + assert cfg.db_path == tmp_path / ".coderag" / "coderag.db" + assert cfg.faiss_path == tmp_path / ".coderag" / "index.faiss" + + +def test_config_is_immutable_and_copies(): + cfg = Config() + updated = cfg.with_overrides(top_k=42) + assert updated.top_k == 42 + assert cfg.top_k == 8 # original untouched + + +def test_from_env_reads_and_overrides(monkeypatch, tmp_path): + monkeypatch.setenv("CODERAG_PROVIDER", "fake") + monkeypatch.setenv("CODERAG_TOP_K", "3") + cfg = Config.from_env(store_dir=tmp_path) + assert cfg.provider == "fake" + assert cfg.top_k == 3 + assert cfg.store_dir == tmp_path # explicit override wins + + +def test_from_env_ignores_bad_ints(monkeypatch): + monkeypatch.setenv("CODERAG_TOP_K", "not-a-number") + cfg = Config.from_env() + assert cfg.top_k == 8 # falls back to default + + +def test_fake_provider_conforms_to_protocol(): + provider = get_provider(Config(provider="fake")) + assert isinstance(provider, EmbeddingProvider) + assert provider.dim == 16 + + +def 
test_fake_provider_is_deterministic_and_normalized(): + provider = get_provider(Config(provider="fake")) + a = provider.embed_documents(["def foo(): pass", "class Bar: ..."]) + b = provider.embed_documents(["def foo(): pass", "class Bar: ..."]) + assert a.shape == (2, provider.dim) + assert a.dtype == np.dtype("float32") + np.testing.assert_array_equal(a, b) # deterministic + norms = np.linalg.norm(a, axis=1) + np.testing.assert_allclose(norms, 1.0, atol=1e-5) # unit vectors + + +def test_fake_provider_query_matches_identical_document(): + provider = get_provider(Config(provider="fake")) + q = provider.embed_query("hello world") + d = provider.embed_documents(["hello world"])[0] + np.testing.assert_allclose(q, d) + + +def test_empty_documents_returns_empty_array(): + provider = get_provider(Config(provider="fake")) + out = provider.embed_documents([]) + assert out.shape == (0, provider.dim) + + +def test_unknown_provider_raises(): + import pytest + + with pytest.raises(ValueError): + get_provider(Config(provider="bogus")) diff --git a/tests/test_faiss.py b/tests/test_faiss.py deleted file mode 100644 index 00a155a..0000000 --- a/tests/test_faiss.py +++ /dev/null @@ -1,53 +0,0 @@ -import faiss -import numpy as np - -from coderag.config import EMBEDDING_DIM -from coderag.index import ( - add_to_index, - clear_index, - inspect_metadata, - load_index, - retrieve_vectors, - save_index, -) - - -def _isolate_index(tmp_path, monkeypatch): - monkeypatch.chdir(tmp_path) - monkeypatch.setattr( - "coderag.index.FAISS_INDEX_FILE", - str(tmp_path / "coderag_index.faiss"), - raising=False, - ) - monkeypatch.setattr("coderag.index.metadata", [], raising=False) - monkeypatch.setattr( - "coderag.index.index", - faiss.IndexFlatIP(EMBEDDING_DIM), - raising=False, - ) - - -def test_faiss_index(tmp_path, monkeypatch): - _isolate_index(tmp_path, monkeypatch) - # Clear the index before testing - clear_index() - - # Create a deterministic dummy embedding (no network needed) - vec = 
np.ones((1, EMBEDDING_DIM), dtype=np.float32) - # Add to index with small dummy content - add_to_index(vec, "dummy content", "test_file.py", "test_file.py") - save_index() - - # Load the index - index = load_index() - assert index is not None, "Failed to load FAISS index." - # Check if index has vectors - assert index.ntotal > 0, "FAISS index is empty. No vectors found!" - print(f"FAISS index has {index.ntotal} vectors.") - - # Retrieve and inspect vectors - vectors = retrieve_vectors(5) - print(f"Retrieved {len(vectors)} vectors from the index.") - - # Inspect metadata - inspect_metadata(5) diff --git a/tests/test_index.py b/tests/test_index.py deleted file mode 100644 index 9c900ea..0000000 --- a/tests/test_index.py +++ /dev/null @@ -1,42 +0,0 @@ -import os - -import faiss -import numpy as np - -from coderag.config import EMBEDDING_DIM -from coderag.index import add_to_index, clear_index, get_metadata, save_index - - -def _isolate_index(tmp_path, monkeypatch): - monkeypatch.chdir(tmp_path) - monkeypatch.setattr("coderag.index.WATCHED_DIR", str(tmp_path), raising=False) - monkeypatch.setattr( - "coderag.index.FAISS_INDEX_FILE", - str(tmp_path / "coderag_index.faiss"), - raising=False, - ) - monkeypatch.setattr("coderag.index.metadata", [], raising=False) - monkeypatch.setattr( - "coderag.index.index", - faiss.IndexFlatIP(EMBEDDING_DIM), - raising=False, - ) - - -def test_add_to_index_tracks_relative_paths(tmp_path, monkeypatch): - _isolate_index(tmp_path, monkeypatch) - - clear_index() - - embeddings = np.zeros((1, EMBEDDING_DIM), dtype=np.float32) - file_path = tmp_path / "pkg" / "module.py" - add_to_index(embeddings, "print('hi')", "module.py", str(file_path)) - - metadata = get_metadata() - assert metadata - assert metadata[0]["filepath"] == os.path.join("pkg", "module.py") - - save_index() - assert os.path.exists(tmp_path / "coderag_index.faiss") - - clear_index() diff --git a/tests/test_indexer.py b/tests/test_indexer.py new file mode 100644 index 
0000000..2c943a3 --- /dev/null +++ b/tests/test_indexer.py @@ -0,0 +1,104 @@ +"""P3 tests: incremental indexing, the no-duplicate invariant, and pruning.""" + +from __future__ import annotations + +from coderag.api import CodeRAG +from tests.conftest import write + + +def _cr(config) -> CodeRAG: + config.watched_dir.mkdir(parents=True, exist_ok=True) + return CodeRAG(config) + + +def test_index_creates_chunks(config): + cr = _cr(config) + write(config.watched_dir / "a.py", "def alpha():\n return 1\n") + write(config.watched_dir / "b.py", "def beta():\n return 2\n") + stats = cr.index() + assert stats.files_indexed == 2 + assert stats.total_chunks >= 2 + assert cr.vectors.ntotal == stats.total_chunks + + +def test_unchanged_files_are_skipped(config): + cr = _cr(config) + write(config.watched_dir / "a.py", "def alpha():\n return 1\n") + cr.index() + stats2 = cr.index() # nothing changed + assert stats2.files_indexed == 0 + assert stats2.files_skipped == 1 + + +def test_editing_a_file_does_not_duplicate(config): + cr = _cr(config) + path = config.watched_dir / "a.py" + write(path, "def alpha():\n return 1\n") + cr.index() + chunks_before = cr.store.total_chunks() + vectors_before = cr.vectors.ntotal + assert chunks_before == vectors_before + + # Edit and reindex. + write(path, "def alpha():\n return 100\n\ndef gamma():\n return 3\n") + stats = cr.index() + assert stats.chunks_removed >= 1 # old chunks were deleted first + # Store and FAISS stay in lock-step (no stale/duplicate vectors). + assert cr.store.total_chunks() == cr.vectors.ntotal + # The new content is searchable; the stale content is gone. 
+ rows = cr.store.hydrate( + cr.store.chunk_ids_for_file(cr.store.get_file("a.py")["id"]) + ) + joined = "\n".join(r["text"] for r in rows.values()) + assert "return 100" in joined + assert "return 1\n" not in joined or "return 100" in joined + + +def test_deleted_file_is_pruned(config): + cr = _cr(config) + a = config.watched_dir / "a.py" + b = config.watched_dir / "b.py" + write(a, "def alpha():\n return 1\n") + write(b, "def beta():\n return 2\n") + cr.index() + assert cr.store.total_chunks() == cr.vectors.ntotal + + b.unlink() + stats = cr.index() + assert stats.files_removed == 1 + assert "b.py" not in cr.store.all_file_paths() + assert cr.store.total_chunks() == cr.vectors.ntotal + + +def test_ignored_dirs_are_skipped(config): + cr = _cr(config) + write(config.watched_dir / "src" / "a.py", "def alpha():\n return 1\n") + write(config.watched_dir / "node_modules" / "x.js", "function x(){return 1;}\n") + write(config.watched_dir / ".git" / "hooks.py", "def hook():\n return 1\n") + cr.index() + paths = cr.store.all_file_paths() + assert "src/a.py" in paths + assert not any("node_modules" in p for p in paths) + assert not any(".git" in p for p in paths) + + +def test_full_rebuild_resets(config): + cr = _cr(config) + write(config.watched_dir / "a.py", "def alpha():\n return 1\n") + cr.index() + n1 = cr.store.total_chunks() + stats = cr.index(full=True) + assert stats.total_chunks == n1 # same content, rebuilt cleanly + assert cr.store.total_chunks() == cr.vectors.ntotal + + +def test_index_survives_reopen(config, tmp_path): + cr = _cr(config) + write(config.watched_dir / "a.py", "def alpha():\n return 1\n") + cr.index() + n = cr.store.total_chunks() + cr.close() + + cr2 = CodeRAG(config) + assert cr2.store.total_chunks() == n + assert cr2.vectors.ntotal == n # FAISS cache reloaded, consistent diff --git a/tests/test_real_providers.py b/tests/test_real_providers.py new file mode 100644 index 0000000..b0dadd7 --- /dev/null +++ b/tests/test_real_providers.py @@ -0,0 
+1,60 @@ +"""P5 tests: the real embedding backends. + +The fastembed test is marked ``integration`` (downloads a model) and is deselected in CI. +The OpenAI test mocks the SDK client so it never hits the network. +""" + +from __future__ import annotations + +import types + +import numpy as np +import pytest + +from coderag.embeddings.openai_provider import OpenAIEmbeddingProvider + + +def test_openai_provider_batches_without_averaging(monkeypatch): + calls = {"inputs": []} + + class _Resp: + def __init__(self, n, dim): + self.data = [ + types.SimpleNamespace(embedding=[float(i)] * dim) for i in range(n) + ] + + class _Embeddings: + def create(self, model, input, timeout): + calls["inputs"].append(list(input)) + return _Resp(len(input), 4) + + class _Client: + embeddings = _Embeddings() + + provider = OpenAIEmbeddingProvider(model="text-embedding-3-small", api_key="k") + monkeypatch.setattr(type(provider), "_client", property(lambda self: _Client())) + + vecs = provider.embed_documents(["a", "b", "c"]) + assert vecs.shape == (3, 4) # one vector per chunk, NOT averaged into one + assert vecs.dtype == np.dtype("float32") + assert calls["inputs"][0] == ["a", "b", "c"] + + +def test_openai_known_dim_without_network(): + provider = OpenAIEmbeddingProvider(model="text-embedding-3-small", api_key="k") + assert provider.dim == 1536 # resolved from the known-dims map, no API call + + +@pytest.mark.integration +def test_fastembed_end_to_end(): + from coderag.embeddings.fastembed_provider import FastEmbedProvider + + provider = FastEmbedProvider() + assert provider.dim == 384 + docs = provider.embed_documents(["def add(a, b): return a + b", "hello world"]) + assert docs.shape == (2, 384) + q = provider.embed_query("how to add two numbers") + assert q.shape == (384,) + # The code doc should be more similar to the query than the unrelated doc. 
+ sims = docs @ (q / np.linalg.norm(q)) + assert sims[0] > sims[1] diff --git a/tests/test_retrieval.py b/tests/test_retrieval.py new file mode 100644 index 0000000..75cc26f --- /dev/null +++ b/tests/test_retrieval.py @@ -0,0 +1,81 @@ +"""P4 tests: RRF fusion and end-to-end hybrid search.""" + +from __future__ import annotations + +from coderag.api import CodeRAG +from coderag.retrieval.fusion import reciprocal_rank_fusion +from tests.conftest import write + + +def test_rrf_merges_and_dedupes(): + dense = [1, 2, 3] + lexical = [3, 4, 1] + fused = reciprocal_rank_fusion([dense, lexical], k=60) + ids = [i for i, _ in fused] + # ids appearing in both lists should rank above singletons + assert set(ids) == {1, 2, 3, 4} + assert ids[0] in (1, 3) # shared, high-ranked items win + + +def test_rrf_respects_weights(): + a = [10, 11] + b = [20, 21] + fused = reciprocal_rank_fusion([a, b], k=60, weights=[5.0, 1.0]) + assert fused[0][0] == 10 # heavily-weighted list dominates + + +def test_rrf_empty(): + assert reciprocal_rank_fusion([[], []]) == [] + + +def _indexed(config) -> CodeRAG: + config.watched_dir.mkdir(parents=True, exist_ok=True) + write( + config.watched_dir / "auth.py", + "def authenticate_user(token):\n" + " '''Validate a session token and return the user.'''\n" + " return verify(token)\n", + ) + write( + config.watched_dir / "math_utils.py", + "def add_numbers(a, b):\n return a + b\n", + ) + cr = CodeRAG(config) + cr.index() + return cr + + +def test_search_finds_relevant_symbol(config): + cr = _indexed(config) + hits = cr.search("authenticate_user", top_k=3) + assert hits + assert hits[0].path == "auth.py" + assert hits[0].symbol == "authenticate_user" + assert hits[0].location == "auth.py:1" + + +def test_lexical_recall_via_fts(config): + # An exact identifier that should be found even if dense recall is weak. 
+ cr = _indexed(config) + hits = cr.search("add_numbers", top_k=3) + assert any(h.path == "math_utils.py" for h in hits) + + +def test_search_empty_query(config): + cr = _indexed(config) + assert cr.search(" ", top_k=3) == [] + + +def test_search_returns_scores_and_similarity(config): + cr = _indexed(config) + hits = cr.search("authenticate_user", top_k=3) + assert all(h.score > 0 for h in hits) + assert all(0.0 <= h.similarity <= 1.0 for h in hits) + + +def test_hits_are_serializable(config): + cr = _indexed(config) + hits = cr.search("token", top_k=2) + for h in hits: + d = h.as_dict() + assert "path" in d and "location" in d and "score" in d diff --git a/tests/test_search.py b/tests/test_search.py deleted file mode 100644 index 0535b07..0000000 --- a/tests/test_search.py +++ /dev/null @@ -1,57 +0,0 @@ -import faiss -import numpy as np - -from coderag.config import EMBEDDING_DIM -from coderag.index import add_to_index, clear_index, save_index -from coderag.search import search_code - - -def _isolate_index(tmp_path, monkeypatch): - monkeypatch.chdir(tmp_path) - monkeypatch.setattr( - "coderag.index.FAISS_INDEX_FILE", - str(tmp_path / "coderag_index.faiss"), - raising=False, - ) - monkeypatch.setattr("coderag.index.metadata", [], raising=False) - monkeypatch.setattr( - "coderag.index.index", - faiss.IndexFlatIP(EMBEDDING_DIM), - raising=False, - ) - - -def test_search_returns_result(tmp_path, monkeypatch): - _isolate_index(tmp_path, monkeypatch) - clear_index() - - vector = np.ones((1, EMBEDDING_DIM), dtype=np.float32) - add_to_index(vector, "print('hello world')", "sample.py", "sample.py") - save_index() - - def fake_generate_embeddings(_: str): - embedding = np.ones((1, EMBEDDING_DIM), dtype=np.float32) - faiss.normalize_L2(embedding) - return embedding - - monkeypatch.setattr("coderag.search.generate_embeddings", fake_generate_embeddings) - - results = search_code("hello", k=1) - assert len(results) == 1 - first = results[0] - assert first["filename"] == 
"sample.py" - assert "sample.py" in first["filepath"] - assert first["distance"] > 0 - - clear_index() - - -def test_search_empty_query_returns_empty(): - assert search_code(" ") == [] - - -def test_search_missing_index_returns_empty(tmp_path, monkeypatch): - _isolate_index(tmp_path, monkeypatch) - clear_index() - results = search_code("whatever") - assert results == [] diff --git a/tests/test_store.py b/tests/test_store.py new file mode 100644 index 0000000..954a059 --- /dev/null +++ b/tests/test_store.py @@ -0,0 +1,145 @@ +"""P1 tests: SQLite store + pluggable FAISS vector index.""" + +from __future__ import annotations + +import numpy as np + +from coderag.config import Config +from coderag.store.sqlite_store import SQLiteStore +from coderag.store.vector_index import FaissVectorIndex +from coderag.types import Chunk + + +def _store(tmp_path) -> SQLiteStore: + store = SQLiteStore(tmp_path / "coderag.db") + store.bootstrap(embed_dim=16, embed_model="fake-16") + return store + + +def _chunk(text: str, start: int = 1) -> Chunk: + return Chunk( + text=text, + start_line=start, + end_line=start + 2, + language="python", + symbol="f", + kind="function", + ) + + +def test_add_and_hydrate_chunks(tmp_path): + store = _store(tmp_path) + fid = store.upsert_file("a.py", "python", "hash1", 1.0) + vecs = np.ones((2, 16), dtype="float32") + ids = store.add_chunks( + fid, [_chunk("def f(): pass"), _chunk("x = 1", 5)], vecs, "fake-16" + ) + assert len(ids) == 2 + rows = store.hydrate(ids) + assert rows[ids[0]]["path"] == "a.py" + assert rows[ids[0]]["text"] == "def f(): pass" + + +def test_autoincrement_ids_never_reused(tmp_path): + store = _store(tmp_path) + fid = store.upsert_file("a.py", "python", "h", 1.0) + vecs = np.ones((1, 16), dtype="float32") + first = store.add_chunks(fid, [_chunk("a")], vecs, "fake-16") + store.delete_chunks_for_file(fid) + second = store.add_chunks(fid, [_chunk("b")], vecs, "fake-16") + assert second[0] > first[0] # id advanced, not recycled + + +def 
test_fts_search_finds_token_and_survives_operators(tmp_path): + store = _store(tmp_path) + fid = store.upsert_file("a.py", "python", "h", 1.0) + vecs = np.ones((1, 16), dtype="float32") + store.add_chunks(fid, [_chunk("def parse_config(): return 1")], vecs, "fake-16") + hits = store.fts_search("parse_config", limit=5) + assert len(hits) == 1 + # Operators in the query must not raise. + assert store.fts_search("parse_config::*", limit=5) + assert store.fts_search("", limit=5) == [] + + +def test_iter_vectors_round_trips(tmp_path): + store = _store(tmp_path) + fid = store.upsert_file("a.py", "python", "h", 1.0) + vecs = np.random.default_rng(0).standard_normal((3, 16)).astype("float32") + ids = store.add_chunks( + fid, [_chunk("a"), _chunk("b"), _chunk("c")], vecs, "fake-16" + ) + got_ids, got_vecs = next(store.iter_vectors()) + assert list(got_ids) == ids + np.testing.assert_allclose(got_vecs, vecs) + + +def test_model_change_triggers_rebuild_flag(tmp_path): + store = SQLiteStore(tmp_path / "coderag.db") + assert store.bootstrap(16, "fake-16") is False + store.upsert_file("a.py", "python", "h", 1.0) + # Re-bootstrap with a different dim/model: should clear and request rebuild. 
+ assert store.bootstrap(384, "bge-small") is True + assert store.all_file_paths() == [] + + +def _vec_index(tmp_path, **cfg) -> tuple: + config = Config(store_dir=tmp_path, **cfg) + store = _store(tmp_path) + idx = FaissVectorIndex.open(config, dim=16) + return config, store, idx + + +def test_vector_add_search_remove(tmp_path): + _, _, idx = _vec_index(tmp_path) + rng = np.random.default_rng(1) + vecs = rng.standard_normal((5, 16)).astype("float32") + ids = np.array([10, 20, 30, 40, 50], dtype="int64") + idx.add(ids, vecs) + assert idx.ntotal == 5 + got_ids, scores = idx.search(vecs[2], k=3) + assert got_ids[0] == 30 # closest to itself + assert scores[0] > 0.99 + removed = idx.remove([30]) + assert removed == 1 + got_ids, _ = idx.search(vecs[2], k=3) + assert 30 not in got_ids + + +def test_rebuild_from_store_and_consistency(tmp_path): + config, store, idx = _vec_index(tmp_path) + fid = store.upsert_file("a.py", "python", "h", 1.0) + vecs = np.random.default_rng(2).standard_normal((4, 16)).astype("float32") + store.add_chunks(fid, [_chunk(str(i)) for i in range(4)], vecs, "fake-16") + # Index is empty but store has 4 chunks -> ensure_consistent rebuilds. + idx.ensure_consistent(store) + assert idx.ntotal == 4 + assert idx.kind == "flat" + + +def test_auto_upgrade_flat_to_ivf(tmp_path): + # ivf_threshold tiny so a small corpus crosses it. + config, store, idx = _vec_index(tmp_path, ivf_threshold=10) + fid = store.upsert_file("a.py", "python", "h", 1.0) + n = 30 + vecs = np.random.default_rng(3).standard_normal((n, 16)).astype("float32") + ids = store.add_chunks(fid, [_chunk(str(i)) for i in range(n)], vecs, "fake-16") + idx.add(np.array(ids, dtype="int64"), vecs) + assert idx.kind == "flat" + upgraded = idx.maybe_upgrade(store) + assert upgraded is True + assert idx.kind == "ivf" + assert idx.ntotal == n + # IVF still returns the self-match. 
+ got_ids, _ = idx.search(vecs[0], k=1) + assert got_ids[0] == ids[0] + + +def test_index_persists_across_open(tmp_path): + config, store, idx = _vec_index(tmp_path) + vecs = np.random.default_rng(4).standard_normal((3, 16)).astype("float32") + idx.add(np.array([1, 2, 3], dtype="int64"), vecs) + idx.save() + reopened = FaissVectorIndex.open(config, dim=16) + assert reopened.ntotal == 3 + assert reopened.kind == "flat" diff --git a/tests/test_surfaces.py b/tests/test_surfaces.py new file mode 100644 index 0000000..222affb --- /dev/null +++ b/tests/test_surfaces.py @@ -0,0 +1,130 @@ +"""P6 tests: CLI, HTTP API, and watcher behaviour (all with the fake provider).""" + +from __future__ import annotations + +import json + +import pytest + +from coderag.api import CodeRAG +from coderag.surfaces.cli import main as cli_main +from tests.conftest import write + + +@pytest.fixture +def repo_with_code(tmp_path, monkeypatch): + repo = tmp_path / "repo" + store = tmp_path / "store" + write(repo / "auth.py", "def authenticate(token):\n return token == 'ok'\n") + monkeypatch.setenv("CODERAG_PROVIDER", "fake") + common = ["--watched-dir", str(repo), "--store-dir", str(store)] + return repo, store, common + + +# --- CLI --- + + +def test_cli_index_then_search(repo_with_code, capsys): + repo, store, common = repo_with_code + assert cli_main(["index", "--quiet", *common]) == 0 + assert "Indexed" in capsys.readouterr().out + + assert cli_main(["search", "authenticate", "-k", "3", *common]) == 0 + out = capsys.readouterr().out + assert "auth.py:1" in out + + +def test_cli_search_json(repo_with_code, capsys): + repo, store, common = repo_with_code + cli_main(["index", "--quiet", *common]) + capsys.readouterr() + rc = cli_main(["search", "authenticate", "--json", *common]) + payload = json.loads(capsys.readouterr().out) + assert rc == 0 + assert payload[0]["path"] == "auth.py" + + +def test_cli_status(repo_with_code, capsys): + repo, store, common = repo_with_code + cli_main(["index", 
"--quiet", *common]) + capsys.readouterr() + cli_main(["status", *common]) + status = json.loads(capsys.readouterr().out) + assert status["provider"] == "fake" + assert status["total_files"] == 1 + + +def test_cli_search_without_index(repo_with_code, capsys): + repo, store, common = repo_with_code + rc = cli_main(["search", "anything", *common]) + assert rc == 1 + assert "No results" in capsys.readouterr().out + + +# --- HTTP API --- + + +def test_http_api_search_and_status(repo_with_code): + from fastapi.testclient import TestClient + + from coderag.surfaces.http_api import create_app + + repo, store, _ = repo_with_code + from coderag.config import Config + + cr = CodeRAG(Config(provider="fake", watched_dir=repo, store_dir=store)) + cr.index() + client = TestClient(create_app(cr)) + + r = client.get("/status") + assert r.status_code == 200 + assert r.json()["total_files"] == 1 + + r = client.get("/search", params={"q": "authenticate", "k": 3}) + body = r.json() + assert body["count"] >= 1 + assert body["results"][0]["path"] == "auth.py" + + r = client.get("/file", params={"path": "auth.py"}) + assert "authenticate" in r.json()["content"] + + r = client.get("/file", params={"path": "../../etc/passwd"}) + assert r.status_code == 404 # path traversal blocked + + +def test_http_index_endpoint(repo_with_code): + from fastapi.testclient import TestClient + + from coderag.config import Config + from coderag.surfaces.http_api import create_app + + repo, store, _ = repo_with_code + cr = CodeRAG(Config(provider="fake", watched_dir=repo, store_dir=store)) + client = TestClient(create_app(cr)) + r = client.post("/index", json={"full": False}) + assert r.status_code == 200 + assert r.json()["total_files"] == 1 + + +# --- watcher --- + + +def test_watcher_apply_handles_edit_and_delete(repo_with_code): + from coderag.config import Config + from coderag.watch import _apply + + repo, store, _ = repo_with_code + cr = CodeRAG(Config(provider="fake", watched_dir=repo, 
store_dir=store)) + cr.index() + n0 = cr.store.total_chunks() + + new = repo / "extra.py" + write(new, "def extra():\n return 1\n") + _apply(cr, str(new)) + assert cr.store.total_chunks() > n0 + assert cr.store.total_chunks() == cr.vectors.ntotal + + new.unlink() + _apply(cr, str(new)) + assert "extra.py" not in cr.store.all_file_paths() + assert cr.store.total_chunks() == cr.vectors.ntotal From b243e1592060bc3d105738e63674f6d207390530 Mon Sep 17 00:00:00 2001 From: fastsoab Date: Mon, 1 Jun 2026 20:00:55 +0200 Subject: [PATCH 4/5] docs+ci: reposition as standalone engine, rewrite README/AGENTS/DEVELOPMENT, modernize CI - README: drop the 'made obsolete by Cursor' framing; present CodeRAG as a standalone, local-first semantic code-search engine for large/custom codebases, with CLI / library / HTTP / UI quickstarts and an architecture diagram. - AGENTS.md / DEVELOPMENT.md: document the new module layout and design invariants. - pyproject: v1.0.0, new deps (fastembed, tree-sitter grammars, tqdm), extras (server/ui/openai), single 'coderag' entry point, pytest integration marker. - CI: 3.11/3.12 matrix running black/isort/flake8/mypy + offline pytest (-m 'not integration'). - example.env rewritten around CODERAG_* config; .flake8 added; tooling and .gitignore updated. Co-Authored-By: Claude Opus 4.8 (1M context) --- .flake8 | 5 + .github/workflows/ci-tests.yml | 83 ++++------ .gitignore | 11 ++ .pre-commit-config.yaml | 1 - AGENTS.md | 55 ++++--- DEVELOPMENT.md | 234 +++++++---------------------- README.md | 266 +++++++++++++++------------------ example.env | 37 +++-- pyproject.toml | 43 +++++- readme.rst | 134 ----------------- requirements.txt | 2 +- 11 files changed, 319 insertions(+), 552 deletions(-) create mode 100644 .flake8 delete mode 100644 readme.rst diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..3c4742b --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +# Black formats code to 88; allow a little slack for prose docstrings/comments. 
+max-line-length = 100 +extend-ignore = E203, W503 +exclude = .git, __pycache__, build, dist, venv, .venv, env diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index d9c0306..1fc23ee 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -1,69 +1,52 @@ -name: CI Tests +name: CI on: push: - branches: [ main, master, develop ] + branches: [main, master, develop] pull_request: - branches: [ main, master, develop ] + branches: [main, master, develop] schedule: - cron: "0 3 * * *" jobs: - test-imports: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python 3.11 - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - - - name: Test Import Structure - run: | - python -c "import coderag.config; print('✓ Config import successful')" - python -c "import coderag.embeddings; print('✓ Embeddings import successful')" - python -c "import coderag.index; print('✓ Index import successful')" - python -c "import coderag.search; print('✓ Search import successful')" - python -c "import coderag.monitor; print('✓ Monitor import successful')" - env: - OPENAI_API_KEY: dummy-key-for-testing - quality-and-tests: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.11", "3.12"] + steps: - uses: actions/checkout@v4 - - name: Set up Python 3.11 + + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: '3.11' - - name: Install dependencies + python-version: ${{ matrix.python-version }} + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-py${{ 
matrix.python-version }}-pip-${{ hashFiles('pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-py${{ matrix.python-version }}-pip- + + - name: Install run: | python -m pip install --upgrade pip - pip install -r requirements.txt - - name: Format check + pip install -e ".[dev,server,openai]" + + - name: Format check (black, isort) run: | black --check . isort --check-only . - - name: Lint - run: flake8 . --max-line-length=88 --ignore=E203,W503 - - name: Type check - run: mypy . - - name: Run tests - env: - PYTHONPATH: ${{ github.workspace }} - run: pytest -q + + - name: Lint (flake8) + run: flake8 coderag tests + + - name: Type check (mypy) + run: mypy coderag + + - name: Tests (offline — no model downloads, no network) + run: pytest -m "not integration" diff --git a/.gitignore b/.gitignore index e4e233e..170b6b1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,13 @@ __pycache__/ *.py[cod] +# Build artifacts +*.egg-info/ +build/ +dist/ +.pytest_cache/ +.mypy_cache/ + # Ignore virtual environment directories .venv/ env/ @@ -15,6 +22,10 @@ node_modules/ # Ignore FAISS index file *.faiss +*.faiss.kind + +# Ignore the CodeRAG store (SQLite db + index live here by default) +.coderag/ # Ignore Git directory .git/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a8c53b4..82f6f1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,6 @@ repos: hooks: - id: flake8 additional_dependencies: ["flake8-bugbear==24.4.26"] - args: ["--max-line-length=88", "--ignore=E203,W503"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.11.1 hooks: diff --git a/AGENTS.md b/AGENTS.md index 5d3b824..b94caa4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,37 +1,46 @@ # Repository Guidelines ## Project Structure & Module Organization -- `coderag/`: Core library (`config.py`, `embeddings.py`, `index.py`, `search.py`, `monitor.py`). -- `app.py`: Streamlit UI. `main.py`: backend/indexer. `prompt_flow.py`: RAG orchestration. 
-- `scripts/`: Utilities (e.g., `initialize_index.py`, `run_monitor.py`). -- `tests/`: Minimal checks (e.g., `test_faiss.py`). -- `example.env` → copy to `.env` for local secrets; CI lives in `.github/`. +- `coderag/api.py`: The `CodeRAG` facade — the public entry point every surface routes through. +- `coderag/config.py`, `coderag/types.py`: Immutable `Config` and shared dataclasses. +- `coderag/embeddings/`: `EmbeddingProvider` protocol + `fastembed` (default), `openai`, `fake`. +- `coderag/chunking/`: Symbol-aware chunking (`python_ast.py`, `treesitter.py`, line-window `base.py`). +- `coderag/store/`: `sqlite_store.py` (source of truth + FTS5) and `vector_index.py` (FAISS Flat/IVF cache). +- `coderag/retrieval/`: Hybrid dense + BM25 search fused with RRF. +- `coderag/indexer.py`, `coderag/watch.py`: Incremental indexing and the debounced watcher. +- `coderag/surfaces/`: `cli.py`, `http_api.py` (FastAPI), `streamlit_app.py` — thin adapters over the facade. +- `tests/`: pytest suite (offline by default via the `fake` provider; real model behind `-m integration`). +- `example.env` → copy to `.env`; CI lives in `.github/`. ## Build, Test, and Development Commands - Create env: `python -m venv venv && source venv/bin/activate`. -- Install deps: `pip install -r requirements.txt`. -- Run backend: `python main.py` (indexes and watches `WATCHED_DIR`). -- Run UI: `streamlit run app.py`. -- Quick test: `python tests/test_faiss.py` (FAISS round‑trip sanity check). -- Quality suite: `pre-commit run --all-files` (black, isort, flake8, mypy, basics). +- Install: `pip install -e ".[dev,server,openai]"` (extras: `server`, `ui`, `openai`). +- Use it: `coderag index`, `coderag search "QUERY"`, `coderag watch`, `coderag serve`, `coderag ui`, `coderag status`. +- Tests: `pytest -m "not integration"` (fast/offline) or `pytest -m integration` (real fastembed). +- Quality: `black --check . && isort --check-only . && flake8 coderag tests && mypy coderag`. 
## Coding Style & Naming Conventions -- Formatting: Black (88 cols), isort profile "black"; run `black . && isort .`. -- Linting: flake8 with `--ignore=E203,W503` to match Black. -- Typing: mypy (py311 target; ignore missing imports OK). Prefer typed signatures and docstrings. -- Indentation: 4 spaces. Names: `snake_case` for files/functions, `PascalCase` for classes, constants `UPPER_SNAKE`. -- Imports: first‑party module is `coderag` (see `pyproject.toml`). +- Black (88-col code), isort profile "black". flake8 config in `.flake8` allows up to 100 cols (prose slack). +- Typing: mypy (py311 target). Prefer typed signatures and concise docstrings. +- Indentation: 4 spaces. `snake_case` functions/files, `PascalCase` classes, `UPPER_SNAKE` constants. +- First-party module is `coderag`; surfaces must stay thin — no engine logic in `surfaces/`. + +## Architecture Invariants +- SQLite is the source of truth; the FAISS index is a rebuildable cache (`rebuild_from_store`). +- `chunks.id` is the FAISS id and is `AUTOINCREMENT` (ids never reused). +- Incremental indexing is delete-before-add (no duplicate/stale vectors); unchanged files skip via content hash. +- Embedding dimension comes from the provider, not a constant; a model change triggers a rebuild. ## Testing Guidelines -- Place tests in `tests/` as `test_*.py`. Keep unit tests deterministic; mock OpenAI calls where possible. -- Run directly (`python tests/test_faiss.py`) or with pytest if available (`pytest -q`). -- Ensure `.env` or env vars provide `OPENAI_API_KEY` for integration tests; avoid hitting rate limits in CI. +- Place tests in `tests/` as `test_*.py`; keep them deterministic and offline (use the `fake` provider fixture). +- Mark anything that downloads a model or hits the network with `@pytest.mark.integration` (deselected in CI). +- Mock OpenAI; never call the network in default tests. 
## Commit & Pull Request Guidelines -- Use Conventional Commits seen in history: `feat:`, `fix:`, `docs:`, `ci:`, `refactor:`, `simplify:`. -- Before pushing: `pre-commit run --all-files` and update docs when behavior changes. -- PRs: clear description, linked issues, steps to validate; include screenshots/GIFs for UI changes; note config changes (`.env`). +- Conventional Commits: `feat:`, `fix:`, `docs:`, `ci:`, `refactor:`, `test:`. +- Before pushing: run the quality gate above and update docs when behavior changes. +- PRs: clear description, validation steps, screenshots/GIFs for UI changes, note config changes (`.env`). ## Security & Configuration Tips -- Never commit secrets. Start with `cp example.env .env`; set `OPENAI_API_KEY`, `WATCHED_DIR`, `FAISS_INDEX_FILE`. -- Avoid logging sensitive data. Regenerate the FAISS index if dimensions or models change (`python scripts/initialize_index.py`). +- Never commit secrets. The default local provider needs no key; OpenAI is opt-in. +- The index/database live in `CODERAG_STORE_DIR` (default `./.coderag/`, gitignored). diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9399e46..f78f559 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -1,203 +1,77 @@ # 🛠️ Development Guide -## Setting Up Development Environment - -### 1. Clone and Setup - -```bash -git clone https://github.com/your-username/CodeRAG.git -cd CodeRAG -python -m venv venv -source venv/bin/activate # Windows: venv\Scripts\activate -pip install -r requirements.txt - -> The requirements file delegates to `-e .[dev]`, so you can also run -> `pip install -e .[dev]` directly if you prefer editable installs. -``` - -### 2. 
Configure Pre-commit Hooks - -```bash -pip install pre-commit -pre-commit install -pre-commit run --all-files -``` - -This will run code quality checks on every commit: -- **Black**: Code formatting -- **isort**: Import sorting -- **Flake8**: Linting and style checks -- **MyPy**: Type checking -- **Basic hooks**: Trailing whitespace, file endings, etc. - -### 3. Environment Variables - -Copy `example.env` to `.env` and configure: +## Setup ```bash -cp example.env .env +python -m venv venv && source venv/bin/activate # Windows: venv\Scripts\activate +pip install -e ".[dev,server,openai]" # editable install + all extras +pre-commit install # optional: run quality checks on commit ``` -Required variables: -```env -OPENAI_API_KEY=your_key_here # Required for embeddings and chat -WATCHED_DIR=/path/to/code # Directory to index (default: current dir) -``` +No configuration is required to run — the default `fastembed` provider downloads a small +local model on first use. Copy `example.env` to `.env` only if you want to set defaults. -## Code Quality Standards +## Architecture -### Type Hints -All functions should have type hints: +CodeRAG is one engine (`coderag.api.CodeRAG`) behind several surfaces. The facade wires +together four swappable pieces and constructs them lazily. -```python -def process_file(filepath: str, content: str) -> Optional[np.ndarray]: - \"\"\"Process a file and return embeddings.\"\"\" - ... 
``` - -### Error Handling -Use structured logging and proper exception handling: - -```python -import logging -logger = logging.getLogger(__name__) - -try: - result = risky_operation() -except SpecificError as e: - logger.error(f"Operation failed: {str(e)}") - return None +coderag/ +├── api.py # CodeRAG facade — the public entry point every surface uses +├── config.py # Immutable Config dataclass (Config.from_env) +├── types.py # Chunk, SearchHit, IndexStats +├── indexer.py # Incremental indexing: hash-diff, delete-before-add, prune +├── watch.py # Debounced filesystem watcher -> indexer +├── llm.py # Optional streamed LLM answer over retrieved chunks +├── embeddings/ # EmbeddingProvider protocol + fastembed / openai / fake +├── chunking/ # Symbol-aware chunking: python_ast, treesitter, line-window base +├── store/ # SQLite source of truth + pluggable FAISS vector index +│ ├── sqlite_store.py # files/chunks/vectors + FTS5 lexical search +│ └── vector_index.py # FaissVectorIndex: Flat (exact) / IVF (scale) +├── retrieval/ # Hybrid search: dense + BM25, fused with RRF +└── surfaces/ # cli.py · http_api.py (FastAPI) · streamlit_app.py ``` -### Documentation -Use concise docstrings for public functions: - -```python -def search_code(query: str, k: int = 5) -> List[Dict[str, Any]]: - \"\"\"Search the FAISS index using a text query. +### Design invariants (don't break these) - Args: - query: The search query text - k: Number of results to return +- **SQLite is the source of truth; FAISS is a rebuildable cache.** Vectors are stored as + BLOBs in SQLite, so `FaissVectorIndex.rebuild_from_store()` can always reconstruct the + index. `ensure_consistent()` does this automatically when counts disagree. +- **`chunks.id` is the FAISS id and is `AUTOINCREMENT`** — ids are never reused, which keeps + a stale cache from resurrecting deleted content. 
+- **Delete-before-add.** A changed file's old chunks are removed from both SQLite and FAISS + before new ones are added (`Indexer._index_file`). This fixes the duplicate/stale-vector bug the old `monitor.py` had. +- **The embedding dimension comes from the provider**, never a hard-coded constant. A model + change is detected via `meta.embed_dim` and triggers a clean rebuild. - Returns: - List of search results with metadata - \"\"\" -``` +## Quality gate -## Testing Your Changes +The same commands CI runs: -### Manual Testing ```bash -# Test backend indexing -python main.py - -# Test Streamlit UI (separate terminal) -streamlit run app.py +pytest -m "not integration" # fast & offline — uses the deterministic fake embedder +pytest -m integration # exercises the real fastembed model (downloads once) +black --check . && isort --check-only . +flake8 coderag tests # config in .flake8 (max-line-length 100) +mypy coderag ``` -### Code Quality Checks -```bash -pre-commit run --all-files -``` +Tests never hit the network or download a model unless marked `integration`. Use the +`config`/`repo`/`write` fixtures in `tests/conftest.py` (they default to the `fake` provider). -If you need to run a specific tool locally: +## Adding things -```bash -black . -isort . -flake8 . -mypy . -``` +- **A new embedding backend:** implement the `EmbeddingProvider` protocol + (`coderag/embeddings/__init__.py`) and wire it into `get_provider()`. +- **A new language:** add the extension in `chunking/languages.py`; for symbol-aware + chunking, add a grammar + node types in `chunking/treesitter.py` (or rely on the + line-window fallback). +- **A new surface:** keep it a thin adapter over `coderag.api.CodeRAG` — no engine logic in + surfaces. -## Adding New Features +## Conventions -1. **Create feature branch**: `git checkout -b feature/new-feature` -2. **Add logging**: Use the logger for all operations -3. **Add type hints**: Follow existing patterns -4. 
**Handle errors**: Graceful degradation and user-friendly messages -5. **Update tests**: Add tests for new functionality -6. **Update docs**: Update README if needed - -## Architecture Guidelines - -### Keep It Simple -- Maintain the single-responsibility principle -- Avoid unnecessary abstractions -- Focus on the core RAG functionality - -### Error Handling Strategy -- Log errors with context -- Return None/empty lists for failures -- Show user-friendly messages in UI -- Don't crash the application - -### Performance Considerations -- Limit search results (default: 5) -- Truncate long content for context -- Cache embeddings when possible -- Monitor memory usage with large codebases - -## Debugging Tips - -### Enable Debug Logging -```python -logging.basicConfig(level=logging.DEBUG) -``` - -### Check Index Status -```python -from coderag.index import inspect_metadata -inspect_metadata(5) # Show first 5 entries -``` - -### Test Embeddings -```python -from coderag.embeddings import generate_embeddings -result = generate_embeddings("test code") -print(f"Shape: {result.shape if result is not None else 'None'}") -``` - -## Common Development Issues - -**Import Errors** -- Ensure you're in the virtual environment -- Check PYTHONPATH includes project root -- Verify all dependencies are installed - -**OpenAI API Issues** -- Check API key validity -- Monitor rate limits and usage -- Test with a simple embedding request - -**FAISS Index Corruption** -- Delete existing index files and rebuild -- Check file permissions -- Ensure consistent embedding dimensions - -## Routine Maintenance - -- **Regenerate the FAISS index** after large code refactors: `python scripts/initialize_index.py`. -- **Rotate environment secrets** by updating `.env` or your deployment variables, then restarting services. -- **Refresh dependencies** with `pip install --upgrade -r requirements.txt` and run `pre-commit run --all-files` plus `pytest -q`. 
-- **Keep hooks current** using `pre-commit autoupdate` followed by a commit once checks pass. - -## Project Structure - -``` -CodeRAG/ -├── coderag/ # Core library -│ ├── __init__.py -│ ├── config.py # Configuration management -│ ├── embeddings.py # OpenAI integration -│ ├── index.py # FAISS operations -│ ├── search.py # Search functionality -│ └── monitor.py # File monitoring -├── scripts/ # Utility scripts -├── tests/ # Test files -├── .github/ # GitHub workflows -├── main.py # Backend service -├── app.py # Streamlit frontend -├── prompt_flow.py # RAG orchestration -└── requirements.txt # Dependencies -``` +- Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`). +- Black (88-col code) + isort (black profile); flake8 allows up to 100 cols for prose. +- Typed signatures and concise docstrings on public functions. diff --git a/README.md b/README.md index f0aacb2..8a18b70 100644 --- a/README.md +++ b/README.md @@ -1,203 +1,173 @@ -# 🤖 CodeRAG: AI-Powered Code Retrieval & Assistance +# 🔎 CodeRAG + +**A standalone, local-first semantic code-search engine for large and custom codebases.** [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![CI Tests](https://github.com/Neverdecel/CodeRAG/actions/workflows/ci-tests.yml/badge.svg?branch=master)](https://github.com/Neverdecel/CodeRAG/actions/workflows/ci-tests.yml) - -> **Note**: This POC was innovative for its time, but modern tools like Cursor and Windsurf now apply this principle directly in IDEs. This remains an excellent educational project for understanding RAG implementation. +[![CI](https://github.com/Neverdecel/CodeRAG/actions/workflows/ci-tests.yml/badge.svg?branch=master)](https://github.com/Neverdecel/CodeRAG/actions/workflows/ci-tests.yml) -## ✨ What is CodeRAG? 
+CodeRAG indexes a whole codebase into a hybrid (vector + keyword) search index and answers +questions like *"where is retry/backoff handled?"* with the exact functions, classes, and +files that matter — ranked by meaning, not just string match. -CodeRAG combines **Retrieval-Augmented Generation (RAG)** with AI to provide intelligent coding assistance. Instead of limited context windows, it indexes your entire codebase and provides contextual suggestions based on your complete project. +It runs **entirely on your machine with no API key** (a local ONNX embedding model is the +default), keeps its index **up to date as you edit**, and is built to stay fast on **large +codebases**. Use it from the **CLI**, embed it as a **Python library**, self-host it as an +**HTTP service**, or browse with the **web UI**. -### 🎯 Core Idea +> Built for the cases off-the-shelf IDE assistants don't cover well: a codebase that's too +> big, too private, or too custom — or a search/RAG capability you want to own and embed in +> your own tools. -Most coding assistants work with limited scope, but CodeRAG provides the full context of your project by: -- **Real-time indexing** of your entire codebase using FAISS vector search -- **Semantic code search** powered by OpenAI embeddings -- **Contextual AI responses** that understand your project structure +--- -## 🚀 Quick Start +## ✨ Highlights -### Prerequisites -- Python 3.11+ -- OpenAI API Key ([Get one here](https://platform.openai.com/api-keys)) +- **Local-first, zero-key.** Default embeddings run locally via [fastembed](https://github.com/qdrant/fastembed) (ONNX, no PyTorch). OpenAI is optional. +- **Symbol-aware chunking.** Indexes *functions, classes, and methods* (Python via `ast`; JS/TS/Go/Rust/Java via tree-sitter), not crude fixed-size blocks — so results point at real code units with `file:line` citations. +- **Hybrid retrieval.** Dense vector search **+** BM25 keyword search, fused with Reciprocal Rank Fusion. 
Great at both "what does this *mean*" and exact-identifier lookups. +- **Incremental & live.** Content-hashed indexing only re-embeds files that changed; a debounced watcher keeps the index current as you code. No duplicate or stale vectors. +- **Built to scale.** Exact `Flat` search for small repos, automatic switch to approximate `IVF` past a threshold so it stays fast at 100k+ chunks. +- **Four surfaces, one engine.** CLI · Python library · HTTP/REST · Streamlit UI — all thin wrappers over the same `CodeRAG` object. -### Installation +## 🚀 Quick start ```bash -# Clone the repository -git clone https://github.com/your-username/CodeRAG.git -cd CodeRAG - -# Create virtual environment -python -m venv venv -source venv/bin/activate # On Windows: venv\\Scripts\\activate - -# Install dependencies (installs the package with dev extras) -pip install -r requirements.txt - -# Configure environment -cp example.env .env -# Edit .env with your OpenAI API key and settings +pip install -e . # core engine (local embeddings included) +# optional extras: +pip install -e ".[server]" # HTTP/REST API +pip install -e ".[ui]" # Streamlit web UI +pip install -e ".[openai]" # OpenAI embeddings / LLM answers ``` -> The requirements file simply references `-e .[dev]`; feel free to run -> `pip install -e .[dev]` directly if you prefer editable installs. - -### Configuration +Index a codebase and search it — no configuration, no API key: -Create a `.env` file with your settings: +```bash +coderag index --watched-dir /path/to/your/repo +coderag search "where are duplicate vectors removed on file change" --watched-dir /path/to/your/repo +``` -```env -OPENAI_API_KEY=your_openai_api_key_here -OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 -OPENAI_CHAT_MODEL=gpt-4 -WATCHED_DIR=/path/to/your/code/directory -FAISS_INDEX_FILE=./coderag_index.faiss -EMBEDDING_DIM=1536 +``` +1. 
coderag/indexer.py:141 (Indexer._index_file) [method, sim=0.70] + def _index_file(self, item): removed = 0; existing = self.store.get_file(item.rel) … +2. coderag/indexer.py:1 [window, sim=0.74] + """Incremental indexing orchestration. ...the critical correctness property…""" ``` -### Running CodeRAG +By default the index lives in `./.coderag/`. Set `CODERAG_WATCHED_DIR` / `CODERAG_STORE_DIR` +(or copy `example.env` to `.env`) to avoid repeating flags. -```bash -# Start the backend (indexing and monitoring) -python main.py +## 🧑‍💻 The four surfaces -# In a separate terminal, start the web interface -streamlit run app.py +### CLI -# Query the local index from the terminal (after indexing completes) -coderag-cli "how is faiss configured?" +```bash +coderag index [PATH] [--full] # build / incrementally update the index +coderag search "QUERY" [-k 8] # hybrid search; add --json or --answer +coderag watch # index, then keep it live as files change +coderag serve --port 8000 # run the HTTP API (needs [server]) +coderag ui # launch the web UI (needs [ui]) +coderag status # index stats (files, chunks, model, index type) ``` -## 📖 How It Works - -```mermaid -graph LR - A[Code Files] --> B[File Monitor] - B --> C[OpenAI Embeddings] - C --> D[FAISS Vector DB] - E[User Query] --> F[Semantic Search] - D --> F - F --> G[Retrieved Context] - G --> H[OpenAI GPT] - H --> I[AI Response] -``` +### Python library -1. **Indexing**: CodeRAG monitors your code directory and generates embeddings for Python files -2. **Storage**: Embeddings are stored in a FAISS vector database with metadata -3. **Search**: User queries are embedded and matched against the code database -4. 
**Generation**: Retrieved code context is sent to GPT models for intelligent responses +```python +from coderag import CodeRAG, Config -## 🛠️ Architecture +cr = CodeRAG(Config.from_env(watched_dir="/path/to/repo")) +cr.index() +for hit in cr.search("how is the FAISS index persisted?"): + print(f"{hit.location} {hit.symbol} (sim={hit.similarity:.2f})") + print(hit.text) ``` -CodeRAG/ -├── 🧠 coderag/ # Core RAG functionality -│ ├── config.py # Environment configuration -│ ├── embeddings.py # OpenAI embedding generation -│ ├── index.py # FAISS vector operations -│ ├── search.py # Semantic code search -│ └── monitor.py # File system monitoring -├── 🌐 app.py # Streamlit web interface -├── 🔧 main.py # Backend indexing service -├── 🔗 prompt_flow.py # RAG pipeline orchestration -└── 📋 requirements.txt # Dependencies -``` - -### Key Components - -- **🔍 Vector Search**: FAISS-powered similarity search for code retrieval -- **🎯 Smart Embeddings**: OpenAI embeddings capture semantic code meaning -- **📡 Real-time Updates**: Watchdog monitors file changes for live indexing -- **💬 Conversational UI**: Streamlit interface with chat-like experience -## 🎪 Usage Examples +### HTTP / REST (`coderag serve`) -### Ask About Your Code -``` -"How does the FAISS indexing work in this codebase?" -"Where is error handling implemented?" -"Show me examples of the embedding generation process" +```bash +curl "http://127.0.0.1:8000/search?q=token%20validation&k=5" +curl -X POST http://127.0.0.1:8000/index -d '{"full": false}' -H 'content-type: application/json' +curl "http://127.0.0.1:8000/status" +curl "http://127.0.0.1:8000/file?path=coderag/api.py&start_line=1&end_line=40" ``` -### Get Improvements -``` -"How can I optimize the search performance?" -"What are potential security issues in this code?" -"Suggest better error handling for the monitor module" -``` +Self-host it once and point any number of custom apps or teammates at a big shared codebase. 
-### Debug Issues -``` -"Why might the search return no results?" -"How do I troubleshoot OpenAI connection issues?" -"What could cause indexing to fail?" -``` +### Web UI (`coderag ui`) -## ⚙️ Development +Streamlit app: search box, retrieved chunks with `path:line` citations and similarity +scores, a one-click **Reindex** button, and an optional streamed LLM answer (when an OpenAI +key is configured). -### Code Quality Tools +## 🏗️ How it works -```bash -# Install pre-commit hooks -pip install pre-commit -pre-commit install -pre-commit run --all-files +```mermaid +graph LR + A[Source files] --> B[Symbol-aware chunking<br/>ast / tree-sitter] + B --> C[Embeddings<br/>fastembed · OpenAI] + C --> D[(SQLite store<br/>chunks + vectors + FTS5)] + D --> E[FAISS index<br/>Flat → IVF] + Q[Query] --> F[Dense + BM25] + E --> F + D --> F + F --> G[Reciprocal Rank Fusion] + G --> H[Ranked hits<br/>path:line + score] ``` -### Testing +- **SQLite is the source of truth** (chunk text, line ranges, symbols, content hashes, and the + raw vectors). The **FAISS index is a rebuildable cache** — it can always be reconstructed + from SQLite, so switching models or index types never corrupts your data. +- Each file's content is **hashed**; unchanged files are skipped on re-index. A changed file's + old chunks are removed from *both* the store and the vector index **before** new ones are + added — so editing never accumulates stale or duplicate vectors. -```bash -# Test FAISS index functionality -python tests/test_faiss.py +## ⚙️ Configuration -# Test individual components -python scripts/initialize_index.py -python scripts/run_monitor.py -``` +Everything is configurable via `CODERAG_*` environment variables or a `.env` file (see +[`example.env`](example.env)). Common ones: -## 🐛 Troubleshooting +| Variable | Default | Meaning | +| --- | --- | --- | +| `CODERAG_PROVIDER` | `fastembed` | `fastembed` (local) · `openai` · `fake` | +| `CODERAG_MODEL` | `BAAI/bge-small-en-v1.5` | Local embedding model | +| `CODERAG_WATCHED_DIR` | cwd | Codebase to index | +| `CODERAG_STORE_DIR` | `./.coderag` | Where the DB + index live | +| `CODERAG_INDEX_TYPE` | `auto` | `auto` · `flat` · `ivf` | +| `CODERAG_IVF_THRESHOLD` | `50000` | Vectors before switching Flat → IVF | +| `CODERAG_TOP_K` | `8` | Results returned | +| `OPENAI_API_KEY` | – | Needed only for OpenAI embeddings / answers | -### Common Issues +## 🧩 Supported languages -**Search returns no results** -- Check if indexing completed: look for `coderag_index.faiss` file -- Verify OpenAI API key is working -- Ensure your query relates to indexed Python files +Symbol-aware (function/class/method level): **Python, JavaScript, TypeScript/TSX, Go, Rust, +Java**. Many other languages and docs (C/C++, Ruby, PHP, Markdown, YAML, …) are indexed with +a line-window fallback, so they remain searchable.
-**OpenAI API errors** -- Verify API key in `.env` file -- Check API usage limits and billing -- Ensure model names are correct (gpt-4, text-embedding-ada-002) +## 🛠️ Development -**File monitoring not working** -- Check `WATCHED_DIR` path in `.env` -- Ensure directory contains `.py` files -- Look for error logs in console output +```bash +python -m venv venv && source venv/bin/activate +pip install -e ".[dev,server,openai]" -## 🤝 Contributing +pytest -m "not integration" # fast, offline (uses a deterministic fake embedder) +pytest -m integration # exercises the real local model (downloads once) +black --check . && isort --check-only . && flake8 coderag tests && mypy coderag +``` -1. Fork the repository -2. Create a feature branch (`git checkout -b feature/amazing-feature`) -3. Make your changes with proper error handling and type hints -4. Run code quality checks (`pre-commit run --all-files`) -5. Commit your changes (`git commit -m 'Add amazing feature'`) -6. Push to the branch (`git push origin feature/amazing-feature`) -7. Open a Pull Request +See [DEVELOPMENT.md](DEVELOPMENT.md) and [AGENTS.md](AGENTS.md) for architecture and +contribution details. ## 📄 License -This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE-2.0.txt) file for details. +Apache License 2.0 — see [LICENSE](LICENSE-2.0.txt). 
## 🙏 Acknowledgments -- [OpenAI](https://openai.com/) for embedding and chat models -- [Facebook AI Similarity Search (FAISS)](https://github.com/facebookresearch/faiss) for vector search -- [Streamlit](https://streamlit.io/) for the web interface -- [Watchdog](https://github.com/gorakhargosh/watchdog) for file monitoring +[FAISS](https://github.com/facebookresearch/faiss) · [fastembed](https://github.com/qdrant/fastembed) · +[tree-sitter](https://tree-sitter.github.io/tree-sitter/) · [FastAPI](https://fastapi.tiangolo.com/) · +[Streamlit](https://streamlit.io/) · [watchdog](https://github.com/gorakhargosh/watchdog) --- -**⭐ If this project helps you, please give it a star!** +**⭐ If CodeRAG helps you, please give it a star!** diff --git a/example.env b/example.env index dab4d58..7f4b360 100644 --- a/example.env +++ b/example.env @@ -1,11 +1,30 @@ -# OpenAI API Configuration -OPENAI_API_KEY=sk-1234567890abcdefghijklmnopqrstuvwxyz1234 -OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 -OPENAI_CHAT_MODEL=gpt-4 +# CodeRAG configuration. Copy to .env and adjust. All values are optional — +# CodeRAG runs out of the box with a local model and no API key. -# Project Directory Configuration -WATCHED_DIR=/home/user/projects/my_codebase +# --- Embedding backend --- +# Provider: fastembed (local, default) | openai | fake +CODERAG_PROVIDER=fastembed +# Local embedding model (fastembed). 384-dim, no API key required. +CODERAG_MODEL=BAAI/bge-small-en-v1.5 +# Where downloaded local models are cached. +# CODERAG_CACHE_DIR=~/.cache/coderag -# FAISS Configuration -FAISS_INDEX_FILE=/home/user/projects/coderag/faiss_index.bin -EMBEDDING_DIM=1536 +# --- Locations --- +# The codebase to index/search (defaults to the current directory). +CODERAG_WATCHED_DIR=/path/to/your/codebase +# Where the index + database are stored (defaults to ./.coderag). +# CODERAG_STORE_DIR=./.coderag + +# --- Vector index (scale) --- +# auto | flat | ivf. 
"auto" uses exact Flat search and switches to approximate +# IVF automatically once the corpus grows past CODERAG_IVF_THRESHOLD vectors. +# CODERAG_INDEX_TYPE=auto +# CODERAG_IVF_THRESHOLD=50000 + +# --- Retrieval --- +# CODERAG_TOP_K=8 + +# --- Optional: OpenAI (only needed for `--provider openai` or LLM answers) --- +# OPENAI_API_KEY=sk-... +# CODERAG_OPENAI_MODEL=text-embedding-3-small +# CODERAG_CHAT_MODEL=gpt-4o-mini diff --git a/pyproject.toml b/pyproject.toml index 2d87b58..4aeee7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,23 +4,41 @@ build-backend = "setuptools.build_meta" [project] name = "coderag" -version = "0.1.0" -description = "Lean proof-of-concept retrieval-augmented assistant for codebases." +version = "1.0.0" +description = "Standalone, local-first semantic code-search engine for large and custom codebases." readme = "README.md" requires-python = ">=3.11" license = { file = "LICENSE-2.0.txt" } authors = [{ name = "Neverdecel" }] +keywords = ["code-search", "rag", "embeddings", "faiss", "semantic-search", "retrieval"] dependencies = [ - "faiss-cpu>=1.8.0.post1,<1.9", + "faiss-cpu>=1.8.0.post1,<1.10", "numpy>=1.26,<2", - "openai>=1.44,<2", "python-dotenv>=1.0,<2", - "streamlit>=1.38,<2", "tenacity>=8.5,<9", "watchdog>=4.0,<5", + "fastembed>=0.3,<1", + "tree-sitter>=0.23,<0.26", + "tree-sitter-python>=0.23", + "tree-sitter-javascript>=0.23", + "tree-sitter-typescript>=0.23", + "tree-sitter-go>=0.23", + "tree-sitter-rust>=0.23", + "tree-sitter-java>=0.23", + "tqdm>=4.66", ] [project.optional-dependencies] +server = [ + "fastapi>=0.110,<1", + "uvicorn[standard]>=0.29", +] +ui = [ + "streamlit>=1.38,<2", +] +openai = [ + "openai>=1.44,<2", +] dev = [ "black>=24.8", "flake8>=7.1", @@ -28,10 +46,21 @@ dev = [ "mypy>=1.11", "pre-commit>=3.7", "pytest>=8.3", + "pytest-cov>=5.0", + "httpx>=0.27", ] [project.scripts] -coderag-cli = "coderag.cli:main" +coderag = "coderag.surfaces.cli:main" + +[tool.setuptools.packages.find] +include = ["coderag*"] 
+ +[tool.pytest.ini_options] +markers = [ + "integration: tests that download models or hit the network (deselected in CI)", +] +addopts = "-q" [tool.black] line-length = 88 @@ -45,6 +74,7 @@ extend-exclude = ''' | \.mypy_cache | \.tox | \.venv + | venv | build | dist | env @@ -64,3 +94,4 @@ disallow_untyped_defs = false warn_unused_ignores = true warn_redundant_casts = true check_untyped_defs = true +exclude = ['venv/', 'build/'] diff --git a/readme.rst b/readme.rst deleted file mode 100644 index ffbf749..0000000 --- a/readme.rst +++ /dev/null @@ -1,134 +0,0 @@ -Important Note -================== - -This POC was nice for it's time. However tools like Cursor and Windsurf are now applying this principle embedded in the IDE. - -Project Motivation -================== - -This project came from a simple idea: what if you could provide an entire codebase to an LLM instead of just small pieces? -Most coding assistants, like co-pilots, work on a limited scope, but I wanted something that could handle the full context of a project. - -By integrating the full codebase with Retrieval-Augmented Generation (RAG), this POC aims to improve the quality and relevance of -code suggestions. The goal is to see how having the complete code available for real-time querying can enhance productivity. - -CodeRAG -======= -CodeRAG is an AI-powered code retrieval and augmentation tool that leverages OpenAI's models (such as ``gpt-4`` or ``gpt-3.5-turbo``) for real-time codebase querying, indexing, and improvement. This project integrates a Retrieval-Augmented Generation (RAG) system to help developers seamlessly search through code, receive suggestions, and implement improvements. - -Features --------- - -- **Real-time Codebase Indexing**: Automatically indexes code files upon changes, with real-time updates. -- **Vector Database Search**: Utilizes FAISS or a similar vector database for fast, efficient code search using embeddings. 
-- **Conversational Coding Assistance**: Integrates OpenAI's GPT models to provide contextual code suggestions, improvements, and fixes. -- **Configurable Settings**: Environment-specific settings are managed using a ``.env`` file for API keys, model selection, and directories. - -Tech Stack ----------- - -- **OpenAI API**: Leverages GPT-4o (or any other OpenAI model) for conversational and coding improvements. -- **Python**: Core functionality and API interactions. -- **FAISS (Facebook AI Similarity Search)**: For vector-based searching. -- **python-dotenv**: For managing environment variables. -- **Retrieval-Augmented Generation (RAG)**: Combines search and generative models. - -Setup Instructions ------------------- - -Prerequisites -^^^^^^^^^^^^^ - -- **Python 3.8+** -- **OpenAI API Key** (You can get one `here `_) -- **FAISS** - -Step 1: Clone the Repository -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: bash - - git clone https://github.com/yourusername/CodeRAG.git - cd CodeRAG - -Step 2: Install Dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Create a virtual environment (recommended): - -.. code-block:: bash - - python3 -m venv venv - source venv/bin/activate # On Windows use `venv\Scripts\activate` - -Install required packages: - -.. code-block:: bash - - pip install -r requirements.txt - -Step 3: Configure Environment Variables -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Create a ``.env`` file in the root of the project and add the following variables: - -.. code-block:: bash - - OPENAI_API_KEY=your_openai_api_key - OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 - OPENAI_CHAT_MODEL=gpt-4o - WATCHED_DIR=path_to_your_code_directory - FAISS_INDEX_FILE=path_to_faiss_index - EMBEDDING_DIM=1536 # Modify if you're using a different embedding model - -Step 4: Run the Application -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1. **Start the Backend**: - - To start the backend (indexing, embeddings, and monitoring): - - .. code-block:: bash - - python main.py - -2. 
**Start the Frontend**: - - To launch the Streamlit UI: - - .. code-block:: bash - - streamlit run app.py - -Usage ------ - -1. **Ask a Question**: Type your question or code request into the interface. The model will search the indexed codebase and provide suggestions or improvements. -2. **Review Suggestions**: You'll receive a merged or fixed version of the code based on the model's analysis. -3. **Conversational History**: The system keeps track of your queries and the AI responses for better context in future interactions. - -Project Structure ------------------ - -- ``main.py``: The main script to run the application. -- ``prompt_flow.py``: Handles querying OpenAI's API and manages the search and conversational history. -- ``coderag/config.py``: Stores configuration and environment variables. -- ``coderag/search.py``: Manages vector database (FAISS) searches for relevant code snippets. -- ``.env``: Holds environment-specific settings (OpenAI API keys, model configuration, etc.). -- ``requirements.txt``: Lists the Python dependencies needed to run the project. - -Contributing ------------- - -Feel free to fork this repository, open issues, and submit pull requests. - -1. Fork the repository. -2. Create your feature branch (``git checkout -b feature/your-feature``). -3. Commit your changes (``git commit -am 'Add new feature'``). -4. Push to the branch (``git push origin feature/your-feature``). -5. Open a pull request. - -License -------- - -This project is licensed under the Apache License. See the LICENSE file for details. 
diff --git a/requirements.txt b/requirements.txt index aefbcb6..5d985d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ --e .[dev] +-e .[dev,server,ui,openai] From 7eb69112769344a972071b50575ca56731143c42 Mon Sep 17 00:00:00 2001 From: fastsoab Date: Mon, 1 Jun 2026 20:04:36 +0200 Subject: [PATCH 5/5] ci: fix test collection under bare pytest (pythonpath) CI runs 'pytest' directly (not 'python -m pytest'), so the repo root wasn't on sys.path and 'from tests.conftest import ...' failed with ModuleNotFoundError. Add pythonpath=['.'] to the pytest config so it resolves regardless of invocation. Co-Authored-By: Claude Opus 4.8 (1M context) --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4aeee7e..cb238ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,9 @@ markers = [ "integration: tests that download models or hit the network (deselected in CI)", ] addopts = "-q" +# Put the repo root on sys.path so `from tests.conftest import ...` resolves under a +# bare `pytest` invocation (not just `python -m pytest`). +pythonpath = ["."] [tool.black] line-length = 88