From d10bc68748964b6e8e66889bad868dd86e9e5e64 Mon Sep 17 00:00:00 2001 From: stephantul Date: Wed, 15 Apr 2026 14:54:14 +0200 Subject: [PATCH] simplify code --- src/semble/index/create.py | 37 +++++++++------------------- src/semble/index/index.py | 50 +++++++++++++++----------------------- tests/test_index.py | 9 +------ 3 files changed, 32 insertions(+), 64 deletions(-) diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 6a45ec9..76ac6a3 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -11,7 +11,7 @@ from semble.index.dense import embed_chunks from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize -from semble.types import Chunk, Encoder, IndexStats +from semble.types import Chunk, Encoder def create_index_from_path( @@ -21,7 +21,7 @@ def create_index_from_path( ignore: frozenset[str] | None = None, include_docs: bool = False, display_root: Path | None = None, -) -> tuple[bm25s.BM25, Vicinity, list[Chunk], IndexStats, Path]: +) -> tuple[bm25s.BM25, Vicinity, list[Chunk]]: """Create an index from a resolved directory, optionally storing chunk paths relative to display_root. :param path: Resolved absolute path to index. @@ -31,43 +31,28 @@ def create_index_from_path( :param include_docs: If True, also index documentation files. :param display_root: If set, chunk file paths are stored relative to this root. :raises ValueError: if no items were found, no index can be created. - :return: Statistics about the indexed files and chunks. + :return: A bm25 index, vicinity index and list of chunks """ - index_root = display_root or path extensions = filter_extensions(extensions, include_docs=include_docs) - all_chunks: list[Chunk] = [] - language_counts: dict[str, int] = {} - indexed_files = 0 + chunks: list[Chunk] = [] for file_path in walk_files(path, extensions, ignore): language = language_for_path(file_path) with contextlib.suppress(OSError): source = file_path.read_text(encoding="utf-8", errors="replace") - indexed_files += 1 - chunk_path = str(file_path.relative_to(display_root)) if display_root else str(file_path) - file_chunks = chunk_source(source, chunk_path, language) - all_chunks.extend(file_chunks) - for chunk in file_chunks: - if chunk.language: - language_counts[chunk.language] = language_counts.get(chunk.language, 0) + 1 + chunk_path = file_path.relative_to(display_root) if display_root else file_path + chunks.extend(chunk_source(source, str(chunk_path), language)) - chunks = all_chunks - - if all_chunks: - embeddings = embed_chunks(model, all_chunks) + if chunks: + embeddings = embed_chunks(model, chunks) bm25_index = bm25s.BM25() bm25_index.index( - [tokenize(enrich_for_bm25(chunk, index_root)) for chunk in all_chunks], + [tokenize(enrich_for_bm25(chunk, display_root or path)) for chunk in chunks], show_progress=False, ) - semantic_index = Vicinity.from_vectors_and_items(embeddings, all_chunks, metric=Metric.COSINE) + semantic_index = Vicinity.from_vectors_and_items(embeddings, chunks, metric=Metric.COSINE) else: raise ValueError("Unable to create index.") - stats = IndexStats( - indexed_files=indexed_files, - total_chunks=len(all_chunks), - languages=language_counts, - ) - return bm25_index, semantic_index, chunks, stats, index_root + return bm25_index, semantic_index, chunks diff --git a/src/semble/index/index.py b/src/semble/index/index.py index b9bc27a..cb3e653 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -2,6 +2,7 @@ import subprocess import tempfile +from collections import defaultdict from pathlib import Path from bm25s import BM25 @@ -34,11 +35,24 @@ def __init__( """ self.model: Encoder = model self.chunks: list[Chunk] = chunks - self.stats = IndexStats() self._bm25_index: BM25 = bm25_index self._semantic_index: Vicinity = semantic_index self._index_root: Path = index_root + @property + def stats(self) -> IndexStats: + """Stats of an index.""" + indexed_files = set() + total_chunks = len(self.chunks) + language_counts: dict[str, int] = defaultdict(int) + + for chunk in self.chunks: + indexed_files.add(chunk.file_path) + if chunk.language: + language_counts[chunk.language] += 1 + + return IndexStats(indexed_files=len(indexed_files), total_chunks=total_chunks, languages=dict(language_counts)) + @classmethod def from_path( cls, @@ -58,12 +72,12 @@ def from_path( :return: An indexed SembleIndex. """ model = model or load_model() - bm25, vicinity, chunks, stats, index_root = cls.index( + path = Path(path) + bm25, vicinity, chunks = create_index_from_path( path, model=model, extensions=extensions, ignore=ignore, include_docs=include_docs ) - index = SembleIndex(model, bm25, vicinity, chunks, index_root) - index.stats = stats + index = SembleIndex(model, bm25, vicinity, chunks, path) return index @@ -98,42 +112,18 @@ def from_git( raise RuntimeError(f"git clone failed for {url!r}:\n{result.stderr.strip()}") model = model or load_model() resolved_path = Path(tmp_dir).resolve() - bm25, vicinity, chunks, stats, index_root = create_index_from_path( + bm25, vicinity, chunks = create_index_from_path( resolved_path, model=model, extensions=extensions, ignore=ignore, include_docs=include_docs, - display_root=resolved_path, ) - index = SembleIndex(model, bm25, vicinity, chunks, index_root) - index.stats = stats + index = SembleIndex(model, bm25, vicinity, chunks, resolved_path) return index - @classmethod - def index( - cls, - path: str | Path, - model: Encoder, - extensions: frozenset[str] | None = None, - ignore: frozenset[str] | None = None, - include_docs: bool = False, - ) -> tuple[BM25, Vicinity, list[Chunk], IndexStats, Path]: - """Index a directory using the backend configured at construction time. - - :param path: Root directory to index. - :param model: The model used to index. - :param extensions: File extensions to include. - :param ignore: Directory names to skip. - :param include_docs: If True, also index documentation files. - :return: Statistics about the indexed files and chunks. - """ - return create_index_from_path( - Path(path).resolve(), model, extensions=extensions, ignore=ignore, include_docs=include_docs - ) - def find_related(self, file_path: str, line: int, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the chunk at the given file location. diff --git a/tests/test_index.py b/tests/test_index.py index b61a14c..62906b3 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -16,13 +16,6 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex: return SembleIndex.from_path(tmp_project, model=mock_model) -def test_index_returns_stats(mock_model: Encoder, tmp_project: Path) -> None: - """Indexing returns stats with file and chunk counts populated.""" - _, _, _, stats, _ = create_index_from_path(tmp_project, mock_model) - assert stats.indexed_files >= 2 # auth.py, utils.py - assert stats.total_chunks > 0 - - @pytest.mark.parametrize( ("include_docs", "md_in_results"), [(False, False), (True, True)], @@ -31,7 +24,7 @@ def test_index_markdown_inclusion( mock_model: Encoder, tmp_project: Path, include_docs: bool, md_in_results: bool ) -> None: """Markdown files are excluded by default and included when include_docs=True.""" - _, _, chunks, _, _ = create_index_from_path(tmp_project, mock_model, include_docs=include_docs) + _, _, chunks = create_index_from_path(tmp_project, mock_model, include_docs=include_docs) has_md = ".md" in {Path(c.file_path).suffix for c in chunks} assert has_md is md_in_results