Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 11 additions & 26 deletions src/semble/index/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from semble.index.dense import embed_chunks
from semble.index.sparse import enrich_for_bm25
from semble.tokens import tokenize
from semble.types import Chunk, Encoder, IndexStats
from semble.types import Chunk, Encoder


def create_index_from_path(
Expand All @@ -21,7 +21,7 @@ def create_index_from_path(
ignore: frozenset[str] | None = None,
include_docs: bool = False,
display_root: Path | None = None,
) -> tuple[bm25s.BM25, Vicinity, list[Chunk], IndexStats, Path]:
) -> tuple[bm25s.BM25, Vicinity, list[Chunk]]:
"""Create an index from a resolved directory, optionally storing chunk paths relative to display_root.

:param path: Resolved absolute path to index.
Expand All @@ -31,43 +31,28 @@ def create_index_from_path(
:param include_docs: If True, also index documentation files.
:param display_root: If set, chunk file paths are stored relative to this root.
:raises ValueError: if no items were found, no index can be created.
:return: Statistics about the indexed files and chunks.
:return: A bm25 index, vicinity index and list of chunks
"""
index_root = display_root or path
extensions = filter_extensions(extensions, include_docs=include_docs)

all_chunks: list[Chunk] = []
language_counts: dict[str, int] = {}
indexed_files = 0
chunks: list[Chunk] = []

for file_path in walk_files(path, extensions, ignore):
language = language_for_path(file_path)
with contextlib.suppress(OSError):
source = file_path.read_text(encoding="utf-8", errors="replace")
indexed_files += 1
chunk_path = str(file_path.relative_to(display_root)) if display_root else str(file_path)
file_chunks = chunk_source(source, chunk_path, language)
all_chunks.extend(file_chunks)
for chunk in file_chunks:
if chunk.language:
language_counts[chunk.language] = language_counts.get(chunk.language, 0) + 1
chunk_path = file_path.relative_to(display_root) if display_root else file_path
chunks.extend(chunk_source(source, str(chunk_path), language))

chunks = all_chunks

if all_chunks:
embeddings = embed_chunks(model, all_chunks)
if chunks:
embeddings = embed_chunks(model, chunks)
bm25_index = bm25s.BM25()
bm25_index.index(
[tokenize(enrich_for_bm25(chunk, index_root)) for chunk in all_chunks],
[tokenize(enrich_for_bm25(chunk, display_root or path)) for chunk in chunks],
show_progress=False,
)
semantic_index = Vicinity.from_vectors_and_items(embeddings, all_chunks, metric=Metric.COSINE)
semantic_index = Vicinity.from_vectors_and_items(embeddings, chunks, metric=Metric.COSINE)
else:
raise ValueError("Unable to create index.")

stats = IndexStats(
indexed_files=indexed_files,
total_chunks=len(all_chunks),
languages=language_counts,
)
return bm25_index, semantic_index, chunks, stats, index_root
return bm25_index, semantic_index, chunks
50 changes: 20 additions & 30 deletions src/semble/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import subprocess
import tempfile
from collections import defaultdict
from pathlib import Path

from bm25s import BM25
Expand Down Expand Up @@ -34,11 +35,24 @@ def __init__(
"""
self.model: Encoder = model
self.chunks: list[Chunk] = chunks
self.stats = IndexStats()
self._bm25_index: BM25 = bm25_index
self._semantic_index: Vicinity = semantic_index
self._index_root: Path = index_root

@property
def stats(self) -> IndexStats:
    """Summarise the chunks currently held by this index.

    The figures are derived from ``self.chunks`` on every access.

    :return: An IndexStats with the number of distinct chunk file paths, the
        total number of chunks, and the number of chunks per language
        (chunks with a falsy language are not counted per language).
    """
    # Distinct file paths across all chunks; several chunks may share one file.
    seen_paths = {chunk.file_path for chunk in self.chunks}

    per_language: dict[str, int] = defaultdict(int)
    for chunk in self.chunks:
        if not chunk.language:
            continue
        per_language[chunk.language] += 1

    return IndexStats(
        indexed_files=len(seen_paths),
        total_chunks=len(self.chunks),
        # Hand out a plain dict rather than the defaultdict used for counting.
        languages=dict(per_language),
    )

@classmethod
def from_path(
cls,
Expand All @@ -58,12 +72,12 @@ def from_path(
:return: An indexed SembleIndex.
"""
model = model or load_model()
bm25, vicinity, chunks, stats, index_root = cls.index(
path = Path(path)
bm25, vicinity, chunks = create_index_from_path(
path, model=model, extensions=extensions, ignore=ignore, include_docs=include_docs
)

index = SembleIndex(model, bm25, vicinity, chunks, index_root)
index.stats = stats
index = SembleIndex(model, bm25, vicinity, chunks, path)

return index

Expand Down Expand Up @@ -98,42 +112,18 @@ def from_git(
raise RuntimeError(f"git clone failed for {url!r}:\n{result.stderr.strip()}")
model = model or load_model()
resolved_path = Path(tmp_dir).resolve()
bm25, vicinity, chunks, stats, index_root = create_index_from_path(
bm25, vicinity, chunks = create_index_from_path(
resolved_path,
model=model,
extensions=extensions,
ignore=ignore,
include_docs=include_docs,
display_root=resolved_path,
)

index = SembleIndex(model, bm25, vicinity, chunks, index_root)
index.stats = stats
index = SembleIndex(model, bm25, vicinity, chunks, resolved_path)

return index

@classmethod
def index(
    cls,
    path: str | Path,
    model: Encoder,
    extensions: frozenset[str] | None = None,
    ignore: frozenset[str] | None = None,
    include_docs: bool = False,
) -> tuple[BM25, Vicinity, list[Chunk], IndexStats, Path]:
    """Resolve a directory path and build the index components for it.

    This is a thin wrapper: it resolves ``path`` and forwards everything to
    ``create_index_from_path``.

    :param path: Root directory to index; resolved before indexing.
    :param model: The model used to index.
    :param extensions: File extensions to include.
    :param ignore: Directory names to skip.
    :param include_docs: If True, also index documentation files.
    :return: The tuple produced by ``create_index_from_path`` (per the annotation:
        BM25 index, Vicinity index, chunks, index statistics and a path).
    """
    resolved_root = Path(path).resolve()
    return create_index_from_path(
        resolved_root,
        model=model,
        extensions=extensions,
        ignore=ignore,
        include_docs=include_docs,
    )

def find_related(self, file_path: str, line: int, top_k: int = 5) -> list[SearchResult]:
"""Return chunks semantically similar to the chunk at the given file location.

Expand Down
9 changes: 1 addition & 8 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,6 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex:
return SembleIndex.from_path(tmp_project, model=mock_model)


def test_index_returns_stats(mock_model: Encoder, tmp_project: Path) -> None:
    """Check that indexing a project yields populated file and chunk counts."""
    min_expected_files = 2  # auth.py, utils.py
    _bm25, _vicinity, _chunks, stats, _root = create_index_from_path(tmp_project, mock_model)
    assert stats.indexed_files >= min_expected_files
    assert stats.total_chunks > 0


@pytest.mark.parametrize(
("include_docs", "md_in_results"),
[(False, False), (True, True)],
Expand All @@ -31,7 +24,7 @@ def test_index_markdown_inclusion(
mock_model: Encoder, tmp_project: Path, include_docs: bool, md_in_results: bool
) -> None:
"""Markdown files are excluded by default and included when include_docs=True."""
_, _, chunks, _, _ = create_index_from_path(tmp_project, mock_model, include_docs=include_docs)
_, _, chunks = create_index_from_path(tmp_project, mock_model, include_docs=include_docs)
has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
assert has_md is md_in_results

Expand Down