Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions src/semble/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ def from_path(
:return: An indexed SembleIndex. Chunk file paths are relative to ``path``.
:raises FileNotFoundError: If `path` does not exist.
:raises NotADirectoryError: If `path` exists but is not a directory.
:raises ValueError: If `path` is a directory but contains no supported files.
"""
model = model or load_model()
path = Path(path)
Expand Down Expand Up @@ -172,10 +171,7 @@ def find_related(self, file_path: str, line: int, top_k: int = 5) -> list[Search
)
if target is None:
return []
if target.language:
selector = self._get_selector_vector(filter_languages=[target.language])
else:
selector = None
selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None
results = search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector)
return [r for r in results if r.chunk != target][:top_k]

Expand Down
2 changes: 1 addition & 1 deletion src/semble/mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex:
task = self._tasks[cache_key]
try:
return await asyncio.shield(task)
except asyncio.CancelledError:
except asyncio.CancelledError: # pragma: no cover
# If this waiter was cancelled but the task is still running, preserve it for
# other waiters. Only evict if the task itself was cancelled.
if task.done():
Expand Down
3 changes: 0 additions & 3 deletions src/semble/ranking/boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,6 @@ def _boost_symbol_definitions(
) -> None:
"""Boost chunks that define the queried symbol, scanning candidates and stem-matched non-candidates (in-place)."""
symbol_name = _extract_symbol_name(query)
if not symbol_name:
return

names = {symbol_name}
if symbol_name != query.strip():
names.add(query.strip())
Expand Down
6 changes: 0 additions & 6 deletions src/semble/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,6 @@ class SearchResult:
score: float
source: SearchMode

def __str__(self) -> str:
"""Return a human-readable summary of the result."""
header = f"{self.chunk.location} score={self.score:.3f}"
separator = "-" * len(header)
return f"{header}\n{separator}\n{self.chunk.content.strip()}\n"


@dataclass(frozen=True, slots=True)
class IndexStats:
Expand Down
122 changes: 66 additions & 56 deletions tests/test_chunker.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,85 @@
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from semble.index.chunker import _chunk_with_chonkie, chunk_file, chunk_lines
from semble.index.chunker import chunk_file, chunk_lines, chunk_source
from semble.index.file_walker import filter_extensions


def test_chunk_lines_basic(tmp_path: Path) -> None:
"""Chunks are produced with non-empty content."""
f = tmp_path / "test.py"
f.write_text("\n".join(f"line {i}" for i in range(10)))
chunks = chunk_lines(f.read_text(), str(f), "python", max_lines=5, overlap_lines=1)
assert len(chunks) >= 2
for c in chunks:
assert c.content.strip()


def test_chunk_lines_empty(tmp_path: Path) -> None:
"""Empty source produces no chunks."""
f = tmp_path / "empty.py"
f.write_text("")
chunks = chunk_lines("", str(f), "python")
assert chunks == []
def test_chunk_lines() -> None:
"""chunk_lines: empty input → []; real input → non-empty chunks starting at line 1."""
assert chunk_lines("", "empty.py", "python") == []


def test_chunk_lines_line_numbers(tmp_path: Path) -> None:
"""First chunk starts at line 1."""
content = "a\nb\nc\nd\ne\n"
f = tmp_path / "t.py"
chunks = chunk_lines(content, str(f), "python", max_lines=3, overlap_lines=0)
content = "\n".join(f"line {i}" for i in range(10))
chunks = chunk_lines(content, "test.py", "python", max_lines=5, overlap_lines=1)
assert len(chunks) >= 2
assert all(c.content.strip() for c in chunks)
assert chunks[0].start_line == 1


def test_chunk_file_nonexistent() -> None:
"""Non-existent file returns empty list without raising."""
chunks = chunk_file(Path("/nonexistent/file.py"))
assert chunks == []
@pytest.mark.parametrize(
("filename", "content"),
[
(None, None), # nonexistent path
("empty.py", " \n\n "), # whitespace-only
("file.xyz", "hello world\n" * 5), # unknown extension
],
ids=["nonexistent", "whitespace_only", "unknown_extension"],
)
def test_chunk_file_edge_cases_return_list(tmp_path: Path, filename: str | None, content: str | None) -> None:
"""chunk_file returns a list (usually empty) for missing / empty / unknown-type files without raising."""
if filename is None:
target = Path("/nonexistent/file.py")
else:
target = tmp_path / filename
assert content is not None
target.write_text(content)
chunks = chunk_file(target)
assert isinstance(chunks, list)


def test_chunk_file_empty(tmp_path: Path) -> None:
"""Whitespace-only file returns no chunks."""
f = tmp_path / "empty.py"
f.write_text(" \n\n ")
chunks = chunk_file(f)
assert chunks == []
def test_chunk_file_py_produces_sorted_chunks(tmp_py_file: Path) -> None:
"""Python file with functions produces at least one chunk in ascending start-line order."""
pytest.importorskip("tree_sitter_python")
chunks = chunk_file(tmp_py_file)
assert len(chunks) >= 1
start_lines = [c.start_line for c in chunks]
assert start_lines == sorted(start_lines)


def test_chunk_with_chonkie_fallback(tmp_path: Path) -> None:
"""Should fall back to line-based when given an unsupported language."""
f = tmp_path / "code.py"
f.write_text("def foo():\n pass\n")
chunks = _chunk_with_chonkie(f.read_text(), str(f), "python")
def _whitespace_chunker() -> MagicMock:
whitespace_chunk = MagicMock(text=" \n", start_index=0, end_index=0)
chunker = MagicMock()
chunker.chunk.return_value = [whitespace_chunk]
return chunker


@pytest.mark.parametrize(
"codechunker_patch",
[
{"side_effect": Exception("boom")}, # raises
{"return_value": MagicMock(chunk=MagicMock(return_value=[]))}, # empty result
{"return_value": _whitespace_chunker()}, # whitespace-only chunks
],
ids=["raises", "empty", "whitespace_only"],
)
def test_chunk_source_falls_back_when_chonkie_unusable(codechunker_patch: dict) -> None:
"""chunk_source falls back to line-based chunking when chonkie fails or yields nothing usable."""
source = "def foo():\n pass\n"
with patch("semble.index.chunker.CodeChunker", **codechunker_patch):
chunks = chunk_source(source, "foo.py", "python")
assert len(chunks) > 0
assert all(c.content.strip() for c in chunks)


def test_chunk_file_py_produces_chunks(tmp_py_file: Path) -> None:
"""Python file with functions is split into at least one chunk."""
chunks = chunk_file(tmp_py_file)
assert len(chunks) >= 1


def test_chunk_file_sorted_by_line(tmp_py_file: Path) -> None:
"""Chunks are returned in ascending start-line order."""
pytest.importorskip("tree_sitter_python")
chunks = chunk_file(tmp_py_file)
start_lines = [c.start_line for c in chunks]
assert start_lines == sorted(start_lines)
def test_chunk_source_empty_string() -> None:
"""chunk_source returns [] for whitespace-only input."""
assert chunk_source(" \n\n", "foo.py", "python") == []


def test_chunk_file_unknown_extension(tmp_path: Path) -> None:
"""Unknown file extension returns a list without raising."""
f = tmp_path / "file.xyz"
f.write_text("hello world\n" * 5)
chunks = chunk_file(f)
assert isinstance(chunks, list)
def test_filter_extensions_explicit() -> None:
"""filter_extensions returns the provided set unchanged when extensions is not None."""
explicit: frozenset[str] = frozenset({".py", ".ts"})
result = filter_extensions(explicit, include_text_files=False)
assert result == explicit
74 changes: 41 additions & 33 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import subprocess
from pathlib import Path
from typing import Any
from unittest.mock import patch

import pytest

Expand Down Expand Up @@ -59,33 +60,14 @@ def test_search_invalid_mode(indexed_index: SembleIndex) -> None:
indexed_index.search("query", mode="invalid")


def test_search_top_k_respected(indexed_index: SembleIndex) -> None:
"""Results never exceed the requested top_k."""
results = indexed_index.search("function", top_k=1, mode="bm25")
assert len(results) <= 1
def test_search_constraints(indexed_index: SembleIndex) -> None:
"""search: top_k is respected; no duplicate chunks are returned."""
assert len(indexed_index.search("function", top_k=1, mode="bm25")) <= 1


def test_search_no_duplicate_chunks(indexed_index: SembleIndex) -> None:
"""Each result chunk appears at most once in the result list."""
results = indexed_index.search("authenticate", top_k=5)
assert len(results) == len(set(r.chunk for r in results))


def test_find_related_returns_similar_chunks(indexed_index: SembleIndex) -> None:
"""find_related returns semantically similar chunks for a known file location."""
chunk = indexed_index.chunks[0]
results = indexed_index.find_related(chunk.file_path, chunk.start_line, top_k=3)
assert isinstance(results, list)
assert all(r.chunk != chunk for r in results)
assert len(results) <= 3


def test_find_related_unknown_file_returns_empty(indexed_index: SembleIndex) -> None:
"""find_related returns an empty list when the file is not in the index."""
results = indexed_index.find_related("/does/not/exist.py", 1)
assert results == []


@pytest.mark.parametrize("mode", ["bm25", "hybrid", "semantic"])
def test_search_with_filter_paths_does_not_crash(indexed_index: SembleIndex, mode: str) -> None:
"""Filtered search works regardless of where the selected chunk lives in the corpus."""
Expand All @@ -101,6 +83,17 @@ def test_search_empty_query_returns_empty(indexed_index: SembleIndex, mode: str,
assert indexed_index.search(query, mode=mode) == []


def test_find_related(indexed_index: SembleIndex) -> None:
"""find_related: returns similar chunks for a known location; returns [] for an unknown file."""
chunk = indexed_index.chunks[0]
results = indexed_index.find_related(chunk.file_path, chunk.start_line, top_k=3)
assert isinstance(results, list)
assert all(r.chunk != chunk for r in results)
assert len(results) <= 3

assert indexed_index.find_related("/does/not/exist.py", 1) == []


_GIT_ENV = {
**os.environ,
"GIT_AUTHOR_NAME": "test",
Expand Down Expand Up @@ -130,19 +123,13 @@ def git_repo(tmp_path: Path) -> Path:
return tmp_path


def test_from_git_indexes_local_repo(mock_model: Any, git_repo: Path) -> None:
"""from_git clones a local repo and returns a populated SembleIndex."""
def test_from_git_indexes_local_repo_with_relative_paths(mock_model: Any, git_repo: Path) -> None:
"""from_git clones a local repo, indexes it, and keeps chunk paths repo-relative."""
idx = SembleIndex.from_git(str(git_repo), model=mock_model)
assert idx.stats.indexed_files >= 1
assert idx.stats.total_chunks > 0
assert any("main.py" in c.file_path for c in idx.chunks)


def test_from_git_paths_are_repo_relative(mock_model: Any, git_repo: Path) -> None:
"""Chunk file_paths are repo-relative after cloning, not absolute temp-dir paths."""
idx = SembleIndex.from_git(str(git_repo), model=mock_model)
for chunk in idx.chunks:
assert not Path(chunk.file_path).is_absolute(), f"Expected relative path, got: {chunk.file_path}"
assert all(not Path(c.file_path).is_absolute() for c in idx.chunks)


def test_from_git_with_branch(mock_model: Any, tmp_path: Path) -> None:
Expand All @@ -159,7 +146,28 @@ def test_from_git_with_branch(mock_model: Any, tmp_path: Path) -> None:
assert "feature.py" in file_names


def test_from_git_invalid_url_raises(mock_model: Any) -> None:
"""from_git raises RuntimeError when the clone fails."""
@pytest.mark.parametrize(
("kind", "expected_exc"),
[("missing", FileNotFoundError), ("file", NotADirectoryError)],
)
def test_from_path_rejects_invalid_paths(
mock_model: Any, tmp_path: Path, kind: str, expected_exc: type[Exception]
) -> None:
"""from_path raises FileNotFoundError for missing paths and NotADirectoryError for files."""
if kind == "missing":
target = tmp_path / "does_not_exist"
else:
target = tmp_path / "not_a_dir.py"
target.write_text("x = 1\n")
with pytest.raises(expected_exc):
SembleIndex.from_path(target, model=mock_model)


def test_from_git_raises_on_failure(mock_model: Any) -> None:
"""from_git raises RuntimeError when the clone fails or git is not installed."""
with pytest.raises(RuntimeError, match="git clone failed"):
SembleIndex.from_git("/nonexistent/path/that/does/not/exist", model=mock_model)

with patch("semble.index.index.subprocess.run", side_effect=FileNotFoundError):
with pytest.raises(RuntimeError, match="git is not installed"):
SembleIndex.from_git("https://github.com/x/y", model=mock_model)
Loading
Loading