diff --git a/src/semble/index/file_walker.py b/src/semble/index/file_walker.py index db4ceba..4a6fbc5 100644 --- a/src/semble/index/file_walker.py +++ b/src/semble/index/file_walker.py @@ -61,24 +61,18 @@ def walk_files(root: Path, extensions: Sequence[str], ignore: Sequence[str] | No :yield: Path to each file under root matching the criteria. :ytype: Path """ - # This should be a list. Traversal is done in order, so the order matters. - ignored = [] - extensions_as_patterns = [f"!*{ext}" for ext in extensions] - ignored.extend(extensions_as_patterns) - ignored.extend(sorted(_DEFAULT_IGNORED_DIRS)) - # Always give user patterns preference - ignored.extend(ignore or []) - base_spec = GitIgnoreSpec.from_lines(ignored, backend="simple") + extensions_set = frozenset(extensions) + dir_patterns = list(sorted(_DEFAULT_IGNORED_DIRS)) + list(ignore or []) + base_spec = GitIgnoreSpec.from_lines(dir_patterns, backend="simple") s = IgnoreSpec(base=root, spec=base_spec) - yield from _walk(root, [s]) + yield from _walk(root, [s], extensions_set) -def _is_ignored(path: Path, specs: list[IgnoreSpec]) -> bool: +def _is_ignored(path: Path, specs: list[IgnoreSpec]) -> tuple[bool, bool]: """Check if a path is ignored by any of the provided ignore specs.""" is_dir = path.is_dir() - # Everything starts off as unignored - ignored = not is_dir - + ignored = False + found = False for ignore_spec in specs: try: # If there is no relative path, this is invalid. @@ -100,13 +94,20 @@ def _is_ignored(path: Path, specs: list[IgnoreSpec]) -> bool: if pattern.match_file(relative_str) is not None: ignored = pattern.include + # Bypass extension filter only for negation patterns with a file + # extension suffix (e.g. !special.kjs, !*.py). Patterns without + # a suffix (e.g. !vendor/, !.github/*) target directories or + # broad globs and should not bypass extension filtering. + pat = pattern.pattern or "" + found = not ignored and bool(Path(pat.rstrip("/")).suffix) - return ignored + return ignored, found def _walk( directory: Path, inherited_specs: list[IgnoreSpec], + extensions: frozenset[str], ) -> Iterator[Path]: """Recursive function for walking files under a directory.""" spec = _load_ignore_for_dir(directory) @@ -120,10 +121,11 @@ def _walk( # Don't follow symlinks if item.is_symlink(): continue - if _is_ignored(item, inherited_specs): + is_ignored, found = _is_ignored(item, inherited_specs) + if is_ignored: continue if item.is_dir(): - yield from _walk(item, inherited_specs) - elif item.is_file(): + yield from _walk(item, inherited_specs, extensions) + elif item.is_file() and (found or item.suffix.lower() in extensions): yield item diff --git a/src/semble/version.py b/src/semble/version.py index e656ac0..d054e10 100644 --- a/src/semble/version.py +++ b/src/semble/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 1, 7) +__version_triple__ = (0, 1, 8) __version__ = ".".join(map(str, __version_triple__)) diff --git a/tests/test_file_walker.py b/tests/test_file_walker.py index b93fb1d..184293b 100644 --- a/tests/test_file_walker.py +++ b/tests/test_file_walker.py @@ -56,6 +56,27 @@ def _touch(path: Path, content: str = "x = 1\n") -> None: "out/*\n!out/deep/keep.py\n", set(), ), + # Explicit file negation bypasses extension filter: !special.kjs is yielded even if .kjs is not in extensions. + ( + ["special.kjs", "other.kjs", "main.py"], + None, + "*.kjs\n!special.kjs\n", + {"main.py", "special.kjs"}, + ), + # Glob negation without suffix does NOT bypass extension filter. + ( + [".github/workflows/ci.yaml", "src/main.py"], + None, + "!.github/*\n", + {"src/main.py"}, + ), + # Directory negation does NOT bypass extension filter: files inside vendor/ still need a matching extension. + ( + ["vendor/special.kjs", "vendor/main.py"], + None, + "*\n!vendor/\n", + {"vendor/main.py"}, + ), ], ) def test_walk_files_filtering( @@ -85,9 +106,9 @@ def test_walk_files_prunes_ignored_dirs(tmp_path: Path) -> None: def test_is_ignored_skips_spec_with_unrelated_base(tmp_path: Path) -> None: """An IgnoreSpec whose base is not an ancestor of the path is silently skipped. - Files default to ignored in _is_ignored. When the first spec has an - unrelated base, the ValueError is caught and the spec is skipped without - crashing. A second spec with the correct base can still un-ignore the file. + When the first spec has an unrelated base, the ValueError is caught and the + spec is skipped without crashing. A second spec with the correct base can + still ignore the file. """ from pathspec import GitIgnoreSpec @@ -108,18 +129,20 @@ def test_is_ignored_skips_spec_with_unrelated_base(tmp_path: Path) -> None: spec=GitIgnoreSpec.from_lines(["*.py"]), ) - # With only the unrelated spec the file stays in its default state (ignored) + # With only the unrelated spec the file is not ignored (spec is skipped), # and, crucially, no exception is raised. - assert _is_ignored(target_file, [unrelated_spec]) is True + ignored, _ = _is_ignored(target_file, [unrelated_spec]) + assert ignored is False - # Spec rooted at project_a that un-ignores .py files + # Spec rooted at project_a that ignores .py files matching_spec = IgnoreSpec( base=project_a, - spec=GitIgnoreSpec.from_lines(["!*.py"]), + spec=GitIgnoreSpec.from_lines(["*.py"]), ) - # The unrelated spec is safely skipped; the matching spec un-ignores the file. - assert _is_ignored(target_file, [unrelated_spec, matching_spec]) is False + # The unrelated spec is safely skipped; the matching spec ignores the file. + ignored, _ = _is_ignored(target_file, [unrelated_spec, matching_spec]) + assert ignored is True def test_walk_files_skips_symlinks(tmp_path: Path) -> None: