-
Notifications
You must be signed in to change notification settings - Fork 105
feat: Refactor ignore #100
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
e25ce3c
feat: refactor file walker
stephantul a5b376c
additional behavior
stephantul 811c84c
stop following symlinks
stephantul 8d13eac
tests
stephantul e199c3f
remove comment
stephantul 00cbe79
address comments
stephantul File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,149 +1,131 @@ | ||
| import os | ||
| from collections.abc import Iterator | ||
| from collections.abc import Iterator, Sequence | ||
| from dataclasses import dataclass | ||
| from enum import Enum | ||
| from pathlib import Path | ||
|
|
||
| from pathspec import GitIgnoreSpec | ||
|
|
||
|
|
||
| class FileCategory(str, Enum): | ||
| CODE = "CODE" | ||
| DOCUMENT = "DOCUMENT" | ||
| @dataclass(frozen=True) | ||
| class IgnoreSpec: | ||
| base: Path | ||
| spec: GitIgnoreSpec | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class FileType: | ||
| """Language and indexing policy for a file extension.""" | ||
|
|
||
| language: str | ||
| category: FileCategory | ||
|
|
||
|
|
||
| FILE_TYPES: dict[str, FileType] = { | ||
| ".py": FileType("python", FileCategory.CODE), | ||
| ".js": FileType("javascript", FileCategory.CODE), | ||
| ".jsx": FileType("javascript", FileCategory.CODE), | ||
| ".ts": FileType("typescript", FileCategory.CODE), | ||
| ".tsx": FileType("typescript", FileCategory.CODE), | ||
| ".go": FileType("go", FileCategory.CODE), | ||
| ".rs": FileType("rust", FileCategory.CODE), | ||
| ".java": FileType("java", FileCategory.CODE), | ||
| ".kt": FileType("kotlin", FileCategory.CODE), | ||
| ".kts": FileType("kotlin", FileCategory.CODE), | ||
| ".rb": FileType("ruby", FileCategory.CODE), | ||
| ".php": FileType("php", FileCategory.CODE), | ||
| ".c": FileType("c", FileCategory.CODE), | ||
| ".h": FileType("c", FileCategory.CODE), | ||
| ".cpp": FileType("cpp", FileCategory.CODE), | ||
| ".hpp": FileType("cpp", FileCategory.CODE), | ||
| ".cs": FileType("csharp", FileCategory.CODE), | ||
| ".swift": FileType("swift", FileCategory.CODE), | ||
| ".scala": FileType("scala", FileCategory.CODE), | ||
| ".sbt": FileType("scala", FileCategory.CODE), | ||
| ".ex": FileType("elixir", FileCategory.CODE), | ||
| ".exs": FileType("elixir", FileCategory.CODE), | ||
| ".dart": FileType("dart", FileCategory.CODE), | ||
| ".lua": FileType("lua", FileCategory.CODE), | ||
| ".sql": FileType("sql", FileCategory.CODE), | ||
| ".sh": FileType("bash", FileCategory.CODE), | ||
| ".bash": FileType("bash", FileCategory.CODE), | ||
| ".zig": FileType("zig", FileCategory.CODE), | ||
| ".hs": FileType("haskell", FileCategory.CODE), | ||
| ".md": FileType("markdown", FileCategory.DOCUMENT), | ||
| ".yaml": FileType("yaml", FileCategory.DOCUMENT), | ||
| ".yml": FileType("yaml", FileCategory.DOCUMENT), | ||
| ".toml": FileType("toml", FileCategory.DOCUMENT), | ||
| ".json": FileType("json", FileCategory.DOCUMENT), | ||
| } | ||
|
|
||
| DEFAULT_IGNORED_DIRS: frozenset[str] = frozenset( | ||
| _DEFAULT_IGNORED_DIRS: frozenset[str] = frozenset( | ||
| { | ||
| ".git", | ||
| ".hg", | ||
| ".svn", | ||
| "__pycache__", | ||
| "node_modules", | ||
| ".venv", | ||
| "venv", | ||
| ".tox", | ||
| ".mypy_cache", | ||
| ".pytest_cache", | ||
| ".ruff_cache", | ||
| ".cache", | ||
| ".semble", | ||
| ".next", | ||
| "dist", | ||
| "build", | ||
| ".eggs", | ||
| ".git/", | ||
| ".hg/", | ||
| ".svn/", | ||
| "__pycache__/", | ||
| "node_modules/", | ||
| ".venv/", | ||
| "venv/", | ||
| ".tox/", | ||
| ".mypy_cache/", | ||
| ".pytest_cache/", | ||
| ".ruff_cache/", | ||
| ".cache/", | ||
| ".semble/", | ||
| ".next/", | ||
| "dist/", | ||
| "build/", | ||
| ".eggs/", | ||
| } | ||
| ) | ||
|
|
||
|
|
||
| def language_for_path(path: Path) -> str | None: | ||
| """Return the language for a file path, or None for unknown extensions.""" | ||
| if spec := FILE_TYPES.get(path.suffix.lower()): | ||
| return spec.language | ||
| return None | ||
|
|
||
| def _load_ignore_for_dir(directory: Path) -> GitIgnoreSpec | None: | ||
| """Loads a gitignore and sembleignore for a dir.""" | ||
| gitignore = directory / ".gitignore" | ||
| sembleignore = directory / ".sembleignore" | ||
|
|
||
| def filter_extensions(extensions: frozenset[str] | None, *, include_text_files: bool) -> frozenset[str]: | ||
| """Return the set of file extensions to index.""" | ||
| if extensions is not None: | ||
| return extensions | ||
| # Always index code files | ||
| categories_to_include = {FileCategory.CODE} | ||
| if include_text_files: | ||
| categories_to_include.add(FileCategory.DOCUMENT) | ||
| # Return a default set of extensions | ||
| return frozenset(ext for ext, spec in FILE_TYPES.items() if spec.category in categories_to_include) | ||
|
|
||
|
|
||
| def _load_root_gitignore(root: Path) -> GitIgnoreSpec | None: | ||
| """Load the root-level .gitignore as a spec, if present.""" | ||
| gitignore = root / ".gitignore" | ||
| if not gitignore.is_file(): | ||
| return None | ||
| return GitIgnoreSpec.from_lines(gitignore.read_text(encoding="utf-8", errors="ignore").splitlines()) | ||
|
|
||
|
|
||
| def _dir_is_gitignored(gitignore: GitIgnoreSpec, rel: str) -> bool: | ||
| """Return True if rel (a POSIX path relative to the gitignore root) matches a gitignore pattern for directories.""" | ||
| ignored = False | ||
| for pattern in gitignore.patterns: | ||
| if pattern.include is not None and pattern.match_file(rel): | ||
| ignored = pattern.include | ||
| return ignored | ||
| lines = [] | ||
| if gitignore.is_file(): | ||
| lines.extend(gitignore.read_text(encoding="utf-8", errors="ignore").splitlines()) | ||
| if sembleignore.is_file(): | ||
| lines.extend(sembleignore.read_text(encoding="utf-8", errors="ignore").splitlines()) | ||
| if lines: | ||
| return GitIgnoreSpec.from_lines(lines) | ||
| return None | ||
|
|
||
|
|
||
| def walk_files(root: Path, extensions: frozenset[str], ignore: frozenset[str] | None = None) -> Iterator[Path]: | ||
| def walk_files(root: Path, extensions: Sequence[str], ignore: Sequence[str] | None = None) -> Iterator[Path]: | ||
| """Yield files under root matching extensions, skipping ignored paths. | ||
|
|
||
| Directories matching _DEFAULT_IGNORED_DIRS and any patterns in ignore are skipped | ||
| by default. Patterns from .gitignore and .sembleignore files found in root and in each directory below it are also honoured. | ||
|
|
||
| :param root: Root directory to walk. | ||
| :param extensions: Set of file extensions to include (e.g. {".py", ".js"}). | ||
| :param ignore: Additional directory names to ignore (e.g. {"build", "dist"}). | ||
| :param extensions: List of file extensions to match. | ||
| :param ignore: Additional patterns to ignore. | ||
| :yield: Path to each file under root matching the criteria. | ||
| :ytype: Path | ||
| """ | ||
| ignore_dirs = DEFAULT_IGNORED_DIRS | (ignore or frozenset()) | ||
| gitignore = _load_root_gitignore(root) | ||
| for dirpath, dirnames, filenames in os.walk(root): | ||
| rel_dir = Path(dirpath).relative_to(root) | ||
| kept: list[str] = [] | ||
| for dirname in dirnames: | ||
| if dirname in ignore_dirs: | ||
| continue | ||
| if gitignore is not None and _dir_is_gitignored(gitignore, (rel_dir / dirname).as_posix() + "/"): | ||
| # This should be a list. Traversal is done in order, so the order matters. | ||
| ignored = [] | ||
| extensions_as_patterns = [f"!*{ext}" for ext in extensions] | ||
| ignored.extend(extensions_as_patterns) | ||
| ignored.extend(_DEFAULT_IGNORED_DIRS) | ||
| # Always give user patterns preference | ||
| ignored.extend(ignore or []) | ||
| base_spec = GitIgnoreSpec.from_lines(ignored, backend="simple") | ||
| s = IgnoreSpec(base=root, spec=base_spec) | ||
| yield from _walk(root, [s]) | ||
|
|
||
|
|
||
| def _is_ignored(path: Path, specs: list[IgnoreSpec]) -> bool: | ||
| """Check if a path is ignored by any of the provided ignore specs.""" | ||
| is_dir = path.is_dir() | ||
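| # Directories start off unignored; files start off ignored and are only kept if a later pattern (e.g. an extension negation such as "!*.py") re-includes them | ||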
| ignored = not is_dir | ||
|
|
||
| for ignore_spec in specs: | ||
| try: | ||
| # A spec only applies to paths located under its base directory; skip the spec otherwise. | ||
| relative = path.relative_to(ignore_spec.base) | ||
| except ValueError: | ||
| continue | ||
|
|
||
| relative_str = relative.as_posix() | ||
| # We need to add a trailing slash. Gitignore | ||
| # matches dirs as trailing '/'. | ||
| if is_dir: | ||
| relative_str += "/" | ||
|
|
||
| # Loop over all the patterns | ||
| for pattern in ignore_spec.spec.patterns: | ||
| # Patterns whose include is None neither ignore nor re-include anything, so skip them. | ||
| if pattern.include is None: | ||
| continue | ||
| kept.append(dirname) | ||
| dirnames[:] = kept | ||
| for filename in sorted(filenames): | ||
| file_path = Path(dirpath) / filename | ||
| if file_path.suffix.lower() not in extensions: | ||
| continue | ||
| if gitignore is not None and gitignore.match_file((rel_dir / filename).as_posix()): | ||
| continue | ||
| yield file_path | ||
|
|
||
| if pattern.match_file(relative_str) is not None: | ||
| ignored = pattern.include | ||
|
|
||
| return ignored | ||
|
|
||
|
|
||
| def _walk( | ||
| directory: Path, | ||
| inherited_specs: list[IgnoreSpec], | ||
| ) -> Iterator[Path]: | ||
| """Recursive function for walking files under a directory.""" | ||
| active_specs = inherited_specs | ||
|
|
||
| spec = _load_ignore_for_dir(directory) | ||
| if spec is not None: | ||
| active_specs = [ | ||
| *inherited_specs, | ||
| IgnoreSpec(base=directory, spec=spec), | ||
| ] | ||
|
|
||
| for item in directory.iterdir(): | ||
| # Skip symlinks entirely: symlinked directories are not followed and symlinked files are not yielded | ||
| if item.is_symlink(): | ||
| continue | ||
| if _is_ignored(item, active_specs): | ||
| continue | ||
|
|
||
| if item.is_dir(): | ||
| yield from _walk(item, active_specs) | ||
| elif item.is_file(): | ||
| yield item |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.