fix: do not stop when scanning a file ending with \0 (#155)

Extract the decoding logic used when reading files from Docker images and reuse it when scanning files from the file system.
GitGuardian · Apr 15, 2022 · 9483b88 · 9483b88
1 parent 7c34d65
commit 9483b88
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 21 deletions.
diff --git a/ggshield/path.py b/ggshield/path.py
@@ -1,7 +1,7 @@
 import os
 import re
 from pathlib import Path
-from typing import Iterable, List, Set, Union
+from typing import Iterable, Iterator, List, Set, Union
 
 import click
 
@@ -86,8 +86,8 @@ def get_filepaths(
     return targets
 
 
-def generate_files_from_paths(paths: Iterable[str], verbose: bool) -> Iterable[File]:
-    """Generate a list of scannable files from a list of filepaths."""
+def generate_files_from_paths(paths: Iterable[str], verbose: bool) -> Iterator[File]:
+    """Loop on filepaths and return an iterator on scannable files."""
     for path in paths:
         if os.path.isdir(path) or not os.path.exists(path):
             continue
@@ -101,10 +101,7 @@ def generate_files_from_paths(paths: Iterable[str], verbose: bool) -> Iterable[F
             if verbose:
                 click.echo(f"ignoring binary file extension: {path}")
             continue
-        with open(path, "r") as file:
-            try:
-                content = file.read()
-                if content:
-                    yield File(content, file.name, file_size)
-            except UnicodeDecodeError:
-                pass
+        with open(path, "rb") as file:
+            content = file.read()
+            if content:
+                yield File.from_bytes(content, file.name)
diff --git a/ggshield/scan/docker.py b/ggshield/scan/docker.py
@@ -158,17 +158,11 @@ def _get_layer_files(archive: tarfile.TarFile, layer_info: Dict) -> Iterable[Fil
         if file is None:
             continue
 
-        file_content_raw = file.read()
-        if len(file_content_raw) > MAX_FILE_SIZE * 0.95:
+        file_content = file.read()
+        if len(file_content) > MAX_FILE_SIZE * 0.95:
             continue
 
-        file_content = (
-            file_content_raw.decode(errors="replace")
-            .replace("\0", " ")  # null character, not replaced by replace
-            .replace("\uFFFD", " ")  # replacement character
-        )
-        yield File(
-            document=file_content,
-            filename=os.path.join(archive.name, layer_filename, file_info.name),  # type: ignore # noqa
-            filesize=file_info.size,
+        yield File.from_bytes(
+            raw_document=file_content,
+            filename=os.path.join(archive.name, layer_filename, file_info.name),  # type: ignore
         )
diff --git a/ggshield/scan/scannable.py b/ggshield/scan/scannable.py
@@ -67,6 +67,15 @@ def __init__(self, document: str, filename: str, filesize: Optional[int] = None)
         self.filemode = Filemode.FILE
         self.filesize = filesize if filesize else len(self.document.encode("utf-8"))
 
+    @staticmethod
+    def from_bytes(raw_document: bytes, filename: str) -> "File":
+        document = (
+            raw_document.decode(errors="replace")
+            .replace("\0", " ")  # errors="replace" keeps `\0`, remove it
+            .replace("\uFFFD", " ")  # replacement character
+        )
+        return File(document, filename)
+
     @property
     def scan_dict(self) -> Dict[str, Any]:
         """Return a payload compatible with the scanning API."""

diff --git a/tests/test_path.py b/tests/test_path.py
@@ -0,0 +1,58 @@
+from pathlib import Path
+from typing import Callable
+
+import pytest
+
+from ggshield.path import generate_files_from_paths
+
+
+@pytest.mark.parametrize(
+    ["filename", "input_content", "expected_content"],
+    [
+        ("normal.txt", "Normal", "Normal"),
+        ("0-inside.txt", "Ins\0de", "Ins de"),
+    ],
+)
+def test_generate_files_from_paths(
+    tmp_path, filename: str, input_content: str, expected_content: str
+):
+    """
+    GIVEN a file
+    WHEN calling generate_files_from_paths() on it
+    THEN it returns the expected File instance
+    AND the content of the File instance is what is expected
+    """
+    path = tmp_path / filename
+    Path(path).write_text(input_content)
+
+    files = list(generate_files_from_paths([str(path)], verbose=False))
+
+    file = files[0]
+    assert file.filename == str(path)
+    assert file.document == expected_content
+
+    assert len(files) == 1
+
+
+@pytest.mark.parametrize(
+    ["filename", "creator"],
+    [
+        ("a_binary_file.tar", lambda x: x.write_text("Uninteresting")),
+        ("big_file", lambda x: x.write_text(2_000_000 * " ")),
+        ("i_am_a_dir", lambda x: x.mkdir()),
+    ],
+)
+def test_generate_files_from_paths_skips_files(
+    tmp_path, filename: str, creator: Callable[[Path], None]
+):
+    """
+    GIVEN a file which should be skipped
+    WHEN calling generate_files_from_paths() on it
+    THEN it should return an empty list
+    """
+    path = tmp_path / filename
+    creator(path)
+
+    files = list(generate_files_from_paths([str(path)], verbose=False))
+
+    assert files == []