Skip to content

Commit

Permalink
fix: do not stop when scanning a file ending with \0 (#155)
Browse files Browse the repository at this point in the history
Extract the decoding logic used when reading files from Docker images
and reuse it when scanning files from the file system.
  • Loading branch information
agateau-gg committed Apr 15, 2022
1 parent 7c34d65 commit 9483b88
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 21 deletions.
17 changes: 7 additions & 10 deletions ggshield/path.py
@@ -1,7 +1,7 @@
import os
import re
from pathlib import Path
from typing import Iterable, List, Set, Union
from typing import Iterable, Iterator, List, Set, Union

import click

Expand Down Expand Up @@ -86,8 +86,8 @@ def get_filepaths(
return targets


def generate_files_from_paths(paths: Iterable[str], verbose: bool) -> Iterable[File]:
"""Generate a list of scannable files from a list of filepaths."""
def generate_files_from_paths(paths: Iterable[str], verbose: bool) -> Iterator[File]:
"""Loop on filepaths and return an iterator on scannable files."""
for path in paths:
if os.path.isdir(path) or not os.path.exists(path):
continue
Expand All @@ -101,10 +101,7 @@ def generate_files_from_paths(paths: Iterable[str], verbose: bool) -> Iterable[F
if verbose:
click.echo(f"ignoring binary file extension: {path}")
continue
with open(path, "r") as file:
try:
content = file.read()
if content:
yield File(content, file.name, file_size)
except UnicodeDecodeError:
pass
with open(path, "rb") as file:
content = file.read()
if content:
yield File.from_bytes(content, file.name)
16 changes: 5 additions & 11 deletions ggshield/scan/docker.py
Expand Up @@ -158,17 +158,11 @@ def _get_layer_files(archive: tarfile.TarFile, layer_info: Dict) -> Iterable[Fil
if file is None:
continue

file_content_raw = file.read()
if len(file_content_raw) > MAX_FILE_SIZE * 0.95:
file_content = file.read()
if len(file_content) > MAX_FILE_SIZE * 0.95:
continue

file_content = (
file_content_raw.decode(errors="replace")
.replace("\0", " ") # null character, not replaced by replace
.replace("\uFFFD", " ") # replacement character
)
yield File(
document=file_content,
filename=os.path.join(archive.name, layer_filename, file_info.name), # type: ignore # noqa
filesize=file_info.size,
yield File.from_bytes(
raw_document=file_content,
filename=os.path.join(archive.name, layer_filename, file_info.name), # type: ignore
)
9 changes: 9 additions & 0 deletions ggshield/scan/scannable.py
Expand Up @@ -67,6 +67,15 @@ def __init__(self, document: str, filename: str, filesize: Optional[int] = None)
self.filemode = Filemode.FILE
self.filesize = filesize if filesize else len(self.document.encode("utf-8"))

@staticmethod
def from_bytes(raw_document: bytes, filename: str) -> "File":
document = (
raw_document.decode(errors="replace")
.replace("\0", " ") # errors="replace" keeps `\0`, remove it
.replace("\uFFFD", " ") # replacement character
)
return File(document, filename)

@property
def scan_dict(self) -> Dict[str, Any]:
"""Return a payload compatible with the scanning API."""
Expand Down
58 changes: 58 additions & 0 deletions tests/test_path.py
@@ -0,0 +1,58 @@
from pathlib import Path
from typing import Callable

import pytest

from ggshield.path import generate_files_from_paths


@pytest.mark.parametrize(
["filename", "input_content", "expected_content"],
[
("normal.txt", "Normal", "Normal"),
("0-inside.txt", "Ins\0de", "Ins de"),
],
)
def test_generate_files_from_paths(
tmp_path, filename: str, input_content: str, expected_content: str
):
"""
GIVEN a file
WHEN calling generate_files_from_paths() on it
THEN it returns the expected File instance
AND the content of the File instance is what is expected
"""
path = tmp_path / filename
Path(path).write_text(input_content)

files = list(generate_files_from_paths([str(path)], verbose=False))

file = files[0]
assert file.filename == str(path)
assert file.document == expected_content

assert len(files) == 1


@pytest.mark.parametrize(
["filename", "creator"],
[
("a_binary_file.tar", lambda x: x.write_text("Uninteresting")),
("big_file", lambda x: x.write_text(2_000_000 * " ")),
("i_am_a_dir", lambda x: x.mkdir()),
],
)
def test_generate_files_from_paths_skips_files(
tmp_path, filename: str, creator: Callable[[Path], None]
):
"""
GIVEN a file which should be skipped
WHEN calling generate_files_from_paths() on it
THEN it should return an empty list
"""
path = tmp_path / filename
creator(path)

files = list(generate_files_from_paths([str(path)], verbose=False))

assert files == []

0 comments on commit 9483b88

Please sign in to comment.