Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace 0 bytes in document instead of raising an exception #32

Merged
merged 1 commit into from Jun 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 8 additions & 2 deletions pygitguardian/client.py
Expand Up @@ -236,7 +236,10 @@ def content_scan(
extra_headers: Optional[Dict[str, str]] = None,
) -> Union[Detail, ScanResult]:
"""
content_scan handles the /scan endpoint of the API
content_scan handles the /scan endpoint of the API.

If document contains `0` bytes, they will be replaced with the Unicode
replacement character.

:param filename: name of file, example: "intro.py"
:param document: content of file
Expand Down Expand Up @@ -272,7 +275,10 @@ def multi_content_scan(
extra_headers: Optional[Dict[str, str]] = None,
) -> Union[Detail, MultiScanResult]:
"""
multi_content_scan handles the /multiscan endpoint of the API
multi_content_scan handles the /multiscan endpoint of the API.

If documents contain `0` bytes, they will be replaced with the Unicode
replacement character.

:param documents: List of dictionaries containing the keys document
and, optionally, filename.
Expand Down
12 changes: 7 additions & 5 deletions pygitguardian/models.py
Expand Up @@ -52,7 +52,7 @@ class DocumentSchema(BaseSchema):
document = fields.String(required=True)

@validates("document")
def validate_document(self, document: str) -> str:
def validate_document(self, document: str) -> None:
"""
validate that document is smaller than scan limit
"""
Expand All @@ -64,10 +64,12 @@ def validate_document(self, document: str) -> str:
)
)

if "\x00" in document:
raise ValidationError("document has null characters")

return document
@post_load
def replace_0_bytes(
    self, in_data: Dict[str, Any], **kwargs: Any
) -> Dict[str, Any]:
    """
    Replace every 0 byte of the loaded document with U+FFFD.

    The API does not accept 0 bytes in documents, so each one is swapped
    for the Unicode replacement character.

    :param in_data: deserialized payload, must contain a "document" key
    :param kwargs: extra keyword arguments, unused
    :return: the same mapping, updated in place
    """
    in_data["document"] = in_data["document"].replace("\0", "\uFFFD")
    return in_data


class Document(Base):
Expand Down
65 changes: 65 additions & 0 deletions tests/cassettes/document_with_0_bytes.yaml
@@ -0,0 +1,65 @@
interactions:
- request:
body: '{"document": "Hello World"}'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '27'
Content-Type:
- application/json
User-Agent:
- pygitguardian/1.3.4 (Linux;py3.8.10)
method: POST
uri: https://api.gitguardian.com/v1/scan
response:
body:
string:
'{"policy_break_count":0,"policies":["File extensions","Filenames","Secrets
detection"],"policy_breaks":[]}'
headers:
Access-Control-Expose-Headers:
- X-App-Version
Allow:
- POST, OPTIONS
Connection:
- keep-alive
Content-Length:
- '106'
Content-Type:
- application/json
Date:
- Fri, 24 Jun 2022 16:08:40 GMT
Referrer-Policy:
- strict-origin-when-cross-origin
Server:
- nginx
Set-Cookie:
- AWSALB=jzG+lNYQFwVa/HLEk17W6yiGRSKg6NTA2/1+uOmn+n5jG7J03MudYdFdbtJdN7+y9jwsoul66j7dHclQD7B8ZRa4FWTZJO3AeCHhfcZQxhwEb5uko4OvEhi9jD2o;
Expires=Fri, 01 Jul 2022 16:08:40 GMT; Path=/
- AWSALBCORS=jzG+lNYQFwVa/HLEk17W6yiGRSKg6NTA2/1+uOmn+n5jG7J03MudYdFdbtJdN7+y9jwsoul66j7dHclQD7B8ZRa4FWTZJO3AeCHhfcZQxhwEb5uko4OvEhi9jD2o;
Expires=Fri, 01 Jul 2022 16:08:40 GMT; Path=/; SameSite=None; Secure
Strict-Transport-Security:
- max-age=31536000; includeSubDomains
Vary:
- Cookie
X-App-Version:
- v2.7.5
X-Content-Type-Options:
- nosniff
- nosniff
X-Frame-Options:
- DENY
- SAMEORIGIN
X-Secrets-Engine-Version:
- 2.69.0
X-XSS-Protection:
- 1; mode=block
status:
code: 200
message: OK
version: 1
14 changes: 8 additions & 6 deletions tests/test_client.py
Expand Up @@ -362,12 +362,6 @@ def test_multi_content_scan(
r"file exceeds the maximum allowed size",
id="too large file",
),
pytest.param(
"dwhewe\x00ddw",
ValidationError,
r"document has null characters",
id="invalid type",
),
],
)
def test_content_scan_exceptions(
Expand Down Expand Up @@ -437,6 +431,14 @@ def test_content_not_ok():
True,
id="secret with validity",
),
pytest.param(
"document_with_0_bytes",
{"document": "Hello\0World"},
0,
False,
False,
id="Document containing a 0 byte",
),
pytest.param(
"filename",
{"filename": FILENAME, "document": "normal"},
Expand Down
6 changes: 6 additions & 0 deletions tests/test_models.py
Expand Up @@ -32,6 +32,12 @@ def test_document_model(self):
assert isinstance(document.to_dict(), dict)
assert isinstance(str(document), str)

def test_document_handle_0_bytes(self):
    """
    Loading a document that contains a 0 byte yields the same text with
    the 0 byte replaced by the Unicode replacement character.
    """
    payload = {"filename": "name", "document": "hello\0world"}
    loaded = Document.SCHEMA.load(payload)
    assert loaded["document"] == "hello\uFFFDworld"

@pytest.mark.parametrize(
"schema_klass, expected_klass, instance_data",
[
Expand Down