GitGuardian · agateau-gg · Jun 29, 2022 · Jun 24, 2022
diff --git a/pygitguardian/client.py b/pygitguardian/client.py
@@ -236,7 +236,10 @@ def content_scan(
         extra_headers: Optional[Dict[str, str]] = None,
     ) -> Union[Detail, ScanResult]:
         """
-        content_scan handles the /scan endpoint of the API
+        content_scan handles the /scan endpoint of the API.
+
+        If document contains NUL (0x00) characters, each one is replaced with
+        the Unicode replacement character (U+FFFD).
 
         :param filename: name of file, example: "intro.py"
         :param document: content of file
@@ -272,7 +275,10 @@ def multi_content_scan(
         extra_headers: Optional[Dict[str, str]] = None,
     ) -> Union[Detail, MultiScanResult]:
         """
-        multi_content_scan handles the /multiscan endpoint of the API
+        multi_content_scan handles the /multiscan endpoint of the API.
+
+        If documents contain NUL (0x00) characters, each one is replaced with
+        the Unicode replacement character (U+FFFD).
 
         :param documents: List of dictionaries containing the keys document
         and, optionally, filename.

diff --git a/pygitguardian/models.py b/pygitguardian/models.py
@@ -52,7 +52,7 @@ class DocumentSchema(BaseSchema):
     document = fields.String(required=True)
 
     @validates("document")
-    def validate_document(self, document: str) -> str:
+    def validate_document(self, document: str) -> None:
         """
         validate that document is smaller than scan limit
         """
@@ -64,10 +64,12 @@ def validate_document(self, document: str) -> str:
                 )
             )
 
-        if "\x00" in document:
-            raise ValidationError("document has null characters")
-
-        return document
+    @post_load
+    def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
+        doc = in_data["document"]
+        # Our API does not accept NUL (0x00) characters in documents, so replace them with U+FFFD
+        in_data["document"] = doc.replace("\0", "\uFFFD")
+        return in_data
 
 
 class Document(Base):

diff --git a/tests/cassettes/document_with_0_bytes.yaml b/tests/cassettes/document_with_0_bytes.yaml
@@ -0,0 +1,65 @@
+interactions:
+  - request:
+      body: '{"document": "Hello World"}'
+      headers:
+        Accept:
+          - '*/*'
+        Accept-Encoding:
+          - gzip, deflate
+        Connection:
+          - keep-alive
+        Content-Length:
+          - '27'
+        Content-Type:
+          - application/json
+        User-Agent:
+          - pygitguardian/1.3.4 (Linux;py3.8.10)
+      method: POST
+      uri: https://api.gitguardian.com/v1/scan
+    response:
+      body:
+        string:
+          '{"policy_break_count":0,"policies":["File extensions","Filenames","Secrets
+          detection"],"policy_breaks":[]}'
+      headers:
+        Access-Control-Expose-Headers:
+          - X-App-Version
+        Allow:
+          - POST, OPTIONS
+        Connection:
+          - keep-alive
+        Content-Length:
+          - '106'
+        Content-Type:
+          - application/json
+        Date:
+          - Fri, 24 Jun 2022 16:08:40 GMT
+        Referrer-Policy:
+          - strict-origin-when-cross-origin
+        Server:
+          - nginx
+        Set-Cookie:
+          - AWSALB=jzG+lNYQFwVa/HLEk17W6yiGRSKg6NTA2/1+uOmn+n5jG7J03MudYdFdbtJdN7+y9jwsoul66j7dHclQD7B8ZRa4FWTZJO3AeCHhfcZQxhwEb5uko4OvEhi9jD2o;
+            Expires=Fri, 01 Jul 2022 16:08:40 GMT; Path=/
+          - AWSALBCORS=jzG+lNYQFwVa/HLEk17W6yiGRSKg6NTA2/1+uOmn+n5jG7J03MudYdFdbtJdN7+y9jwsoul66j7dHclQD7B8ZRa4FWTZJO3AeCHhfcZQxhwEb5uko4OvEhi9jD2o;
+            Expires=Fri, 01 Jul 2022 16:08:40 GMT; Path=/; SameSite=None; Secure
+        Strict-Transport-Security:
+          - max-age=31536000; includeSubDomains
+        Vary:
+          - Cookie
+        X-App-Version:
+          - v2.7.5
+        X-Content-Type-Options:
+          - nosniff
+          - nosniff
+        X-Frame-Options:
+          - DENY
+          - SAMEORIGIN
+        X-Secrets-Engine-Version:
+          - 2.69.0
+        X-XSS-Protection:
+          - 1; mode=block
+      status:
+        code: 200
+        message: OK
+version: 1
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -362,12 +362,6 @@ def test_multi_content_scan(
             r"file exceeds the maximum allowed size",
             id="too large file",
         ),
-        pytest.param(
-            "dwhewe\x00ddw",
-            ValidationError,
-            r"document has null characters",
-            id="invalid type",
-        ),
     ],
 )
 def test_content_scan_exceptions(
@@ -437,6 +431,14 @@ def test_content_not_ok():
             True,
             id="secret with validity",
         ),
+        pytest.param(
+            "document_with_0_bytes",
+            {"document": "Hello\0World"},
+            0,
+            False,
+            False,
+            id="Document containing a 0 byte",
+        ),
         pytest.param(
             "filename",
             {"filename": FILENAME, "document": "normal"},

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -32,6 +32,12 @@ def test_document_model(self):
         assert isinstance(document.to_dict(), dict)
         assert isinstance(str(document), str)
 
+    def test_document_handle_0_bytes(self):
+        document = Document.SCHEMA.load(
+            {"filename": "name", "document": "hello\0world"}
+        )
+        assert document["document"] == "hello\uFFFDworld"
+
     @pytest.mark.parametrize(
         "schema_klass, expected_klass, instance_data",
         [