Merge pull request #163 from LUMC/issue160

Add an encoding key to the YAML. Open and read each file at most once.
LUMC · Dec 21, 2022 · 714a0aa · 714a0aa
2 parents ef7f312 + 75de066
commit 714a0aa
Show file tree

Hide file tree

Showing 10 changed files with 101 additions and 64 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -12,6 +12,10 @@ version 2.0.0-dev
 + Python 3.6 is no longer supported. It has been removed from github actions,
   as such we can no longer guarantee that pytest-workflow works properly
   with python 3.6.
++ Added an optional encoding key for files, stdout and stderr so the file can
+  be opened with the proper encoding.
++ Make content tests more efficient by reading each file only once instead of
+  twice when there are both strings and regexes to check for.
 + When the ``--git-aware`` flag is used a submodule check is performed in order
   to assert that all submodules are properly checked out. This prevents
   unclear copying errors.

diff --git a/README.rst b/README.rst
@@ -103,6 +103,7 @@ predefined tests as well as custom tests are possible.
         must_not_contain:              # A list of strings that should NOT be in the file (optional)
           - "Cock a doodle doo"
         md5sum: e583af1f8b00b53cda87ae9ead880224   # Md5sum of the file (optional)
+        encoding: UTF-8                # Encoding for the text file (optional). Defaults to system locale.
 
   - name: simple echo                  # A second workflow. Notice the starting `-` which means
     command: "echo moo"                # that workflow items are in a list. You can add as many workflows as you want
@@ -114,6 +115,7 @@ predefined tests as well as custom tests are possible.
         - "moo"
       must_not_contain:                # List of strings that should NOT be in stdout (optional)
         - "Cock a doodle doo"
+      encoding: ASCII                  # Encoding for stdout (optional). Defaults to system locale.
 
   - name: mission impossible           # Also failing workflows can be tested
     tags:                              # A list of tags that can be used to select which test
@@ -130,6 +132,7 @@ predefined tests as well as custom tests are possible.
         - "BSOD error, please contact the IT crowd"
       must_not_contain:                # A list of strings which should NOT be in stderr (optional)
         - "Mission accomplished!"
+      encoding: UTF-16                 # Encoding for stderr (optional). Defaults to system locale.
 
   - name: regex tests
     command: echo Hello, world

diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst
@@ -40,6 +40,7 @@ Test options
         must_not_contain:              # A list of strings that should NOT be in the file (optional)
           - "Cock a doodle doo"
         md5sum: e583af1f8b00b53cda87ae9ead880224   # Md5sum of the file (optional)
+        encoding: UTF-8                # Encoding for the text file (optional). Defaults to system locale.
 
   - name: simple echo                  # A second workflow. Notice the starting `-` which means
     command: "echo moo"                # that workflow items are in a list. You can add as many workflows as you want
@@ -51,6 +52,7 @@ Test options
         - "moo"
       must_not_contain:                # List of strings that should NOT be in stdout (optional)
         - "Cock a doodle doo"
+      encoding: ASCII                  # Encoding for stdout (optional). Defaults to system locale.
 
   - name: mission impossible           # Also failing workflows can be tested
     tags:                              # A list of tags that can be used to select which test
@@ -60,13 +62,14 @@ Test options
     files:
       - path: "fail.log"               # Multiple files can be tested for each workflow
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
-        contains: 
+        contains:
           - "starring"
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"
       must_not_contain:                # A list of strings which should NOT be in stderr (optional)
         - "Mission accomplished!"
+      encoding: UTF-16                 # Encoding for stderr (optional). Defaults to system locale.
 
   - name: regex tests
     command: echo Hello, world

diff --git a/src/pytest_workflow/content_tests.py b/src/pytest_workflow/content_tests.py
@@ -33,72 +33,42 @@
 
 
 def check_content(strings: Iterable[str],
-                  text_lines: Iterable[str]) -> Set[str]:
+                  patterns: Iterable[str],
+                  text_lines: Iterable[str]):
     """
-    Checks whether any of the strings is present in the text lines
+    Checks whether any of the strings or patterns is present in the text lines
     It only reads the lines once and it stops reading when
-    everything is found. This makes searching for strings in large bodies of
-    text more efficient.
-    :param strings: A list of strings for which the present is checked
+    everything is found. This makes searching for strings and patterns in
+    large bodies of text more efficient.
+    :param strings: A list of strings to check for
+    :param patterns: A list of regex patterns to check for
     :param text_lines: The lines of text that need to be searched.
-    :return: A tuple with a set of found strings, and a set of not found
-    strings
+    :return: A tuple with a set of found strings, and a set of found patterns.
     """
-
-    # Create two sets. By default all strings are not found.
     strings_to_check = set(strings)
     found_strings: Set[str] = set()
+    regex_to_match: Set[re.Pattern] = {re.compile(pattern)
+                                       for pattern in patterns}
+    found_regexes: Set[re.Pattern] = set()
 
     for line in text_lines:
         # Break the loop if all strings and patterns are found.
         # Checking the remaining sets for emptiness is cheap.
-        if found_strings == strings_to_check:
+        if not strings_to_check and not regex_to_match:
             break
 
         for string in strings_to_check:
             if string in line:
                 found_strings.add(string)
-        # Remove found strings for faster searching. This should be done
-        # outside of the loop above.
-        strings_to_check -= found_strings
-    return found_strings
-
-
-def check_regex_content(patterns: Iterable[str],
-                        text_lines: Iterable[str]) -> Set[str]:
-    """
-    Checks whether any of the patterns is present in the text lines
-    It only reads the lines once and it stops reading when
-    everything is found. This makes searching for patterns in large bodies of
-    text more efficient.
-    :param patterns: A list of regexes which is matched
-    :param text_lines: The lines of text that need to be searched.
-    :return: A tuple with a set of found regexes, and a set of not found
-    regexes
-    """
-
-    # Create two sets. By default all strings are not found.
-    regex_to_match = {re.compile(pattern) for pattern in patterns}
-    found_patterns: Set[str] = set()
-
-    for line in text_lines:
-        # Break the loop if all regexes have been matched
-        if not regex_to_match:
-            break
-
-        # Regexes we don't have to check anymore
-        to_remove = list()
         for regex in regex_to_match:
             if re.search(regex, line):
-                found_patterns.add(regex.pattern)
-                to_remove.append(regex)
+                found_regexes.add(regex)
 
-        # Remove found patterns for faster searching. This should be done
+        # Remove found strings and regexes for faster searching. This should
         # be done outside of the loops above.
-        for regex in to_remove:
-            regex_to_match.remove(regex)
-
-    return found_patterns
+        strings_to_check -= found_strings
+        regex_to_match -= found_regexes
+    return found_strings, {x.pattern for x in found_regexes}
 
 
 class ContentTestCollector(pytest.Collector):
@@ -145,13 +115,10 @@ def find_strings(self):
                      self.filepath.open)
         try:
             # Use 'rt' here explicitly as opposed to 'rb'
-            with file_open(mode='rt') as file_handler:  # type: ignore  # mypy goes crazy here otherwise  # noqa: E501
-                self.found_strings = check_content(
+            with file_open(mode='rt', encoding=self.content_test.encoding) \
+                    as file_handler:  # type: ignore  # mypy goes crazy here otherwise  # noqa: E501
+                self.found_strings, self.found_patterns = check_content(
                     strings=strings_to_check,
-                    text_lines=file_handler)
-            # Read the file again for the regex
-            with file_open(mode='rt') as file_handler:  # type: ignore  # mypy goes crazy here otherwise  # noqa: E501
-                self.found_patterns = check_regex_content(
                     patterns=patterns_to_check,
                     text_lines=file_handler)
         except FileNotFoundError:

diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py
@@ -113,11 +113,13 @@ class ContentTest(object):
     def __init__(self, contains: Optional[List[str]] = None,
                  must_not_contain: Optional[List[str]] = None,
                  contains_regex: Optional[List[str]] = None,
-                 must_not_contain_regex: Optional[List[str]] = None):
+                 must_not_contain_regex: Optional[List[str]] = None,
+                 encoding: Optional[str] = None):
         self.contains: List[str] = contains or []
         self.must_not_contain: List[str] = must_not_contain or []
         self.contains_regex: List[str] = contains_regex or []
         self.must_not_contain_regex: List[str] = must_not_contain_regex or []
+        self.encoding: Optional[str] = encoding
 
 
 class FileTest(ContentTest):
@@ -127,7 +129,8 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
                  contains: Optional[List[str]] = None,
                  must_not_contain: Optional[List[str]] = None,
                  contains_regex: Optional[List[str]] = None,
-                 must_not_contain_regex: Optional[List[str]] = None):
+                 must_not_contain_regex: Optional[List[str]] = None,
+                 encoding: Optional[str] = None):
         """
         A container object
         :param path: the path to the file
@@ -143,7 +146,8 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
         """
         super().__init__(contains=contains, must_not_contain=must_not_contain,
                          contains_regex=contains_regex,
-                         must_not_contain_regex=must_not_contain_regex)
+                         must_not_contain_regex=must_not_contain_regex,
+                         encoding=encoding)
         self.path = Path(path)
         self.md5sum = md5sum
         self.should_exist = should_exist

diff --git a/src/pytest_workflow/schema/schema.json b/src/pytest_workflow/schema/schema.json
@@ -67,6 +67,9 @@
             "items": {
               "type": "string"
             }
+          },
+          "encoding": {
+              "type": "string"
           }
         },
         "additionalProperties": false
@@ -97,6 +100,9 @@
             "items": {
               "type": "string"
             }
+          },
+          "encoding": {
+              "type": "string"
           }
         },
         "additionalProperties": false
@@ -140,6 +146,9 @@
               "items": {
                 "type": "string"
               }
+            },
+            "encoding": {
+              "type": "string"
             }
           },
           "required": [

diff --git a/tests/test_content_functions.py b/tests/test_content_functions.py
@@ -20,7 +20,7 @@
 
 import pytest
 
-from pytest_workflow.content_tests import check_content, check_regex_content
+from pytest_workflow.content_tests import check_content
 
 LICENSE = Path(__file__).parent / "content_files" / "LICENSE"
 LICENSE_ZIPPED = LICENSE.parent / "LICENSE.gz"
@@ -48,8 +48,8 @@
 def test_check_content_succeeding(contains_strings, does_not_contain_strings):
     all_strings = set(contains_strings).union(set(does_not_contain_strings))
     with LICENSE.open("rt") as license_h:
-        found_strings = check_content(list(all_strings),
-                                      license_h)
+        found_strings, _ = check_content(
+            list(all_strings), [], license_h)
     assert set(contains_strings) == found_strings
     assert set(does_not_contain_strings) == all_strings - found_strings
 
@@ -60,8 +60,8 @@ def test_check_regex_content_succeeding(contains_regex,
                                         does_not_contain_regex):
     all_regex = set(contains_regex).union(set(does_not_contain_regex))
     with LICENSE.open("rt") as license_h:
-        found_regex = check_regex_content(list(all_regex),
-                                          license_h)
+        _, found_regex = check_content(
+            [], list(all_regex), license_h)
     assert set(contains_regex) == found_regex
     assert set(does_not_contain_regex) == all_regex - found_regex
 
@@ -72,5 +72,5 @@ def test_multiple_finds_one_line():
         "the true meaning of its creed: \"We hold these truths to be",
         "self-evident: that all men are created equal.\""]
     contains = ["dream", "day", "nation", "creed", "truths"]
-    found_strings = check_content(contains, content)
+    found_strings, _ = check_content(contains, [], content)
     assert set(contains) == found_strings
diff --git a/tests/test_encodings.py b/tests/test_encodings.py
@@ -0,0 +1,44 @@
+# Copyright (C) 2018 Leiden University Medical Center
+# This file is part of pytest-workflow
+#
+# pytest-workflow is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# pytest-workflow is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/>.
+
+"""Check how different encodings are handled by pytest-workflow"""
+
+import textwrap
+TEST_TEXT = """In het Nederlands komen verscheidene diakritische tekens voor.
+Een veel gebruikte is het trema ofwel de dubbele puntjes die boven een letter
+geplaatst worden vaak wanneer een meervoud een onduidelijke klank zou hebben
+zonder de verduidelijking van het trema. Zoals bijvoorbeeld: zee -> zeeën,
+bacterie -> bacteriën.
+Daarnaast worden veel diakritische tekens gebruikt in leenworden. Denk hierbij
+aan woorden als: überhaupt, crème fraîche en Curaçao."""
+
+
+def test_encoding(pytester):
+    pytester.makefile(".yml", textwrap.dedent("""
+    - name: test_encoding
+      command: "bash -c 'true'"
+      files:
+        - path: diakritische_tekens.txt
+          encoding: UTF32
+          contains:
+            - überhaupt
+            - crème fraîche
+    """))
+    test_txt = pytester.path / "diakritische_tekens.txt"
+    # UTF-32 is not the default encoding on Windows or Linux, so this file
+    # can only be read correctly when the encoding key is honoured.
+    test_txt.write_text(TEST_TEXT, encoding="UTF32")
+    result = pytester.runpytest("-v")
+    assert result.ret == 0
diff --git a/tests/yamls/valid/dream_file.yaml b/tests/yamls/valid/dream_file.yaml
@@ -11,16 +11,19 @@
         - "blabla"
       must_not_contain:
         - "stuff"
+      encoding: UTF8
   stdout:
     contains:
       - "bla"
     must_not_contain:
       - "not_bla"
+    encoding: ASCII
   stderr:
     contains:
       - "bla"
     must_not_contain:
       - "not_bla"
+    encoding: UTF8
   exit_code: 127
   command: "the one string"
 - name: other test

diff --git a/tox.ini b/tox.ini
@@ -23,7 +23,7 @@ commands =
 # Documentation should build on python version 3
 [testenv:docs]
 deps=-r requirements-docs.txt
-whitelist_externals=bash
+allowlist_externals=bash
                     mkdir
                     rm
 commands=