Skip to content

Commit

Permalink
Merge pull request #163 from LUMC/issue160
Browse files Browse the repository at this point in the history
Add an encoding key to the YAML. Only open and read files once at maximum.
  • Loading branch information
rhpvorderman committed Dec 21, 2022
2 parents ef7f312 + 75de066 commit 714a0aa
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 64 deletions.
4 changes: 4 additions & 0 deletions HISTORY.rst
Expand Up @@ -12,6 +12,10 @@ version 2.0.0-dev
+ Python 3.6 is no longer supported. It has been removed from github actions,
as such we can no longer guarantee that pytest-workflow works properly
with python 3.6.
+ Added an optional encoding key for files, stdout and stderr so the file can
be opened with the proper encoding.
+ Make content tests more efficient by reading each file only once instead of
twice when there are both strings and regexes to check for.
+ When the ``--git-aware`` flag is used a submodule check is performed in order
to assert that all submodules are properly checked out. This prevents
unclear copying errors.
Expand Down
3 changes: 3 additions & 0 deletions README.rst
Expand Up @@ -103,6 +103,7 @@ predefined tests as well as custom tests are possible.
must_not_contain: # A list of strings that should NOT be in the file (optional)
- "Cock a doodle doo"
md5sum: e583af1f8b00b53cda87ae9ead880224 # Md5sum of the file (optional)
encoding: UTF-8 # Encoding for the text file (optional). Defaults to system locale.
- name: simple echo # A second workflow. Notice the starting `-` which means
command: "echo moo" # that workflow items are in a list. You can add as many workflows as you want
Expand All @@ -114,6 +115,7 @@ predefined tests as well as custom tests are possible.
- "moo"
must_not_contain: # List of strings that should NOT be in stdout (optional)
- "Cock a doodle doo"
encoding: ASCII # Encoding for stdout (optional). Defaults to system locale.
- name: mission impossible # Also failing workflows can be tested
tags: # A list of tags that can be used to select which test
Expand All @@ -130,6 +132,7 @@ predefined tests as well as custom tests are possible.
- "BSOD error, please contact the IT crowd"
must_not_contain: # A list of strings which should NOT be in stderr (optional)
- "Mission accomplished!"
encoding: UTF-16 # Encoding for stderr (optional). Defaults to system locale.
- name: regex tests
command: echo Hello, world
Expand Down
5 changes: 4 additions & 1 deletion docs/writing_tests.rst
Expand Up @@ -40,6 +40,7 @@ Test options
must_not_contain: # A list of strings that should NOT be in the file (optional)
- "Cock a doodle doo"
md5sum: e583af1f8b00b53cda87ae9ead880224 # Md5sum of the file (optional)
encoding: UTF-8 # Encoding for the text file (optional). Defaults to system locale.
- name: simple echo # A second workflow. Notice the starting `-` which means
command: "echo moo" # that workflow items are in a list. You can add as many workflows as you want
Expand All @@ -51,6 +52,7 @@ Test options
- "moo"
must_not_contain: # List of strings that should NOT be in stdout (optional)
- "Cock a doodle doo"
encoding: ASCII # Encoding for stdout (optional). Defaults to system locale.
- name: mission impossible # Also failing workflows can be tested
tags: # A list of tags that can be used to select which test
Expand All @@ -60,13 +62,14 @@ Test options
files:
- path: "fail.log" # Multiple files can be tested for each workflow
- path: "TomCruise.txt.gz" # Gzipped files can also be searched, provided their extension is '.gz'
contains:
contains:
- "starring"
stderr: # Options for testing stderr (optional)
contains: # A list of strings which should be in stderr (optional)
- "BSOD error, please contact the IT crowd"
must_not_contain: # A list of strings which should NOT be in stderr (optional)
- "Mission accomplished!"
encoding: UTF-16 # Encoding for stderr (optional). Defaults to system locale.
- name: regex tests
command: echo Hello, world
Expand Down
73 changes: 20 additions & 53 deletions src/pytest_workflow/content_tests.py
Expand Up @@ -33,72 +33,42 @@


def check_content(strings: Iterable[str],
text_lines: Iterable[str]) -> Set[str]:
patterns: Iterable[str],
text_lines: Iterable[str]):
"""
Checks whether any of the strings is present in the text lines
Checks whether any of the strings or patterns is present in the text lines
It only reads the lines once and it stops reading when
everything is found. This makes searching for strings in large bodies of
text more efficient.
:param strings: A list of strings for which the present is checked
everything is found. This makes searching for strings and patterns in
large bodies of text more efficient.
:param strings: A list of strings to check for
:param patterns: A list of regex patterns to check for
:param text_lines: The lines of text that need to be searched.
:return: A tuple with a set of found strings, and a set of not found
strings
:return: A tuple with a set of found strings, and a set of found patterns.
"""

# Create two sets. By default all strings are not found.
strings_to_check = set(strings)
found_strings: Set[str] = set()
regex_to_match: Set[re.Pattern] = {re.compile(pattern)
for pattern in patterns}
found_regexes: Set[re.Pattern] = set()

for line in text_lines:
# Break the loop if all strings are found
# Python implements fast set equality checking by checking length first
if found_strings == strings_to_check:
if not strings_to_check and not regex_to_match:
break

for string in strings_to_check:
if string in line:
found_strings.add(string)
# Remove found strings for faster searching. This should be done
# outside of the loop above.
strings_to_check -= found_strings
return found_strings


def check_regex_content(patterns: Iterable[str],
text_lines: Iterable[str]) -> Set[str]:
"""
Checks whether any of the patterns is present in the text lines
It only reads the lines once and it stops reading when
everything is found. This makes searching for patterns in large bodies of
text more efficient.
:param patterns: A list of regexes which is matched
:param text_lines: The lines of text that need to be searched.
:return: A tuple with a set of found regexes, and a set of not found
regexes
"""

# Create two sets. By default all strings are not found.
regex_to_match = {re.compile(pattern) for pattern in patterns}
found_patterns: Set[str] = set()

for line in text_lines:
# Break the loop if all regexes have been matched
if not regex_to_match:
break

# Regexes we don't have to check anymore
to_remove = list()
for regex in regex_to_match:
if re.search(regex, line):
found_patterns.add(regex.pattern)
to_remove.append(regex)
found_regexes.add(regex)

# Remove found patterns for faster searching. This should be done
# Remove found strings for faster searching. This should be done
# outside of the loop above.
for regex in to_remove:
regex_to_match.remove(regex)

return found_patterns
strings_to_check -= found_strings
regex_to_match -= found_regexes
return found_strings, {x.pattern for x in found_regexes}


class ContentTestCollector(pytest.Collector):
Expand Down Expand Up @@ -145,13 +115,10 @@ def find_strings(self):
self.filepath.open)
try:
# Use 'rt' here explicitly as opposed to 'rb'
with file_open(mode='rt') as file_handler: # type: ignore # mypy goes crazy here otherwise # noqa: E501
self.found_strings = check_content(
with file_open(mode='rt', encoding=self.content_test.encoding) \
as file_handler: # type: ignore # mypy goes crazy here otherwise # noqa: E501
self.found_strings, self.found_patterns = check_content(
strings=strings_to_check,
text_lines=file_handler)
# Read the file again for the regex
with file_open(mode='rt') as file_handler: # type: ignore # mypy goes crazy here otherwise # noqa: E501
self.found_patterns = check_regex_content(
patterns=patterns_to_check,
text_lines=file_handler)
except FileNotFoundError:
Expand Down
10 changes: 7 additions & 3 deletions src/pytest_workflow/schema.py
Expand Up @@ -113,11 +113,13 @@ class ContentTest(object):
def __init__(self, contains: Optional[List[str]] = None,
must_not_contain: Optional[List[str]] = None,
contains_regex: Optional[List[str]] = None,
must_not_contain_regex: Optional[List[str]] = None):
must_not_contain_regex: Optional[List[str]] = None,
encoding: Optional[str] = None):
self.contains: List[str] = contains or []
self.must_not_contain: List[str] = must_not_contain or []
self.contains_regex: List[str] = contains_regex or []
self.must_not_contain_regex: List[str] = must_not_contain_regex or []
self.encoding: Optional[str] = encoding


class FileTest(ContentTest):
Expand All @@ -127,7 +129,8 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
contains: Optional[List[str]] = None,
must_not_contain: Optional[List[str]] = None,
contains_regex: Optional[List[str]] = None,
must_not_contain_regex: Optional[List[str]] = None):
must_not_contain_regex: Optional[List[str]] = None,
encoding: Optional[str] = None):
"""
A container object
:param path: the path to the file
Expand All @@ -143,7 +146,8 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
"""
super().__init__(contains=contains, must_not_contain=must_not_contain,
contains_regex=contains_regex,
must_not_contain_regex=must_not_contain_regex)
must_not_contain_regex=must_not_contain_regex,
encoding=encoding)
self.path = Path(path)
self.md5sum = md5sum
self.should_exist = should_exist
Expand Down
9 changes: 9 additions & 0 deletions src/pytest_workflow/schema/schema.json
Expand Up @@ -67,6 +67,9 @@
"items": {
"type": "string"
}
},
"encoding": {
"type": "string"
}
},
"additionalProperties": false
Expand Down Expand Up @@ -97,6 +100,9 @@
"items": {
"type": "string"
}
},
"encoding": {
"type": "string"
}
},
"additionalProperties": false
Expand Down Expand Up @@ -140,6 +146,9 @@
"items": {
"type": "string"
}
},
"encoding": {
"type": "string"
}
},
"required": [
Expand Down
12 changes: 6 additions & 6 deletions tests/test_content_functions.py
Expand Up @@ -20,7 +20,7 @@

import pytest

from pytest_workflow.content_tests import check_content, check_regex_content
from pytest_workflow.content_tests import check_content

LICENSE = Path(__file__).parent / "content_files" / "LICENSE"
LICENSE_ZIPPED = LICENSE.parent / "LICENSE.gz"
Expand Down Expand Up @@ -48,8 +48,8 @@
def test_check_content_succeeding(contains_strings, does_not_contain_strings):
all_strings = set(contains_strings).union(set(does_not_contain_strings))
with LICENSE.open("rt") as license_h:
found_strings = check_content(list(all_strings),
license_h)
found_strings, _ = check_content(
list(all_strings), [], license_h)
assert set(contains_strings) == found_strings
assert set(does_not_contain_strings) == all_strings - found_strings

Expand All @@ -60,8 +60,8 @@ def test_check_regex_content_succeeding(contains_regex,
does_not_contain_regex):
all_regex = set(contains_regex).union(set(does_not_contain_regex))
with LICENSE.open("rt") as license_h:
found_regex = check_regex_content(list(all_regex),
license_h)
_, found_regex = check_content(
[], list(all_regex), license_h)
assert set(contains_regex) == found_regex
assert set(does_not_contain_regex) == all_regex - found_regex

Expand All @@ -72,5 +72,5 @@ def test_multiple_finds_one_line():
"the true meaning of its creed: \"We hold these truths to be",
"self-evident: that all men are created equal.\""]
contains = ["dream", "day", "nation", "creed", "truths"]
found_strings = check_content(contains, content)
found_strings, _ = check_content(contains, [], content)
assert set(contains) == found_strings
44 changes: 44 additions & 0 deletions tests/test_encodings.py
@@ -0,0 +1,44 @@
# Copyright (C) 2018 Leiden University Medical Center
# This file is part of pytest-workflow
#
# pytest-workflow is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# pytest-workflow is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/>.

"""Check how different encodings are handled by pytest-workflow"""

import textwrap
TEST_TEXT = """In het Nederlands komen verscheidene diakritische tekens voor.
Een veel gebruikte is het trema ofwel de dubbele puntjes die boven een letter
geplaatst worden vaak wanneer een meervoud een onduidelijke klank zou hebben
zonder de verduidelijking van het trema. Zoals bijvoorbeeld: zee -> zeeën,
bacterie -> bacteriën.
Daarnaast worden veel diakritische tekens gebruikt in leenworden. Denk hierbij
aan woorden als: überhaupt, crème fraîche en Curaçao."""


# End-to-end check that the ``encoding`` key of a file test is honoured.
# A workflow YAML is generated that declares ``encoding: UTF32`` for
# ``diakritische_tekens.txt`` and requires two strings with diacritics to be
# present; the file itself is then written in UTF32 and pytest is run on it.
# :param pytester: pytest's ``pytester`` fixture, used to create the files and
#                  to run pytest.
def test_encoding(pytester):
# Create the workflow definition (a .yml file) for the pytest run below.
pytester.makefile(".yml", textwrap.dedent("""
- name: test_encoding
command: "bash -c 'true'"
files:
- path: diakritische_tekens.txt
encoding: UTF32
contains:
- überhaupt
- crème fraîche
"""))
# The command itself ("true") produces no output, so the file that the
# workflow inspects is written here by the test.
test_txt = pytester.path / "diakritische_tekens.txt"
# UTF32 is presumably not the default locale encoding on Windows or Linux
# (TODO confirm), so the content checks should only succeed when the
# ``encoding`` key from the YAML is actually used to open the file.
test_txt.write_text(TEST_TEXT, encoding="UTF32")
result = pytester.runpytest("-v")
# A return code of 0 means the pytest run reported no failures.
assert result.ret == 0
3 changes: 3 additions & 0 deletions tests/yamls/valid/dream_file.yaml
Expand Up @@ -11,16 +11,19 @@
- "blabla"
must_not_contain:
- "stuff"
encoding: UTF8
stdout:
contains:
- "bla"
must_not_contain:
- "not_bla"
encoding: ASCII
stderr:
contains:
- "bla"
must_not_contain:
- "not_bla"
encoding: UTF8
exit_code: 127
command: "the one string"
- name: other test
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Expand Up @@ -23,7 +23,7 @@ commands =
# Documentation should build on python version 3
[testenv:docs]
deps=-r requirements-docs.txt
whitelist_externals=bash
allowlist_externals=bash
mkdir
rm
commands=
Expand Down

0 comments on commit 714a0aa

Please sign in to comment.