✨ get_all_pruefis only looks through one file per pruefi
collect_pruefis.py now also collects the name of the file in which each pruefi is found. get_all_pruefis then only looks through that file, which speeds up the search for all pruefis.
hf-sheese committed Dec 22, 2023
1 parent ea847e7 commit fdaa729
Showing 4 changed files with 627 additions and 756 deletions.
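
The speed-up rests on all_known_pruefis.toml now mapping each pruefi to the .docx file it was found in, so that later only that one file has to be opened. A rough sketch of the idea (the exact TOML layout, the parser, and the file names here are assumptions, not taken from this commit):

import tomllib  # Python 3.11+; which TOML parser kohlrahbi actually uses is not shown in this diff

# Hypothetical excerpt of all_known_pruefis.toml after collect_pruefis.py has run.
sample_toml = """
[content.pruefidentifikatoren]
"11001" = "UTILMDAHB-informatorischeLesefassung.docx"
"13002" = "MSCONSAHB-informatorischeLesefassung.docx"
"17201" = ""  # pruefi is known, but no file has been recorded for it yet
"""

pruefis: dict[str, str] = tomllib.loads(sample_toml)["content"]["pruefidentifikatoren"]
assert pruefis["11001"].endswith(".docx")  # only this single file needs to be opened for 11001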
48 changes: 31 additions & 17 deletions src/kohlrahbi/__init__.py
@@ -27,22 +27,23 @@


# pylint:disable=anomalous-backslash-in-string
def get_valid_pruefis(list_of_pruefis: list[str], all_known_pruefis: Optional[list[str]] = None) -> list[str]:
def get_valid_pruefis(list_of_pruefis: dict[str, str], all_known_pruefis: Optional[list[str]] = None) -> dict[str, str]:
"""
This function returns a new list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
It also supports unix wildcards like '*' and '?' iff a list of known pruefis is given.
E.g. '11*' for all pruefis starting with '11' or '*01' for all pruefis ending with '01'.
"""
result: set[str] = set()
result: dict[str, str] = {}

for pruefi in list_of_pruefis:
if ("*" in pruefi or "?" in pruefi) and all_known_pruefis:
filtered_pruefis = fnmatch.filter(all_known_pruefis, pruefi)
result = result.union(filtered_pruefis)
for filtered_pruefi in filtered_pruefis:
result.update({filtered_pruefi: list_of_pruefis.get(filtered_pruefi, "")})
elif _pruefi_pattern.match(pruefi):
result.add(pruefi)
result.update({pruefi: list_of_pruefis.get(pruefi, "")})

return sorted(list(result))
return result


def check_python_version():
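
For illustration, a usage sketch of the reworked get_valid_pruefis (the pruefis and file name below are invented): wildcard keys are expanded against all_known_pruefis, and the expanded matches fall back to an empty file name because only exact keys carry one.

requested = {"11*": "", "13002": "MSCONSAHB-informatorischeLesefassung.docx"}
all_known = ["11001", "11042", "13002", "44001"]

valid = get_valid_pruefis(requested, all_known)
# fnmatch expands '11*' to the known pruefis 11001 and 11042; their file name is ""
# because the wildcard key itself carries none, while the exact key keeps its file name:
# {'11001': '', '11042': '', '13002': 'MSCONSAHB-informatorischeLesefassung.docx'}

Note that the result now keeps dict insertion order instead of being returned sorted.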
@@ -83,7 +84,7 @@ def check_output_path(path: Path) -> None:

def load_all_known_pruefis_from_file(
path_to_all_known_pruefis: Path = Path(__file__).parent / Path("all_known_pruefis.toml"),
) -> list[str]:
) -> dict[str, str]:
"""
Loads the file which contains all known Prüfidentifikatoren.
The file may be manually updated with the script `collect_pruefis.py`.
@@ -102,7 +103,7 @@ def load_all_known_pruefis_from_file(
click.secho(f"There is no 'content' section in the toml file: {path_to_all_known_pruefis}", fg="red")
raise click.Abort()

pruefis: list[str] = content_section.get("pruefidentifikatoren")
pruefis: dict[str, str] = content_section.get("pruefidentifikatoren")
return pruefis


@@ -206,7 +207,7 @@ def scrape_change_histories(input_path: Path, output_path: Path) -> None:
save_change_histories_to_excel(change_history_collection, output_path)


def load_pruefis_if_empty(pruefis: list[str]) -> list[str]:
def load_pruefis_if_empty(pruefis: dict[str, str]) -> dict[str, str]:
"""
If the user did not provide any pruefis we load all known pruefis from the toml file.
"""
@@ -226,7 +227,7 @@ def validate_file_type(file_type: str):
logger.warning(message)


def validate_pruefis(pruefis: list[str]) -> list[str]:
def validate_pruefis(pruefis: dict[str, str]) -> dict[str, str]:
"""
Validate the pruefis parameter.
"""
@@ -248,9 +249,14 @@ def process_pruefi(
):
"""
Process one pruefi.
If the input path ends with .docx, we assume that the file containing the pruefi is given.
Therefore we only access that file.
"""
ahb_file_finder = DocxFileFinder.from_input_path(input_path=input_path)
ahb_file_paths = ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(pruefi)
if not input_path.suffix == ".docx":
ahb_file_finder = DocxFileFinder.from_input_path(input_path=input_path)
ahb_file_paths = ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(pruefi)
else:
ahb_file_paths = [input_path]

if not ahb_file_paths:
logger.warning("No docx file was found for pruefi '%s'", pruefi)
@@ -259,11 +265,11 @@
for ahb_file_path in ahb_file_paths:
doc = get_or_cache_document(ahb_file_path, path_to_document_mapping)
if not doc:
continue
return

ahb_table = get_ahb_table(document=doc, pruefi=pruefi)
if not ahb_table:
continue
return

process_ahb_table(ahb_table, pruefi, output_path, file_type, collected_conditions)

@@ -306,7 +312,10 @@ def process_ahb_table(


def scrape_pruefis(
pruefis: list[str], input_path: Path, output_path: Path, file_type: Literal["flatahb", "csv", "xlsx", "conditions"]
pruefis: dict[str, str],
input_path: Path,
output_path: Path,
file_type: Literal["flatahb", "csv", "xlsx", "conditions"],
) -> None:
"""
starts the scraping process for provided pruefis
@@ -318,10 +327,15 @@
path_to_document_mapping: dict[Path, docx.Document] = {}
collected_conditions: Optional[dict[EdifactFormat, dict[str, str]]] = {} if "conditions" in file_type else None

for pruefi in valid_pruefis:
for pruefi, filename in valid_pruefis.items():
try:
logger.info("start looking for pruefi '%s'", pruefi)
old_path = input_path
input_path = input_path.joinpath(
Path(filename)
) # To add the name of the file containing the pruefi, if known
process_pruefi(pruefi, input_path, output_path, file_type, path_to_document_mapping, collected_conditions)
input_path = old_path # Otherwise the path grows when multiple pruefis are processed
# sorry for the pokemon catch
except Exception as e: # pylint: disable=broad-except
logger.exception("Error processing pruefi '%s': %s", pruefi, str(e))
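
The joinpath/restore pair above only serves to keep input_path pointing at the original directory between iterations. A minimal sketch of the same per-pruefi path resolution without mutating input_path (assuming process_pruefi behaves as in this diff) might look like:

for pruefi, filename in valid_pruefis.items():
    logger.info("start looking for pruefi '%s'", pruefi)
    # Use the single recorded file if one is known; otherwise process_pruefi falls
    # back to searching the whole input directory via DocxFileFinder.
    pruefi_path = input_path / filename if filename else input_path
    process_pruefi(pruefi, pruefi_path, output_path, file_type, path_to_document_mapping, collected_conditions)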
@@ -394,11 +408,11 @@ def main(
else:
output_path.mkdir(parents=True)
click.secho(f"I created a new directory at {output_path}", fg="yellow")

pruefis_dict = {key: "" for key in pruefis} # We use dict to be able to also store filenames
match flavour:
case "pruefi":
scrape_pruefis(
pruefis=pruefis,
pruefis=pruefis_dict,
input_path=input_path,
output_path=output_path,
file_type=file_type,