Memorize which File contains which Pruefi Tables (#209)

* 🚧Update collect_pruefis.py to also collect filenames * ✨get_all_pruefis only looks through one file per pruefi: collect_pruefis.py collects the name of the file in which each pruefi is found. get_all_pruefis then only looks through that file, which speeds up the search for all pruefis. * Use "None" if no file path to a pruefi is given * Replace Union[str, None] with str | None * 🐛Remove bug when dict has wildcards as keys. Changed tests to detect those bugs in the future * 📝Update Readme and comments * ✅Add test for pruefi: None as input * Update method naming and comments * Only send pruefis to validation, reattach filename afterwards * Rename content.pruefidentifikatoren in toml to pruefidentifikatoren
Hochfrequenz · Jan 19, 2024 · e9b66d4 · e9b66d4
1 parent 642a2bc
commit e9b66d4
Show file tree

Hide file tree

Showing 5 changed files with 584 additions and 522 deletions.
diff --git a/README.md b/README.md
@@ -96,21 +96,21 @@ The easiest way to be compliant with this naming schema is to clone our [edi_ene
 If you want to extract a specific prüfidentifikator, you can run the following command.
 
 ```bash
-kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 11039 --file-type xslx
+kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 13002 --file-type xlsx
 ```
 
 You can also provide multiple prüfidentifikatoren.
 
 ```bash
-kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 11039 --pruefis 11040 --pruefi 11041 --file-type csv
+kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 13002 --pruefis 13003 --pruefis 13005 --file-type csv
 ```
 ### Results
 There is a kohlrahbi based CI pipeline from the edi_energy_mirror mentioned above to the repository [machine-readable_anwendungshandbuecher](https://github.com/Hochfrequenz/machine-readable_anwendungshandbuecher) where you can find scraped AHBs as JSON, CSV or Excel files.
 
 ### Export ConditionKeys and ConditionTexts
 For example, to export condition.json files to [edi_energy_ahb_conditions_and_packages](https://github.com/Hochfrequenz/edi_energy_ahb_conditions_and_packages). This works best if no "Prüfidentifikatoren" flags (--pruefis) are given. In this case all known "Prüfidentifikatoren" are scanned, so all related conditions are gathered.
 ```bash
-kohlrahbi --file-type conditions --input_path "Path\to\edi_energy_mirror\edi_energy_de\current" --output_path "Path\to\edi_energy_ahb_conditions_and_packages\aktuelleFV"
+kohlrahbi --file-type conditions --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/edi_energy_ahb_conditions_and_packages/aktuelleFV
 ```
 
 ## Workflow

diff --git a/src/kohlrahbi/__init__.py b/src/kohlrahbi/__init__.py
@@ -29,8 +29,8 @@
 # pylint:disable=anomalous-backslash-in-string
 def get_valid_pruefis(list_of_pruefis: list[str], all_known_pruefis: Optional[list[str]] = None) -> list[str]:
     """
-    This function returns a new list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
-    It also supports unix wildcards like '*' and '?' iff a list of known pruefis is given.
+    This function returns a list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
+    It also supports unix wildcards like '*' and '?' if a list of known pruefis is given.
     E.g. '11*' for all pruefis starting with '11' or '*01' for all pruefis ending with '01'.
     """
     result: set[str] = set()
@@ -83,7 +83,7 @@ def check_output_path(path: Path) -> None:
 
 def load_all_known_pruefis_from_file(
     path_to_all_known_pruefis: Path = Path(__file__).parent / Path("all_known_pruefis.toml"),
-) -> list[str]:
+) -> dict[str, str | None]:
     """
     Loads the file which contains all known Prüfidentifikatoren.
     The file may be manually updated with the script `collect_pruefis.py`.
@@ -93,17 +93,18 @@ def load_all_known_pruefis_from_file(
         state_of_kohlrahbi: dict[str, Any] = tomlkit.load(file)
 
     meta_data_section = state_of_kohlrahbi.get("meta_data")
-    content_section = state_of_kohlrahbi.get("content")
+    pruefi_to_file_mapping: dict[str, str | None] | None = state_of_kohlrahbi.get("pruefidentifikatoren", None)
 
     if meta_data_section is None:
         click.secho(f"There is no 'meta_data' section in the provided toml file: {path_to_all_known_pruefis}", fg="red")
         raise click.Abort()
-    if content_section is None:
-        click.secho(f"There is no 'content' section in the toml file: {path_to_all_known_pruefis}", fg="red")
+    if pruefi_to_file_mapping is None:
+        click.secho(
+            f"There is no 'pruefidentifikatoren' section in the toml file: {path_to_all_known_pruefis}", fg="red"
+        )
         raise click.Abort()
 
-    pruefis: list[str] = content_section.get("pruefidentifikatoren")
-    return pruefis
+    return pruefi_to_file_mapping
 
 
 def create_sheet_name(filename: str) -> str:
@@ -206,14 +207,15 @@ def scrape_change_histories(input_path: Path, output_path: Path) -> None:
     save_change_histories_to_excel(change_history_collection, output_path)
 
 
-def load_pruefis_if_empty(pruefis: list[str]) -> list[str]:
+def load_pruefis_if_empty(pruefi_to_file_mapping: dict[str, str | None]) -> dict[str, str | None]:
     """
-    If the user did not provide any pruefis we load all known pruefis from the toml file.
+    If the user did not provide any pruefis we load all known pruefis
+    and the paths to the file containing them from the toml file.
     """
-    if not pruefis:
+    if not pruefi_to_file_mapping:
         click.secho("☝️ No pruefis were given. I will parse all known pruefis.", fg="yellow")
         return load_all_known_pruefis_from_file()
-    return pruefis
+    return pruefi_to_file_mapping
 
 
 def validate_file_type(file_type: str):
@@ -228,7 +230,7 @@ def validate_file_type(file_type: str):
 
 def validate_pruefis(pruefis: list[str]) -> list[str]:
     """
-    Validate the pruefis parameter.
+    Validate the given list of pruefis.
     """
     valid_pruefis = get_valid_pruefis(pruefis)
     if not valid_pruefis:
@@ -248,9 +250,14 @@ def process_pruefi(
 ):
     """
     Process one pruefi.
+    If the input path ends with .docx, we assume that the file containing the pruefi is given.
+    Therefore we only access that file.
     """
-    ahb_file_finder = DocxFileFinder.from_input_path(input_path=input_path)
-    ahb_file_paths = ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(pruefi)
+    if not input_path.suffix == ".docx":
+        ahb_file_finder = DocxFileFinder.from_input_path(input_path=input_path)
+        ahb_file_paths = ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(pruefi)
+    else:
+        ahb_file_paths = [input_path]
 
     if not ahb_file_paths:
         logger.warning("No docx file was found for pruefi '%s'", pruefi)
@@ -259,11 +266,11 @@ def process_pruefi(
     for ahb_file_path in ahb_file_paths:
         doc = get_or_cache_document(ahb_file_path, path_to_document_mapping)
         if not doc:
-            continue
+            return
 
         ahb_table = get_ahb_table(document=doc, pruefi=pruefi)
         if not ahb_table:
-            continue
+            return
 
         process_ahb_table(ahb_table, pruefi, output_path, file_type, collected_conditions)
 
@@ -306,21 +313,31 @@ def process_ahb_table(
 
 
 def scrape_pruefis(
-    pruefis: list[str], input_path: Path, output_path: Path, file_type: Literal["flatahb", "csv", "xlsx", "conditions"]
+    pruefi_to_file_mapping: dict[str, str | None],
+    basic_input_path: Path,
+    output_path: Path,
+    file_type: Literal["flatahb", "csv", "xlsx", "conditions"],
 ) -> None:
     """
-    starts the scraping process for provided pruefis
+    starts the scraping process for provided pruefi_to_file_mappings
     """
-    pruefis = load_pruefis_if_empty(pruefis)
+    pruefi_to_file_mapping = load_pruefis_if_empty(pruefi_to_file_mapping)
     validate_file_type(file_type)
 
-    valid_pruefis = validate_pruefis(pruefis)
+    valid_pruefis = validate_pruefis(list(pruefi_to_file_mapping.keys()))
+    valid_pruefi_to_file_mappings: dict[str, str | None] = {}
+    for pruefi in valid_pruefis:
+        valid_pruefi_to_file_mappings.update({pruefi: pruefi_to_file_mapping.get(pruefi, None)})
     path_to_document_mapping: dict[Path, docx.Document] = {}
     collected_conditions: Optional[dict[EdifactFormat, dict[str, str]]] = {} if "conditions" in file_type else None
 
-    for pruefi in valid_pruefis:
+    for pruefi, filename in valid_pruefi_to_file_mappings.items():
         try:
             logger.info("start looking for pruefi '%s'", pruefi)
+            input_path = basic_input_path  # To prevent multiple adding of filenames
+            # that would happen if filenames are added but never removed
+            if filename is not None:
+                input_path = basic_input_path / Path(filename)
             process_pruefi(pruefi, input_path, output_path, file_type, path_to_document_mapping, collected_conditions)
         # sorry for the pokemon catch
         except Exception as e:  # pylint: disable=broad-except
@@ -394,12 +411,14 @@ def main(
         else:
             output_path.mkdir(parents=True)
             click.secho(f"I created a new directory at {output_path}", fg="yellow")
-
+    pruefi_to_file_mapping: dict[str, str | None] = {
+        key: None for key in pruefis
+    }  # A mapping of a pruefi (key) to the name (+ path) of the file containing the pruefi
     match flavour:
         case "pruefi":
             scrape_pruefis(
-                pruefis=pruefis,
-                input_path=input_path,
+                pruefi_to_file_mapping=pruefi_to_file_mapping,
+                basic_input_path=input_path,
                 output_path=output_path,
                 file_type=file_type,
             )