Skip to content

Commit

Permalink
Extract AHB conditions.json (#186)
Browse files Browse the repository at this point in the history
* implemented export for conditions.json files
---------

Co-authored-by: kevin <68426071+hf-krechan@users.noreply.github.com>
  • Loading branch information
DeltaDaniel and hf-krechan committed Oct 31, 2023
1 parent 48e9586 commit cdfeb02
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 1 deletion.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path
### Results
There is a kohlrahbi based CI pipeline from the edi_energy_mirror mentioned above to the repository [machine-readable_anwendungshandbuecher](https://github.com/Hochfrequenz/machine-readable_anwendungshandbuecher) where you can find scraped AHBs as JSON, CSV or Excel files.

### Export ConditionKeys and ConditionTexts
Use the `conditions` file type to export `conditions.json` files, for example to [edi_energy_ahb_conditions_and_packages](https://github.com/Hochfrequenz/edi_energy_ahb_conditions_and_packages). This works best if no "Prüfidentifikatoren" are passed via the `--pruefis` flag: in that case all known "Prüfidentifikatoren" are scanned, so all related conditions are gathered.
```bash
kohlrahbi --file-type conditions --input_path "Path\to\edi_energy_mirror\edi_energy_de\current" --output_path "Path\to\edi_energy_ahb_conditions_and_packages\aktuelleFV"
```

## Workflow

```mermaid
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ attrs==23.1.0
# maus
click==8.1.7
# via -r requirements.in
colorama==0.4.6
# via
# click
# colorlog
colorlog==6.7.0
# via -r requirements.in
et-xmlfile==1.1.0
Expand Down
47 changes: 46 additions & 1 deletion src/kohlrahbi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
"""
import fnmatch
import gc
import json
import re
import sys
from pathlib import Path
from typing import Any, Optional

import click
import docx # type:ignore[import]
import pandas as pd
import tomlkit
from maus.edifact import EdifactFormat

from kohlrahbi.ahb.ahbtable import AhbTable
from kohlrahbi.ahbfilefinder import AhbFileFinder
Expand Down Expand Up @@ -127,7 +130,7 @@ def load_all_known_pruefis_from_file(
)
@click.option(
"--file-type",
type=click.Choice(["flatahb", "csv", "xlsx"], case_sensitive=False),
type=click.Choice(["flatahb", "csv", "xlsx", "conditions"], case_sensitive=False),
multiple=True,
)
@click.option(
Expand Down Expand Up @@ -170,6 +173,10 @@ def main(pruefis: list[str], input_path: Path, output_path: Path, file_type: lis
click.secho(f"I will continue with the following valid pruefis: {valid_pruefis}.", fg="yellow")
path_to_document_mapping: dict[Path, docx.Document] = {}

if "conditions" in file_type:
# mapping of EdifactFormat to ConditionKeyConditionTextMapping for all given prufis
collected_conditions: dict[EdifactFormat, dict[str, str]] = {}

for pruefi in valid_pruefis:
try:
logger.info("start looking for pruefi '%s'", pruefi)
Expand Down Expand Up @@ -216,6 +223,11 @@ def main(pruefis: list[str], input_path: Path, output_path: Path, file_type: lis
if "csv" in file_type:
logger.info("💾 Saving csv file %s", pruefi)
unfolded_ahb.dump_csv(path_to_output_directory=output_path)

if "conditions" in file_type:
logger.info("🧺 Collecting conditions file %s", pruefi)
unfolded_ahb.collect_condition(already_known_conditions=collected_conditions)

break
except Exception as general_error: # pylint:disable=broad-except
logger.exception(
Expand All @@ -231,6 +243,39 @@ def main(pruefis: list[str], input_path: Path, output_path: Path, file_type: lis
del unfolded_ahb
gc.collect()

if "conditions" in file_type:
# store conditions in conditions.json files
dump_conditions_json(output_directory_path=output_path, already_known_conditions=collected_conditions)


def dump_conditions_json(output_directory_path: Path, already_known_conditions: dict) -> None:
    """
    Dump the collected conditions as one JSON file per EDIFACT format.

    For every format key of ``already_known_conditions`` the file
    'output_directory_path/<edifact_format>/conditions.json' is written; missing
    directories are created on the way. Each file contains a list of objects with
    the keys ``condition_key``, ``condition_text`` and ``edifact_format``, ordered
    by the integer value of the condition key.
    """
    for edifact_format, key_to_text in already_known_conditions.items():
        target_directory = output_directory_path / str(edifact_format)
        target_directory.mkdir(parents=True, exist_ok=True)
        file_path = target_directory / "conditions.json"

        # The keys are sorted with key=int so that e.g. "10" comes after "9"
        # instead of the lexicographic order "1", "10", "2", ...
        entries = []
        for condition_key in sorted(key_to_text, key=int):
            entries.append(
                {
                    "condition_key": condition_key,
                    "condition_text": key_to_text[condition_key],
                    "edifact_format": edifact_format,
                }
            )

        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(entries, file, ensure_ascii=False, indent=2)

        logger.info(
            "The conditions.json file for %s is saved at %s",
            edifact_format,
            file_path,
        )


if __name__ == "__main__":
# the parameter arguments gets provided over the CLI
Expand Down
38 changes: 38 additions & 0 deletions src/kohlrahbi/unfoldedahb/unfoldedahbtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,3 +447,41 @@ def dump_xlsx(self, path_to_output_directory: Path) -> None:
self.meta_data.pruefidentifikator,
xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.json",
)

def collect_condition(self, already_known_conditions: dict) -> None:
    """
    Merge the conditions of this UnfoldedAhb into ``already_known_conditions``.

    ``already_known_conditions`` is updated in place. It is keyed by the EDIFACT format
    derived from the pruefidentifikator; each value is a dict that maps a condition key
    (the digits inside the square brackets) to the condition text.
    A text is stored if its key is not known yet or if it is longer than the text
    stored so far. Whitespace runs inside a text are collapsed to a single space.
    If no format can be derived from the pruefidentifikator, a warning is logged
    and nothing is collected.
    """
    df = self.convert_to_dataframe()

    pruefidentifikator = self.meta_data.pruefidentifikator
    edifact_format = get_format_of_pruefidentifikator(pruefidentifikator)
    if edifact_format is None:
        logger.warning("'%s' is not a pruefidentifikator", pruefidentifikator)
        return

    conditions_of_format = already_known_conditions.get(edifact_format)
    if conditions_of_format is None:
        conditions_of_format = {}
        already_known_conditions[edifact_format] = conditions_of_format

    bedingung_column = df["Bedingung"]
    has_condition_text = bedingung_column != ""
    if has_condition_text.any():
        for conditions_text in bedingung_column[has_condition_text]:
            # Each match is a pair: the number in brackets and the text that follows it
            # up to the next "[<number>]" or the end of the cell.
            key_text_pairs = re.findall(
                r"\[(\d+)](.*?)(?=\[\d+]|$)",
                conditions_text,
                re.DOTALL,
            )
            for condition_key, raw_text in key_text_pairs:
                # make text prettier: trim it and squeeze line breaks / repeated blanks
                text = re.sub(r"\s+", " ", raw_text.strip())
                stored_text = conditions_of_format.get(condition_key)
                # a longer text replaces the one collected earlier for the same key
                if stored_text is None or len(text) > len(stored_text):
                    conditions_of_format[condition_key] = text

    logger.info("The conditions for %s were collected", pruefidentifikator)
    del df
149 changes: 149 additions & 0 deletions unittests/test_unfolded_ahb_table.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from maus.edifact import EdifactFormat
from maus.models.anwendungshandbuch import AhbLine, AhbMetaInformation, FlatAnwendungshandbuch

from kohlrahbi.unfoldedahb import UnfoldedAhbTableMetaData
Expand Down Expand Up @@ -173,3 +174,151 @@ def test_convert_to_flat_ahb(self) -> None:

def test_convert_to_dataframe(self):
    # Placeholder: the body contains no assertions yet, so this test always passes.
    pass

def test_collect_condition(self) -> None:
    """
    collect_condition gathers all numbered conditions of an UnfoldedAhb.

    Covered cases: several conditions in one cell, texts spanning multiple lines,
    superfluous whitespace, a key that appears twice (the longer text is expected),
    a bracket without a number and an empty condition cell.
    """
    meta_data = UnfoldedAhbTableMetaData(pruefidentifikator="44014")

    # (bedingung, beschreibung) per line; the position in the list is used as line index
    line_inputs = [
        ("[1] Normale Bedingung", None),
        ("[2] 2. normale Bedingung\n[3] gefolgt von einer zweiten [4] und dritten", None),
        ("[1] Normale Bedingung mit längerem Text", None),
        ("[5] Länger Bedingung \n über mehrere\n Zeilen\n", None),
        ("[6] Länger Bedingung \n über mehrere\n Zeilen\n[7] gefolgt von noch einer\n über\n Zeilen", None),
        ("[8] Länger Bedi ngung \n mit \n zu viel White space\n\n[9] gefolgt \n über\n Zeilen", None),
        ("[keine Zahl] mit einem Text", None),
        ("", "keine Bedingung"),
    ]
    unfolded_ahb_lines = [
        UnfoldedAhbLine(
            index=index,
            segment_name="segment_name",
            segment_gruppe=None,
            segment=None,
            datenelement=None,
            code=None,
            qualifier=None,
            beschreibung=beschreibung,
            bedingung_ausdruck=None,
            bedingung=bedingung,
        )
        for index, (bedingung, beschreibung) in enumerate(line_inputs)
    ]

    unfolded_ahb = UnfoldedAhb(meta_data=meta_data, unfolded_ahb_lines=unfolded_ahb_lines)
    collected_conditions: dict[EdifactFormat, dict[str, str]] = {}
    unfolded_ahb.collect_condition(already_known_conditions=collected_conditions)

    expected_texts = {
        "1": "Normale Bedingung mit längerem Text",
        "2": "2. normale Bedingung",
        "3": "gefolgt von einer zweiten",
        "4": "und dritten",
        "5": "Länger Bedingung über mehrere Zeilen",
        "6": "Länger Bedingung über mehrere Zeilen",
        "7": "gefolgt von noch einer über Zeilen",
        "8": "Länger Bedi ngung mit zu viel White space",
        "9": "gefolgt über Zeilen",
    }

    assert len(collected_conditions) == 1
    assert EdifactFormat.UTILMD in collected_conditions
    utilmd_conditions = collected_conditions[EdifactFormat.UTILMD]
    assert len(utilmd_conditions) == 9
    for condition_key, condition_text in expected_texts.items():
        assert condition_key in utilmd_conditions and utilmd_conditions[condition_key] == condition_text
    # brackets without a number are no condition keys
    assert "keine Zahl" not in utilmd_conditions
    # the shorter text of condition 1 must have been replaced by the longer one
    assert "Normale Bedingung" not in utilmd_conditions.values()

0 comments on commit cdfeb02

Please sign in to comment.