Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle multiple lines correctly #28

Merged
merged 7 commits into from
Sep 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions ahbextractor/helper/write_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,26 @@


def parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph: Paragraph,
paragraphs: List[Paragraph],
dataframe: pd.DataFrame,
row_index: int,
edifact_struktur_cell_left_indent_position: int,
):
"""Parses a paragraph in the edifact struktur column and puts the information into the appropriate columns

Args:
paragraph (Paragraph): Current paragraph in the edifact struktur cell
paragraphs (List[Paragraph]): Current paragraphs in the edifact struktur cell
dataframe (pd.DataFrame): Contains all infos
row_index (int): Current index of the DataFrame
edifact_struktur_cell_left_indent_position (int): Position of the left indent from the indicator edifact
struktur cell
"""

splitted_text_at_tabs = paragraph.text.split("\t")
tab_count = paragraph.text.count("\t")
joined_text = " ".join(p.text for p in paragraphs)
splitted_text_at_tabs = joined_text.split("\t")
tab_count = joined_text.count("\t")

# Check if the line starts on the far left
if paragraph.paragraph_format.left_indent != edifact_struktur_cell_left_indent_position:
if paragraphs[0].paragraph_format.left_indent != edifact_struktur_cell_left_indent_position:

if tab_count == 2:
dataframe.at[row_index, "Segment Gruppe"] = splitted_text_at_tabs[0]
Expand All @@ -42,13 +42,14 @@ def parse_paragraph_in_edifact_struktur_column_to_dataframe(
elif tab_count == 1:
dataframe.at[row_index, "Segment Gruppe"] = splitted_text_at_tabs[0]
dataframe.at[row_index, "Segment"] = splitted_text_at_tabs[1]
elif tab_count == 0 and not paragraph.text == "":
if paragraph.runs[0].bold:
elif tab_count == 0 and joined_text.strip() != "":
if paragraphs[0].runs[0].bold:
# Segmentgruppe: SG8
dataframe.at[row_index, "Segment Gruppe"] = splitted_text_at_tabs[0]
else:
# Segmentname: Referenzen auf die ID der\nTranche
if dataframe.at[row_index, "Segment Gruppe"] == "":
_sg_text = dataframe.at[row_index, "Segment Gruppe"]
if _sg_text == "":
# Referenzen auf die ID der
dataframe.at[row_index, "Segment Gruppe"] = splitted_text_at_tabs[0]
else:
Expand Down Expand Up @@ -155,7 +156,7 @@ def write_segment_name_to_dataframe(
# EDIFACT STRUKTUR COLUMN
for paragraph in edifact_struktur_cell.paragraphs:
parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph=paragraph,
paragraphs=[paragraph],
dataframe=elixir.soul,
row_index=elixir.current_df_row_index,
edifact_struktur_cell_left_indent_position=elixir.edifact_struktur_left_indent_position,
Expand Down Expand Up @@ -195,7 +196,8 @@ def write_segmentgruppe_to_dataframe(

# EDIFACT STRUKTUR COLUMN
parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph=edifact_struktur_cell.paragraphs[0],
# there might be 2 paragraphs in case of multi line headings, so we're handing over all the paragraphs
paragraphs=edifact_struktur_cell.paragraphs,
dataframe=elixir.soul,
row_index=elixir.current_df_row_index,
edifact_struktur_cell_left_indent_position=elixir.edifact_struktur_left_indent_position,
Expand Down Expand Up @@ -235,7 +237,7 @@ def write_segment_to_dataframe(

# EDIFACT STRUKTUR COLUMN
parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph=edifact_struktur_cell.paragraphs[0],
paragraphs=edifact_struktur_cell.paragraphs,
dataframe=elixir.soul,
row_index=elixir.current_df_row_index,
edifact_struktur_cell_left_indent_position=elixir.edifact_struktur_left_indent_position,
Expand Down Expand Up @@ -318,7 +320,7 @@ def write_dataelement_to_dataframe(

# EDIFACT STRUKTUR COLUMN
parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph=edifact_struktur_cell.paragraphs[0],
paragraphs=edifact_struktur_cell.paragraphs,
dataframe=elixir.soul,
row_index=elixir.current_df_row_index,
edifact_struktur_cell_left_indent_position=elixir.edifact_struktur_left_indent_position,
Expand Down Expand Up @@ -362,7 +364,7 @@ def write_dataelement_to_dataframe(

if edifact_struktur_cell.paragraphs[0].text != "":
parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph=edifact_struktur_cell.paragraphs[0],
paragraphs=edifact_struktur_cell.paragraphs,
dataframe=elixir.soul,
row_index=elixir.current_df_row_index,
edifact_struktur_cell_left_indent_position=elixir.edifact_struktur_left_indent_position,
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ profile = "black"
[tool.pylint."MESSAGES CONTROL"]
max-line-length = 120
good-names=["i", "j","k", "ex", "Run", "_", "df"]

[tool.pytest.ini_options]
pythonpath = [
"."
]
6 changes: 3 additions & 3 deletions unittests/test_write_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,10 @@ def test_parse_paragraph_in_edifact_struktur_column_to_dataframe(

# insert text
self.test_cell.text = text_content
test_paragraph = self.test_cell.paragraphs[0]
test_paragraph = [self.test_cell.paragraphs[0]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ich weiß, ich hab das auch nicht gemacht, aber ein Unittest für eine mehrzeilige Überschrift wäre nice.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bin generell mit dem test setup unzufrieden da es noch recht komplex ist.
Ich nehme mal den Punkt in ein Ticket auf, damit wir es nicht vergessen.
Würde das erstmal hinten anstellen wenn es okay ist.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#30


# set left indent position
test_paragraph.paragraph_format.left_indent = left_indent_position
test_paragraph[0].paragraph_format.left_indent = left_indent_position

# Initial two dataframes ...
df = pd.DataFrame(columns=expected_df_row.keys(), dtype="str")
Expand All @@ -340,7 +340,7 @@ def test_parse_paragraph_in_edifact_struktur_column_to_dataframe(
expected_df.loc[row_index] = initial_dataframe_row

parse_paragraph_in_edifact_struktur_column_to_dataframe(
paragraph=test_paragraph,
paragraphs=test_paragraph,
dataframe=df,
row_index=row_index,
edifact_struktur_cell_left_indent_position=self.edifact_struktur_cell_left_indent_position_of_indicator_paragraph,
Expand Down