Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
Determining the numbered lists.
Browse files Browse the repository at this point in the history
  • Loading branch information
walter-weinmann committed Jul 4, 2022
1 parent 230dc72 commit ed01039
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 57 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ lt_list_bullet_rule_file = none
lt_list_bullet_tolerance_llx = 5
lt_list_number_min_entries = 2
lt_list_number_rule_file = none
lt_list_number_tolerance_llx = 5
lt_list_number_tolerance_llx = 10
lt_table_file_incl_empty_columns = true
lt_toc_last_page = 5
lt_toc_min_entries = 5
Expand Down
2 changes: 1 addition & 1 deletion src/dcr/nlp/cls_line_type_list_bullet.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _process_line(self, line_line: dict[str, float | int | str]) -> None: # noq
self._line_lines_idx_from = self._line_lines_idx
self._line_lines_idx_till = self._line_lines_idx
self._llx_lower_limit = round(
coord_llx := float(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX])
(coord_llx := float(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX]))
* (100 - cfg.glob.setup.lt_list_bullet_tolerance_llx)
/ 100,
2,
Expand Down
122 changes: 67 additions & 55 deletions src/dcr/nlp/cls_line_type_list_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
List = dict[str, Entries | float | int | str]
Lists = list[List]

RuleExtern = tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]
RuleIntern = tuple[str, re.Pattern[str], collections.abc.Callable[[str, str], bool], list[str], str]


# pylint: disable=too-many-instance-attributes
class LineTypeListNumber:
Expand Down Expand Up @@ -52,16 +55,8 @@ def __init__(self) -> None:

self._RULE_NAME_SIZE: int = 20

# page_idx, para_no, line_lines_idx, target_value
self._entries: list[
tuple[
int,
int,
int,
int,
str,
]
] = []
# page_idx, para_no, line_lines_idx_from, line_lines_idx_till, target_value
self._entries: list[list[int | str]] = []

self._line_lines_idx = -1

Expand All @@ -78,9 +73,9 @@ def __init__(self) -> None:
self._para_no = 0
self._para_no_prev = 0

self._rule: tuple[str, collections.abc.Callable[[str, str], bool], list[str], str] = () # type: ignore
self._rule: RuleIntern = () # type: ignore

self._rules: list[tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]] = self._init_rules()
self._rules: list[RuleExtern] = self._init_rules()

# -----------------------------------------------------------------------------
# Number rules collection.
Expand All @@ -95,9 +90,7 @@ def __init__(self) -> None:
# 5: regexp_str:
# regular expression
# -----------------------------------------------------------------------------
self._rules_collection: list[
tuple[str, re.Pattern[str], collections.abc.Callable[[str, str], bool], list[str], str]
] = []
self._rules_collection: list[RuleIntern] = []

for (rule_name, regexp_str, function_is_asc, start_values) in self._rules:
self._rules_collection.append(
Expand Down Expand Up @@ -152,7 +145,7 @@ def _finish_list(self) -> None:

text = []

for idx in range(line_lines_idx_from, line_lines_idx_till + 1):
for idx in range(int(line_lines_idx_from), int(line_lines_idx_till) + 1):
line_lines[idx][
nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE
] = db.cls_document.Document.DOCUMENT_LINE_TYPE_LIST_NUMBER
Expand All @@ -172,9 +165,9 @@ def _finish_list(self) -> None:
entries.append(
{
nlp.cls_nlp_core.NLPCore.JSON_NAME_ENTRY_NO: len(entries) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_NO_PAGE_FROM: line_lines_idx_from + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_NO_PAGE_TILL: line_lines_idx_till + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO: page_idx + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_NO_PAGE_FROM: int(line_lines_idx_from) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_NO_PAGE_TILL: int(line_lines_idx_till) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO: int(page_idx) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PARA_NO: para_no,
nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT: " ".join(text),
}
Expand All @@ -196,8 +189,8 @@ def _finish_list(self) -> None:
nlp.cls_nlp_core.NLPCore.JSON_NAME_NUMBER: self._rule[0].rstrip(),
nlp.cls_nlp_core.NLPCore.JSON_NAME_LIST_NO: self.no_lists,
nlp.cls_nlp_core.NLPCore.JSON_NAME_NO_ENTRIES: len(entries),
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_FROM: self._entries[0][0] + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_TILL: self._entries[-1][0] + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_FROM: int(self._entries[0][0]) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_TILL: int(self._entries[-1][0]) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_ENTRIES: entries,
}
)
Expand All @@ -219,12 +212,12 @@ def _finish_list(self) -> None:
# 4: start_values:
# list of strings
# -----------------------------------------------------------------------------
def _init_rules(self) -> list[tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]]:
def _init_rules(self) -> list[RuleExtern]:
"""Initialise the numbered list rules.
Returns:
list[tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]]:
The valid heading rules.
The valid numbered list rules.
"""
if cfg.glob.setup.lt_list_number_rule_file and cfg.glob.setup.lt_list_number_rule_file.lower() != "none":
lt_list_number_rule_file_path = utils.get_os_independent_name(cfg.glob.setup.lt_list_number_rule_file)
Expand Down Expand Up @@ -352,7 +345,7 @@ def _init_rules(self) -> list[tuple[str, str, collections.abc.Callable[[str, str
@staticmethod
def _load_rules_from_json(
lt_list_number_rule_file: pathlib.Path,
) -> list[tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]]:
) -> list[RuleExtern]:
"""Load numbered list rules from a JSON file.
Args:
Expand All @@ -363,7 +356,7 @@ def _load_rules_from_json(
list[tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]]:
The valid numbered list rules from the JSON file.
"""
rules = []
rules: list[RuleExtern] = []

with open(lt_list_number_rule_file, "r", encoding=cfg.glob.FILE_ENCODING_DEFAULT) as file_handle:
json_data = json.load(file_handle)
Expand Down Expand Up @@ -399,35 +392,51 @@ def _process_line(self, line_line: dict[str, float | int | str]) -> None: # noq
para_no = int(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_PARA_NO])
target_value = str(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT]).split()[0]

rule: tuple[str, collections.abc.Callable[[str, str], bool], list[str], str] = () # type: ignore

for (
rule_name,
regexp_compiled,
function_is_asc,
start_values,
regexp_str,
) in self._rules_collection:
if regexp_compiled.match(target_value):
rule = (rule_name, function_is_asc, start_values, regexp_str)
break

if not rule:
if self._page_idx == self._page_idx_prev and para_no == self._para_no_prev:
# Paragraph already in progress.
return

self._finish_list()
return
print(
f"wwe para_no={para_no} - target_value={target_value} - "
+ f"text={line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT]}"
)

if (
rule != self._rule
or self._llx_upper_limit
<= float(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX])
<= self._llx_lower_limit
or not rule[1](self._entries[-1][4], target_value)
):
self._finish_list()
if self._rule:
if self._rule[1].match(target_value):
print(f"wwe hit={target_value}")
if self._llx_lower_limit <= float(
line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX]
) <= self._llx_upper_limit and self._rule[2](str(self._entries[-1][4]), target_value):
self._entries.append([self._page_idx, para_no, self._line_lines_idx, self._line_lines_idx, target_value])
print(f"wwe 1 self._entries={self._entries}")
self._no_entries += 1
return

self._finish_list()

rule: RuleIntern = () # type: ignore

# Each elem is a tuple: (rule_name, regexp_compiled, function_is_asc, start_values, regexp_str).
for elem in self._rules_collection:
if not elem[1].match(target_value):
continue

if elem[3]:
if target_value not in elem[3]:
continue

rule = elem
break

if rule:
if self._rule:
self._finish_list()
else:
if self._rule:
if self._page_idx == self._page_idx_prev and para_no == self._para_no_prev:
# Paragraph already in progress.
self._entries[-1][-2] = self._line_lines_idx
print(f"wwe 2 self._entries={self._entries}")
else:
self._finish_list()

return

self._rule = rule

Expand All @@ -436,14 +445,15 @@ def _process_line(self, line_line: dict[str, float | int | str]) -> None: # noq
self._line_lines_idx_from = self._line_lines_idx
self._line_lines_idx_till = self._line_lines_idx
self._llx_lower_limit = round(
coord_llx := float(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX])
(coord_llx := float(line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX]))
* (100 - cfg.glob.setup.lt_list_number_tolerance_llx)
/ 100,
2,
)
self._llx_upper_limit = round(coord_llx * (100 + cfg.glob.setup.lt_list_number_tolerance_llx) / 100, 2)

self._entries.append((self._page_idx, para_no, self._line_lines_idx, self._line_lines_idx, target_value))
self._entries.append([self._page_idx, para_no, self._line_lines_idx, self._line_lines_idx, target_value])
print(f"wwe 3 self._entries={self._entries}")

self._no_entries += 1

Expand Down Expand Up @@ -507,6 +517,8 @@ def _reset_list(self) -> None:

self._predecessor = ""

self._rule = () # type: ignore

utils.progress_msg_line_type_list_number("LineTypeListNumber: Reset the list memory")

# -----------------------------------------------------------------------------
Expand Down
Binary file modified tests/__PYTEST_FILES__/docx_list_bullet.docx
Binary file not shown.
Binary file modified tests/__PYTEST_FILES__/docx_list_bullet.pdf
Binary file not shown.
Binary file modified tests/__PYTEST_FILES__/docx_list_number.docx
Binary file not shown.
Binary file modified tests/__PYTEST_FILES__/docx_list_number.pdf
Binary file not shown.

0 comments on commit ed01039

Please sign in to comment.