From 230dc7267185af76b5e55728f9b695a1b7a7596a Mon Sep 17 00:00:00 2001 From: Walter Weinmann Date: Mon, 4 Jul 2022 01:01:55 +0200 Subject: [PATCH] Determining the numbered lists. --- docs/release_notes.md | 1 + src/dcr/nlp/cls_line_type_heading.py | 325 ++++++--------------------- 2 files changed, 68 insertions(+), 258 deletions(-) diff --git a/docs/release_notes.md b/docs/release_notes.md index 00e083e4..6a0615b9 100644 --- a/docs/release_notes.md +++ b/docs/release_notes.md @@ -13,6 +13,7 @@ Release Date: dd.mm.2022 - API documentation added - Determination of bulleted lists. +- Determination of numbered lists. - Determination of headings. ### 1.2 Modified Features diff --git a/src/dcr/nlp/cls_line_type_heading.py b/src/dcr/nlp/cls_line_type_heading.py index f7442774..6cec7328 100644 --- a/src/dcr/nlp/cls_line_type_heading.py +++ b/src/dcr/nlp/cls_line_type_heading.py @@ -60,9 +60,16 @@ def __init__(self) -> None: self._lt_heading_max_level_curr = 0 - self._heading_rules: list[ - tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]] - ] = self._init_heading_rules() + self._line_lines_idx = 0 + + self._level_prev = 0 + + self._max_line_line = 0 + self._max_page = 0 + + self._page_idx = 0 + + self._rules: list[tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]]] = self._init_rules() # ----------------------------------------------------------------------------- # Heading rules collection. @@ -80,12 +87,12 @@ def __init__(self) -> None: # 6: regexp_str: # regular expression # ----------------------------------------------------------------------------- - self._heading_rules_collection: list[ + self._rules_collection: list[ tuple[str, bool, re.Pattern[str], collections.abc.Callable[[str, str], bool], list[str], str] ] = [] - for (rule_name, is_first_token, regexp_str, function_is_asc, start_values) in self._heading_rules: - self._heading_rules_collection.append( + for (rule_name, is_first_token, regexp_str, function_is_asc, start_values) in self._rules: + self._rules_collection.append( ( rule_name.ljust(self._RULE_NAME_SIZE), is_first_token, @@ -97,7 +104,7 @@ def __init__(self) -> None: ) # ----------------------------------------------------------------------------- - # Heading rules hierarchy for determining the headings. + # Rules hierarchy for determining the headings. # ----------------------------------------------------------------------------- # 1: rule_name # 2: is_first_token: @@ -118,7 +125,7 @@ def __init__(self) -> None: # 9: regexp_str: # regular expression # ----------------------------------------------------------------------------- - self._heading_rules_hierarchy: list[ + self._rules_hierarchy: list[ tuple[ str, bool, @@ -132,15 +139,6 @@ def __init__(self) -> None: ] ] = [] - self._line_lines_idx = 0 - - self._level_prev = 0 - - self._max_line_line = 0 - self._max_page = 0 - - self._page_idx = 0 - # [ # { # "headingLevel": 99, @@ -160,44 +158,6 @@ def __init__(self) -> None: cfg.glob.logger.debug(cfg.glob.LOGGER_END) - # ----------------------------------------------------------------------------- - # Convert a roman numeral to integer. - # ----------------------------------------------------------------------------- - @classmethod - def _convert_roman_2_int(cls, roman: str) -> int: - """Convert a roman numeral to integer. - - Args: - roman (str): The roman numeral. - - Returns: - int: The corresponding integer. - """ - tallies = { - "i": 1, - "v": 5, - "x": 10, - "l": 50, - "c": 100, - "d": 500, - "m": 1000, - # specify more numerals if you wish - } - - integer: int = 0 - - for i in range(len(roman) - 1): - left = roman[i] - right = roman[i + 1] - if tallies[left] < tallies[right]: - integer -= tallies[left] - else: - integer += tallies[left] - - integer += tallies[roman[-1]] - - return integer - # ----------------------------------------------------------------------------- # Create a table of content entry. # ----------------------------------------------------------------------------- @@ -247,7 +207,7 @@ def _create_toc_entry(self, level: int, text: str) -> None: page_idx = new_page_idx if cfg.glob.setup.is_lt_heading_file_incl_regexp: - toc_entry[nlp.cls_nlp_core.NLPCore.JSON_NAME_REGEXP] = self._heading_rules_hierarchy[level - 1][8] + toc_entry[nlp.cls_nlp_core.NLPCore.JSON_NAME_REGEXP] = self._rules_hierarchy[level - 1][8] self._toc.append(toc_entry) @@ -314,14 +274,20 @@ def _get_next_body_line( # 5: start_values: # list of strings # ----------------------------------------------------------------------------- - def _init_heading_rules(self) -> list[tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]]]: + def _init_rules(self) -> list[tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]]]: + """Initialise the heading rules. + + Returns: + list[tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]]]: + The valid heading rules. + """ if cfg.glob.setup.lt_heading_rule_file and cfg.glob.setup.lt_heading_rule_file.lower() != "none": lt_heading_rule_file_path = utils.get_os_independent_name(cfg.glob.setup.lt_heading_rule_file) if os.path.isfile(lt_heading_rule_file_path): - return self._load_heading_rules_from_json(pathlib.Path(lt_heading_rule_file_path)) + return self._load_rules_from_json(pathlib.Path(lt_heading_rule_file_path)) utils.terminate_fatal( - f"File with heading rule file is missing - " f"file name '{cfg.glob.setup.lt_heading_rule_file}'" + f"File with heading rules is missing - " f"file name '{cfg.glob.setup.lt_heading_rule_file}'" ) return [ @@ -329,325 +295,168 @@ def _init_heading_rules(self) -> list[tuple[str, bool, str, collections.abc.Call "(999)", True, r"\(\d+\)$", - self._is_asc_string_integers, + nlp.cls_nlp_core.NLPCore.is_asc_string_integers, ["(1)"], ), ( "(A)", True, r"\([A-Z]\)$", - self._is_asc_uppercase_letters, + nlp.cls_nlp_core.NLPCore.is_asc_uppercase_letters, ["(A)"], ), ( "(ROM)", True, r"\(M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\)$", - self._is_asc_romans, + nlp.cls_nlp_core.NLPCore.is_asc_romans, ["(I)"], ), ( "(a)", True, r"\([a-z]\)$", - self._is_asc_lowercase_letters, + nlp.cls_nlp_core.NLPCore.is_asc_lowercase_letters, ["(a)"], ), ( "(rom)", True, r"\(m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\)$", - self._is_asc_romans, + nlp.cls_nlp_core.NLPCore.is_asc_romans, ["(i)"], ), ( "999)", True, r"\d+\)$", - self._is_asc_string_integers, + nlp.cls_nlp_core.NLPCore.is_asc_string_integers, ["1)"], ), ( "999.", True, r"\d+\.$", - self._is_asc_string_integers, + nlp.cls_nlp_core.NLPCore.is_asc_string_integers, ["1."], ), ( "999.999", True, r"\d+\.\d\d\d$", - self._is_asc_string_floats, + nlp.cls_nlp_core.NLPCore.is_asc_string_floats, ["1.000", "1.001"], ), ( "999.99", True, r"\d+\.\d\d$", - self._is_asc_string_floats, + nlp.cls_nlp_core.NLPCore.is_asc_string_floats, ["1.00", "1.01"], ), ( "999.9", True, r"\d+\.\d$", - self._is_asc_string_floats, + nlp.cls_nlp_core.NLPCore.is_asc_string_floats, ["1.0", "1.1"], ), ( "A)", True, r"[A-Z]\)$", - self._is_asc_uppercase_letters, + nlp.cls_nlp_core.NLPCore.is_asc_uppercase_letters, ["A)"], ), ( "A.", True, r"[A-Z]\.$", - self._is_asc_uppercase_letters, + nlp.cls_nlp_core.NLPCore.is_asc_uppercase_letters, ["A."], ), ( "ROM)", True, r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\)$", - self._is_asc_romans, + nlp.cls_nlp_core.NLPCore.is_asc_romans, ["I)"], ), ( "ROM.", True, r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.$", - self._is_asc_romans, + nlp.cls_nlp_core.NLPCore.is_asc_romans, ["I."], ), ( "a)", True, r"[a-z]\)$", - self._is_asc_lowercase_letters, + nlp.cls_nlp_core.NLPCore.is_asc_lowercase_letters, ["a)"], ), ( "a.", True, r"[a-z]\.$", - self._is_asc_lowercase_letters, + nlp.cls_nlp_core.NLPCore.is_asc_lowercase_letters, ["a."], ), ( "rom)", True, r"m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\)$", - self._is_asc_romans, + nlp.cls_nlp_core.NLPCore.is_asc_romans, ["i)"], ), ( "rom.", True, r"m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\.$", - self._is_asc_romans, + nlp.cls_nlp_core.NLPCore.is_asc_romans, ["i."], ), ] # ----------------------------------------------------------------------------- - # Ignore the comparison. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_ignore(cls, _predecessor: str, _successor: str) -> bool: - """Ignore the comparison. - - Returns: - bool: True. - """ - return True - - # ----------------------------------------------------------------------------- - # Compare two lowercase letters on difference ascending 1. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_lowercase_letters(cls, predecessor: str, successor: str) -> bool: - """Compare two lowercase_letters on ascending. - - Args: - predecessor (str): The previous string. - successor (str): The current string. - - Returns: - bool: True, if the successor - predecessor is equal to 1, False else. - """ - if (predecessor_ints := re.findall(r"[a-z]", predecessor.lower())) and ( - successor_ints := re.findall(r"[a-z]", successor.lower()) - ): - if ord(successor_ints[0]) - ord(predecessor_ints[0]) == 1: - return True - - return False - - # ----------------------------------------------------------------------------- - # Compare two roman numerals on ascending. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_romans(cls, predecessor: str, successor: str) -> bool: - """Compare two roman numerals on ascending. - - Args: - predecessor (str): The previous roman numeral. - successor (str): The current roman numeral. - - Returns: - bool: False, if the predecessor is greater than the current value, True else. - """ - # TBD depending on different regexp patterns - # if predecessor[0] == "(": - # predecessor_net = predecessor[1:-1] - # successor_net = successor[1:-1] - # else: - # predecessor_net = predecessor - # successor_net = successor - - if predecessor[0:1] == "(": - predecessor_net = predecessor[1:] - else: - predecessor_net = predecessor - if predecessor_net[-1] in {")", "."}: - predecessor_net = predecessor_net[:-1] - - if successor[0:1] == "(": - successor_net = successor[1:] - else: - successor_net = successor - if successor_net[-1] in {")", "."}: - successor_net = successor_net[:-1] - - if ( - LineTypeHeading._convert_roman_2_int(successor_net.lower()) - - LineTypeHeading._convert_roman_2_int(predecessor_net.lower()) - == 1 - ): - return True - - return False - - # ----------------------------------------------------------------------------- - # Compare two strings on ascending. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_strings(cls, predecessor: str, successor: str) -> bool: - """Compare two strings on ascending. - - Args: - predecessor (str): The previous string. - successor (str): The current string. - - Returns: - bool: False, if the predecessor is greater than the current value, True else. - """ - if predecessor > successor: - return False - - return True - - # ----------------------------------------------------------------------------- - # Compare two string floats on ascending. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_string_floats(cls, predecessor: str, successor: str) -> bool: - """Compare two string float numbers on ascending. - - Args: - predecessor (str): The previous string float number. - successor (str): The current string float number. - - Returns: - bool: False, if the predecessor is greater than the current value, True else. - """ - if (predecessor_floats := re.findall(r"\d+\.\d+", predecessor)) and ( - successor_floats := re.findall(r"\d+\.\d+", successor) - ): - if 0 < float(successor_floats[0]) - float(predecessor_floats[0]) <= 1: - return True - - return False - - # ----------------------------------------------------------------------------- - # Compare two string integers on difference ascending 1. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_string_integers(cls, predecessor: str, successor: str) -> bool: - """Compare two string integers on ascending. - - Args: - predecessor (str): The previous string integer. - successor (str): The current string integer. - - Returns: - bool: True, if the successor - predecessor is equal to 1, False else. - """ - if (predecessor_ints := re.findall(r"\d+", predecessor)) and (successor_ints := re.findall(r"\d+", successor)): - if int(successor_ints[0]) - int(predecessor_ints[0]) == 1: - return True - - return False - - # ----------------------------------------------------------------------------- - # Compare two uppercase letters on difference ascending 1. - # ----------------------------------------------------------------------------- - @classmethod - def _is_asc_uppercase_letters(cls, predecessor: str, successor: str) -> bool: - """Compare two uppercase_letters on ascending. - - Args: - predecessor (str): The previous string. - successor (str): The current string. - - Returns: - bool: True, if the successor - predecessor is equal to 1, False else. - """ - if (predecessor_ints := re.findall(r"[A-Z]", predecessor.upper())) and ( - successor_ints := re.findall(r"[A-Z]", successor.upper()) - ): - if ord(successor_ints[0]) - ord(predecessor_ints[0]) == 1: - return True - - return False - - # ----------------------------------------------------------------------------- - # Load heading rules from a JSON file. + # Load the valid heading rules from a JSON file. # ----------------------------------------------------------------------------- @staticmethod - def _load_heading_rules_from_json( + def _load_rules_from_json( lt_heading_rule_file: pathlib.Path, ) -> list[tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]]]: - """Load heading rules from a JSON file. + """Load the valid heading rules from a JSON file. Args: - lt_heading_rule_file (Path): JSON file. + lt_heading_rule_file (Path): + JSON file. + + Returns: + list[tuple[str, bool, str, collections.abc.Callable[[str, str], bool], list[str]]]: + The valid heading rules from the JSON file, """ - heading_rules = [] + rules = [] with open(lt_heading_rule_file, "r", encoding=cfg.glob.FILE_ENCODING_DEFAULT) as file_handle: json_data = json.load(file_handle) - for heading_rule in json_data[nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE_HEADING_RULES]: - heading_rules.append( + for rule in json_data[nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE_HEADING_RULES]: + rules.append( ( - heading_rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_NAME], - heading_rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_IS_FIRST_TOKEN], - heading_rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_REGEXP], + rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_NAME], + rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_IS_FIRST_TOKEN], + rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_REGEXP], getattr( - LineTypeHeading, "_is_asc_" + heading_rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_FUNCTION_IS_ASC] + nlp.cls_nlp_core.NLPCore, "is_asc_" + rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_FUNCTION_IS_ASC] ), - heading_rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_START_VALUES], + rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_START_VALUES], ) ) utils.progress_msg(f"The heading rules were successfully loaded from the file {cfg.glob.setup.lt_heading_rule_file}") - return heading_rules + return rules # ----------------------------------------------------------------------------- # Process the line-related data. @@ -675,7 +484,7 @@ def _process_line(self, line_line: dict[str, str]) -> int: # noqa: C901 first_token = text.split()[0] coord_llx_curr = line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX] - for ph_idx in reversed(range(ph_size := len(self._heading_rules_hierarchy))): + for ph_idx in reversed(range(ph_size := len(self._rules_hierarchy))): ( rule_name, is_first_token, @@ -686,7 +495,7 @@ def _process_line(self, line_line: dict[str, str]) -> int: # noqa: C901 coord_llx, predecessor, regexp_str, - ) = self._heading_rules_hierarchy[ph_idx] + ) = self._rules_hierarchy[ph_idx] target_value = first_token if is_first_token else text @@ -700,7 +509,7 @@ def _process_line(self, line_line: dict[str, str]) -> int: # noqa: C901 return 0 if function_is_asc(predecessor, target_value): - self._heading_rules_hierarchy[ph_idx] = ( + self._rules_hierarchy[ph_idx] = ( rule_name, is_first_token, regexp_compiled, @@ -724,7 +533,7 @@ def _process_line(self, line_line: dict[str, str]) -> int: # noqa: C901 # Delete levels that are no longer needed if ph_size > level: for i in range(ph_size - 1, level - 1, -1): - del self._heading_rules_hierarchy[i] + del self._rules_hierarchy[i] return level @@ -737,7 +546,7 @@ def _process_line(self, line_line: dict[str, str]) -> int: # noqa: C901 function_is_asc, start_values, regexp_str, - ) in self._heading_rules_collection: + ) in self._rules_collection: target_value = first_token if is_first_token else text if regexp_compiled.match(target_value): if is_first_token and start_values: @@ -747,7 +556,7 @@ def _process_line(self, line_line: dict[str, str]) -> int: # noqa: C901 if (level := self._level_prev + 1) > cfg.glob.setup.lt_heading_max_level: return 0 - self._heading_rules_hierarchy.append( + self._rules_hierarchy.append( ( rule_name, is_first_token,