Line type table.

KonnexionsGmbH · Jun 26, 2022 · ebae2e8 · ebae2e8
1 parent 68ff67a
commit ebae2e8
Show file tree

Hide file tree

Showing 10 changed files with 103 additions and 54 deletions.
diff --git a/docs/index.md b/docs/index.md
@@ -863,6 +863,14 @@ The following parameters control the classification of the headings:
 
 Default value: **`true`** - if true, a **`JSON`** file named `<document_name>_toc.json` is created in the file directory `data_accepted` with the identified headings.
 
+**`heading_file_incl_no_ctx`**
+
+Default value: **`1`** - the number of lines (`n`) following the heading that are included as context in the **`JSON`** file.
+
+**`heading_file_incl_regexp`**
+
+Default value: **`false`** - if true, the regular expression for the heading is included in the **`JSON`** file.
+
 **`heading_max_level`**
 
 Default value: **`3`** - the maximum number of hierarchical heading levels.
@@ -876,14 +884,6 @@ Default value: **`2`** - the minimum number of document pages for determining he
 Default value: **`none`** - name of a file including file directory that contains the rules for determining the headings.
 **`none`** means that the given default rules are applied.
 
-**`heading_toc_incl_no_ctx`**
-
-Default value: **`1`** - the `n` lines following the heading are included as context into the **`JSON`** file.
-
-**`heading_toc_incl_regexp`**
-
-Default value: **`false`** - if true, the regular expression for the heading is included in the **`JSON`** file.
-
 **`heading_tolerance_x`**
 
 Default value: **`5`** - percentage tolerance for the differences in indentation of a heading at the same level.

diff --git a/docs/running_configuration.md b/docs/running_configuration.md
@@ -142,11 +142,11 @@ The customisable entries are:
     directory_inbox_accepted = data/inbox_prod_accepted
     directory_inbox_rejected = data/inbox_prod_rejected
     doc_id_in_file_name = none
+    heading_file_incl_no_ctx = 1
+    heading_file_incl_regexp = false
     heading_max_level = 3
     heading_min_pages = 2
     heading_rule_file = none
-    heading_toc_incl_no_ctx = 1
-    heading_toc_incl_regexp = false
     heading_tolerance_x = 5
     ignore_duplicates = false
     initial_database_data = data/initial_database_data.json
@@ -193,11 +193,11 @@ The customisable entries are:
 | directory_inbox_accepted          | **`data/inbox_prod_accepted`**          | Directory for the accepted documents.                                                                           |
 | directory_inbox_rejected          | **`data/inbox_prod_rejected`**          | Directory for the rejected documents.                                                                           |
 | doc_id_in_file_name               | **`none`**                              | Position of the document id in the file name : <br>**`after`**, **`before`** or **`none`**.                     |
+| heading_file_incl_no_ctx          | **`1`**                                 | The number of lines following the heading to be included as context in the **`JSON`** file.                     |
+| heading_file_incl_regexp          | **`false`**                             | If it is set to **`true`**, the regular expression for the heading is included in the **`JSON`** file.          |
 | heading_max_level                 | **`3`**                                 | Maximum level of the heading structure.                                                                         |
 | heading_min_pages                 | **`2`**                                 | Minimum number of pages to determine the headings.                                                              |
 | heading_rule_file                 | **`none`**                              | File with rules to determine the headings.                                                                      |
-| heading_toc_incl_no_ctx           | **`1`**                                 | The number of lines following the heading to be included as context into the **`JSON`** file.                   |
-| heading_toc_incl_regexp           | **`false`**                             | If it is set to **`true`**, the regular expression for the heading is included in the **`JSON`** file.          |
 | heading_tolerance_x               | **`5`**                                 | Percentage tolerance for the differences in indentation of a heading at the same level.                         |
 | ignore_duplicates                 | **`false`**                             | Accept presumably duplicated documents <br/>based on a SHA256 hash key.                                         |
 | initial_database_data             | **`data/initial_database_data.json`**   | File with initial database contents.                                                                            |

diff --git a/setup.cfg b/setup.cfg
@@ -17,16 +17,16 @@ db_password_admin = postgresql
 db_schema = dcr_schema
 db_user = dcr_user
 db_user_admin = dcr_user_admin
-delete_auxiliary_files = false
+delete_auxiliary_files = true
 directory_inbox = data/inbox_prod
 directory_inbox_accepted = data/inbox_prod_accepted
 directory_inbox_rejected = data/inbox_prod_rejected
 doc_id_in_file_name = none
+heading_file_incl_no_ctx = 1
+heading_file_incl_regexp = false
 heading_max_level = 3
 heading_min_pages = 2
 heading_rule_file = none
-heading_toc_incl_no_ctx = 1
-heading_toc_incl_regexp = false
 heading_tolerance_x = 5
 ignore_duplicates = false
 initial_database_data = data/initial_database_data.json
@@ -64,16 +64,16 @@ db_password = postgresql
 db_password_admin = postgresql
 db_user = dcr_user
 db_user_admin = dcr_user_admin
-delete_auxiliary_files = true
+delete_auxiliary_files = false
 directory_inbox = data/inbox_dev
 directory_inbox_accepted = data/inbox_dev_accepted
 directory_inbox_rejected = data/inbox_dev_rejected
 doc_id_in_file_name = none
+heading_file_incl_no_ctx = 3
+heading_file_incl_regexp = true
 heading_max_level = 3
 heading_min_pages = 2
 heading_rule_file = data/line_type_heading_rules_test.json
-heading_toc_incl_no_ctx = 3
-heading_toc_incl_regexp = true
 heading_tolerance_x = 80
 ignore_duplicates = false
 initial_database_data = data/initial_database_data_dev.json
@@ -116,11 +116,11 @@ directory_inbox = data/inbox_test
 directory_inbox_accepted = data/inbox_test_accepted
 directory_inbox_rejected = data/inbox_test_rejected
 doc_id_in_file_name = after
+heading_file_incl_no_ctx = 3
+heading_file_incl_regexp = false
 heading_max_level = 3
 heading_min_pages = 2
 heading_rule_file = data/line_type_heading_rules_test.json
-heading_toc_incl_no_ctx = 3
-heading_toc_incl_regexp = false
 heading_tolerance_x = 5
 ignore_duplicates = false
 initial_database_data = data/initial_database_data_test.json

diff --git a/src/dcr/cfg/cls_setup.py b/src/dcr/cfg/cls_setup.py
@@ -43,11 +43,11 @@ class Setup:
     _DCR_CFG_DIRECTORY_INBOX_REJECTED: ClassVar[str] = "directory_inbox_rejected"
     _DCR_CFG_DOC_ID_IN_FILE_NAME: ClassVar[str] = "doc_id_in_file_name"
     _DCR_CFG_FILE: ClassVar[str] = "setup.cfg"
+    _DCR_CFG_HEADING_FILE_INCL_NO_CTX: ClassVar[str] = "heading_file_incl_no_ctx"
+    _DCR_CFG_HEADING_FILE_INCL_REGEXP: ClassVar[str] = "heading_file_incl_regexp"
     _DCR_CFG_HEADING_MAX_LEVEL: ClassVar[str] = "heading_max_level"
     _DCR_CFG_HEADING_MIN_PAGES: ClassVar[str] = "heading_min_pages"
     _DCR_CFG_HEADING_RULE_FILE: ClassVar[str] = "heading_rule_file"
-    _DCR_CFG_HEADING_TOC_INCL_NO_CTX: ClassVar[str] = "heading_toc_incl_no_ctx"
-    _DCR_CFG_HEADING_TOC_INCL_REGEXP: ClassVar[str] = "heading_toc_incl_regexp"
     _DCR_CFG_HEADING_TOLERANCE_X: ClassVar[str] = "heading_tolerance_x"
     _DCR_CFG_IGNORE_DUPLICATES: ClassVar[str] = "ignore_duplicates"
     _DCR_CFG_INITIAL_DATABASE_DATA: ClassVar[str] = "initial_database_data"
@@ -195,14 +195,13 @@ def __init__(self) -> None:
         self.directory_inbox_accepted = utils.get_os_independent_name("data/inbox_accepted")
         self.directory_inbox_rejected = utils.get_os_independent_name("data/inbox_rejected")
         self.doc_id_in_file_name = "none"
+        self.heading_file_incl_no_ctx = 1
+
+        self.is_heading_file_incl_regexp = False
+
         self.heading_max_level = 3
         self.heading_min_pages = 2
         self.heading_rule_file = "none"
-
-        self.heading_toc_incl_no_ctx = 1
-
-        self.is_heading_toc_incl_regexp = False
-
         self.heading_tolerance_x = 5
 
         self.is_ignore_duplicates = False
@@ -349,18 +348,18 @@ def _check_config(self) -> None:
         self._check_config_directory_inbox_rejected()
         self._check_config_doc_id_in_file_name()
 
+        self.heading_file_incl_no_ctx = self._determine_config_param_integer(
+            Setup._DCR_CFG_HEADING_FILE_INCL_NO_CTX, self.heading_file_incl_no_ctx
+        )
+        self.is_heading_file_incl_regexp = self._determine_config_param_boolean(
+            Setup._DCR_CFG_HEADING_FILE_INCL_REGEXP, self.is_heading_file_incl_regexp
+        )
         self.heading_max_level = self._determine_config_param_integer(
             Setup._DCR_CFG_HEADING_MAX_LEVEL, self.heading_max_level
         )
         self.heading_min_pages = self._determine_config_param_integer(
             Setup._DCR_CFG_HEADING_MIN_PAGES, self.heading_min_pages
         )
-        self.heading_toc_incl_no_ctx = self._determine_config_param_integer(
-            Setup._DCR_CFG_HEADING_TOC_INCL_NO_CTX, self.heading_toc_incl_no_ctx
-        )
-        self.is_heading_toc_incl_regexp = self._determine_config_param_boolean(
-            Setup._DCR_CFG_HEADING_TOC_INCL_REGEXP, self.is_heading_toc_incl_regexp
-        )
         self.heading_tolerance_x = self._determine_config_param_integer(
             Setup._DCR_CFG_HEADING_TOLERANCE_X, self.heading_tolerance_x
         )
@@ -843,10 +842,10 @@ def _load_config(self) -> None:
                     | Setup._DCR_CFG_DIRECTORY_INBOX_ACCEPTED
                     | Setup._DCR_CFG_DIRECTORY_INBOX_REJECTED
                     | Setup._DCR_CFG_DOC_ID_IN_FILE_NAME
+                    | Setup._DCR_CFG_HEADING_FILE_INCL_NO_CTX
+                    | Setup._DCR_CFG_HEADING_FILE_INCL_REGEXP
                     | Setup._DCR_CFG_HEADING_MAX_LEVEL
                     | Setup._DCR_CFG_HEADING_MIN_PAGES
-                    | Setup._DCR_CFG_HEADING_TOC_INCL_NO_CTX
-                    | Setup._DCR_CFG_HEADING_TOC_INCL_REGEXP
                     | Setup._DCR_CFG_HEADING_TOLERANCE_X
                     | Setup._DCR_CFG_IGNORE_DUPLICATES
                     | Setup._DCR_CFG_JSON_INDENT

diff --git a/src/dcr/nlp/cls_line_type_heading.py b/src/dcr/nlp/cls_line_type_heading.py
@@ -244,12 +244,12 @@ def _create_toc_entry(self, level: int, text: str) -> None:
             nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO: self._page_idx + 1,
         }
 
-        if cfg.glob.setup.heading_toc_incl_no_ctx > 0:
+        if cfg.glob.setup.heading_file_incl_no_ctx > 0:
             page_idx = self._page_idx
             line_lines: LineLines = cfg.glob.text_parser.parse_result_line_lines
             line_lines_idx = self._line_lines_idx + 1
 
-            for idx in range(cfg.glob.setup.heading_toc_incl_no_ctx):
+            for idx in range(cfg.glob.setup.heading_file_incl_no_ctx):
                 (line, new_page_idx, new_line_lines, new_line_lines_idx) = self._get_next_body_line(
                     page_idx, line_lines, line_lines_idx
                 )
@@ -263,7 +263,7 @@ def _create_toc_entry(self, level: int, text: str) -> None:
                 line_lines = new_line_lines
                 line_lines_idx = new_line_lines_idx
 
-        if cfg.glob.setup.is_heading_toc_incl_regexp:
+        if cfg.glob.setup.is_heading_file_incl_regexp:
             toc_entry[nlp.cls_nlp_core.NLPCore.JSON_NAME_REGEXP] = self._heading_rules_hierarchy[level - 1][8]
 
         self._toc.append(toc_entry)
@@ -649,7 +649,8 @@ def _process_line(self, line_line: dict[str, str]) -> int:  # noqa: C901
         Returns:
             int: The heading level or zero.
         """
-        text = line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT]
+        if (text := line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT]) == "":
+            return 0
 
         for (rule_name, pattern) in self._anti_patterns:
             if pattern.match(text):

diff --git a/src/dcr/nlp/cls_line_type_toc.py b/src/dcr/nlp/cls_line_type_toc.py
@@ -129,14 +129,15 @@ def _process_page_lines(self) -> None:
 
         for line_line in cfg.glob.text_parser.parse_result_line_lines:
             if line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE] == db.cls_document.Document.DOCUMENT_LINE_TYPE_BODY:
-                line_tokens = line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT].split()
-                try:
-                    self._process_toc_candidate_line_line(line_line, int(line_tokens[-1]))
-                except ValueError:
-                    self._check_toc_candidate()
-
-                if self._is_toc_existing:
-                    break
+                if (text := line_line[nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT]) != "":
+                    line_tokens = text.split()
+                    try:
+                        self._process_toc_candidate_line_line(line_line, int(line_tokens[-1]))
+                    except ValueError:
+                        self._check_toc_candidate()
+
+                    if self._is_toc_existing:
+                        break
 
         utils.progress_msg_line_type_toc(f"LineTypeToc: End   page (lines)                   ={self._page_no}")
 

diff --git a/src/dcr/nlp/cls_text_parser.py b/src/dcr/nlp/cls_text_parser.py
@@ -168,7 +168,9 @@ def __init__(self) -> None:
 
         self._parse_result_table = False
         self._parse_result_table_cell = 0
+        self._parse_result_table_cell_is_empty = True
         self._parse_result_table_col_span = 0
+        self._parse_result_table_col_span_prev = 1
         self._parse_result_table_row = 0
         self._parse_result_text = ""
 
@@ -544,15 +546,59 @@ def _parse_tag_cell(self, parent_tag: str, parent: collections.abc.Iterable[str]
         """
         self._debug_xml_element_all("Start", parent_tag, parent.attrib, parent.text)
 
-        self._parse_result_table_cell += 1
+        self._parse_result_table_cell_is_empty = True
+
+        self._parse_result_table_cell += self._parse_result_table_col_span_prev
+
         self._parse_result_table_col_span = parent.attrib.get(nlp.cls_nlp_core.NLPCore.PARSE_ATTR_COL_SPAN)
 
+        if self._parse_result_table_col_span:
+            self._parse_result_table_col_span_prev = int(self._parse_result_table_col_span)
+        else:
+            self._parse_result_table_col_span_prev = 1
+
         for child in parent:
             child_tag = child.tag[nlp.cls_nlp_core.NLPCore.PARSE_ELEM_FROM :]
             match child_tag:
                 case nlp.cls_nlp_core.NLPCore.PARSE_ELEM_PARA:
+                    self._parse_result_table_cell_is_empty = False
                     self._parse_tag_para(child_tag, child)
 
+        if self._parse_result_table_cell_is_empty:
+            self._parse_result_line_llx = float(parent.attrib.get(nlp.cls_nlp_core.NLPCore.PARSE_ATTR_LLX))
+            self._parse_result_line_urx = float(parent.attrib.get(nlp.cls_nlp_core.NLPCore.PARSE_ATTR_URX))
+            if self._parse_result_table_col_span:
+                self.parse_result_line_lines.append(
+                    {
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COLUMN_NO: self._parse_result_table_cell,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COLUMN_SPAN: int(self._parse_result_table_col_span),
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX: self._parse_result_line_llx,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_URX: self._parse_result_line_urx,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_INDEX_PAGE: self._parse_result_line_index_page,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_INDEX_PARA: self._parse_result_line_index_para,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_NO: self._parse_result_no_lines_in_para,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE: db.cls_document.Document.DOCUMENT_LINE_TYPE_BODY,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_PARA_NO: self._parse_result_no_paras_in_page,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_ROW_NO: self._parse_result_table_row,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT: "",
+                    }
+                )
+            else:
+                self.parse_result_line_lines.append(
+                    {
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COLUMN_NO: self._parse_result_table_cell,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_LLX: self._parse_result_line_llx,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_COORD_URX: self._parse_result_line_urx,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_INDEX_PAGE: self._parse_result_line_index_page,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_INDEX_PARA: self._parse_result_line_index_para,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_NO: self._parse_result_no_lines_in_para,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE: db.cls_document.Document.DOCUMENT_LINE_TYPE_BODY,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_PARA_NO: self._parse_result_no_paras_in_page,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_ROW_NO: self._parse_result_table_row,
+                        nlp.cls_nlp_core.NLPCore.JSON_NAME_TEXT: "",
+                    }
+                )
+
         self._debug_xml_element_all("End  ", parent_tag, parent.attrib, parent.text)
 
     # -----------------------------------------------------------------------------
@@ -837,6 +883,7 @@ def _parse_tag_row(self, parent_tag: str, parent: collections.abc.Iterable[str])
 
         self._parse_result_table_row += 1
         self._parse_result_table_cell = 0
+        self._parse_result_table_col_span_prev = 1
 
         for child in parent:
             child_tag = child.tag[nlp.cls_nlp_core.NLPCore.PARSE_ELEM_FROM :]

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -553,11 +553,11 @@ def fxtr_before_any_test():
         (cfg.cls_setup.Setup._DCR_CFG_DIRECTORY_INBOX_ACCEPTED, "data/inbox_test_accepted"),
         (cfg.cls_setup.Setup._DCR_CFG_DIRECTORY_INBOX_REJECTED, "data/inbox_test_rejected"),
         (cfg.cls_setup.Setup._DCR_CFG_DOC_ID_IN_FILE_NAME, "after"),
+        (cfg.cls_setup.Setup._DCR_CFG_HEADING_FILE_INCL_NO_CTX, "3"),
+        (cfg.cls_setup.Setup._DCR_CFG_HEADING_FILE_INCL_REGEXP, "false"),
         (cfg.cls_setup.Setup._DCR_CFG_HEADING_MAX_LEVEL, "3"),
         (cfg.cls_setup.Setup._DCR_CFG_HEADING_MIN_PAGES, "2"),
         (cfg.cls_setup.Setup._DCR_CFG_HEADING_RULE_FILE, "data/line_type_heading_rules_test.json"),
-        (cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_INCL_NO_CTX, "3"),
-        (cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_INCL_REGEXP, "false"),
         (cfg.cls_setup.Setup._DCR_CFG_HEADING_TOLERANCE_X, "5"),
         (cfg.cls_setup.Setup._DCR_CFG_IGNORE_DUPLICATES, "false"),
         (cfg.cls_setup.Setup._DCR_CFG_INITIAL_DATABASE_DATA, "data/initial_database_data_test.json"),

diff --git a/tests/test_all.py b/tests/test_all.py
@@ -987,6 +987,7 @@ def check_db_content_version() -> None:
 # -----------------------------------------------------------------------------
 # Test RUN_ACTION_PROCESS_ALL_COMPLETE - delete_auxiliary_files = true.
 # -----------------------------------------------------------------------------
+@pytest.mark.issue
 def test_run_action_process_all_complete_auxiliary_deleted(fxtr_setup_empty_db_and_inbox):
     """Test RUN_ACTION_PROCESS_ALL_COMPLETE - delete_auxiliary_files = true."""
     cfg.glob.logger.debug(cfg.glob.LOGGER_START)