Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
Determining the numbered lists.
Browse files Browse the repository at this point in the history
  • Loading branch information
walter-weinmann committed Jul 5, 2022
1 parent 7fbd0ae commit 48a084f
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 13 deletions.
2 changes: 2 additions & 0 deletions docs/running_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ The customisable entries are:
lt_list_bullet_min_entries = 2
lt_list_bullet_rule_file = none
lt_list_bullet_tolerance_llx = 5
lt_list_number_file_incl_regexp = false
lt_list_number_min_entries = 2
lt_list_number_rule_file = none
lt_list_number_tolerance_llx = 5
Expand Down Expand Up @@ -220,6 +221,7 @@ The customisable entries are:
| lt_list_bullet_min_entries | **`2`** | Minimum number of entries to determine a bulleted list. |
| lt_list_bullet_rule_file | **`data/line_type_list_bullet_rules.json`** | File with rules to determine the bulleted lists. |
| lt_list_bullet_tolerance_llx | **`5`** | Tolerance of vertical indentation in percent. |
| lt_list_number_file_incl_regexp | **`false`** | If set to **`true`**, the regular expression of the rule that identified a numbered list is included in that list's entry in the **`JSON`** file. |
| lt_list_number_min_entries | **`2`** | Minimum number of entries to determine a numbered list. |
| lt_list_number_rule_file | **`data/line_type_list_number_rules.json`** | File with rules to determine the numbered lists. |
| lt_list_number_tolerance_llx | **`5`** | Tolerance of vertical indentation in percent. |
Expand Down
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ lt_heading_tolerance_llx = 5
lt_list_bullet_min_entries = 2
lt_list_bullet_rule_file = none
lt_list_bullet_tolerance_llx = 5
lt_list_number_file_incl_regexp = false
lt_list_number_min_entries = 2
lt_list_number_rule_file = none
lt_list_number_tolerance_llx = 5
Expand All @@ -66,10 +67,12 @@ db_connection_port = 5433
db_database = dcr_db_dev
db_database_admin = dcr_db_dev_admin
db_initial_data_file = data/db_initial_data_file_dev.json
delete_auxiliary_files = false
directory_inbox = data/inbox_dev
directory_inbox_accepted = data/inbox_dev_accepted
directory_inbox_rejected = data/inbox_dev_rejected
lt_heading_file_incl_regexp = true
lt_list_number_file_incl_regexp = true

[dcr.env.test]
create_extra_file_list_bullet = true
Expand Down Expand Up @@ -110,6 +113,7 @@ lt_heading_tolerance_llx = 5
lt_list_bullet_min_entries = 2
lt_list_bullet_rule_file = data/line_type_list_bullet_rules_test.json
lt_list_bullet_tolerance_llx = 5
lt_list_number_file_incl_regexp = false
lt_list_number_min_entries = 2
lt_list_number_rule_file = data/line_type_list_number_rules_test.json
lt_list_number_tolerance_llx = 5
Expand Down
10 changes: 9 additions & 1 deletion src/dcr/cfg/cls_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class Setup:
# -----------------------------------------------------------------------------
# Class variables.
# -----------------------------------------------------------------------------
_CONFIG_PARAM_NO = 125
_CONFIG_PARAM_NO = 126

_DCR_CFG_CREATE_EXTRA_FILE_LIST_BULLET: ClassVar[str] = "create_extra_file_list_bullet"
_DCR_CFG_CREATE_EXTRA_FILE_LIST_NUMBER: ClassVar[str] = "create_extra_file_list_number"
Expand Down Expand Up @@ -61,6 +61,7 @@ class Setup:
_DCR_CFG_LT_LIST_BULLET_MIN_ENTRIES: ClassVar[str] = "lt_list_bullet_min_entries"
_DCR_CFG_LT_LIST_BULLET_RULE_FILE: ClassVar[str] = "lt_list_bullet_rule_file"
_DCR_CFG_LT_LIST_BULLET_TOLERANCE_LLX: ClassVar[str] = "lt_list_bullet_tolerance_llx"
_DCR_CFG_LT_LIST_NUMBER_FILE_INCL_REGEXP: ClassVar[str] = "lt_list_number_file_incl_regexp"
_DCR_CFG_LT_LIST_NUMBER_MIN_ENTRIES: ClassVar[str] = "lt_list_number_min_entries"
_DCR_CFG_LT_LIST_NUMBER_RULE_FILE: ClassVar[str] = "lt_list_number_rule_file"
_DCR_CFG_LT_LIST_NUMBER_TOLERANCE_LLX: ClassVar[str] = "lt_list_number_tolerance_llx"
Expand Down Expand Up @@ -228,6 +229,9 @@ def __init__(self) -> None:
self.lt_list_bullet_min_entries = 2
self.lt_list_bullet_rule_file = "none"
self.lt_list_bullet_tolerance_llx = 5

self.is_lt_list_number_file_incl_regexp = False

self.lt_list_number_min_entries = 2
self.lt_list_number_rule_file = "none"
self.lt_list_number_tolerance_llx = 5
Expand Down Expand Up @@ -412,6 +416,9 @@ def _check_config(self) -> None:
self.lt_list_bullet_tolerance_llx = self._determine_config_param_integer(
Setup._DCR_CFG_LT_LIST_BULLET_TOLERANCE_LLX, self.lt_list_bullet_tolerance_llx
)
self.is_lt_list_number_file_incl_regexp = self._determine_config_param_boolean(
Setup._DCR_CFG_LT_LIST_NUMBER_FILE_INCL_REGEXP, self.is_lt_list_number_file_incl_regexp
)
self.lt_list_number_min_entries = self._determine_config_param_integer(
Setup._DCR_CFG_LT_LIST_NUMBER_MIN_ENTRIES, self.lt_list_number_min_entries
)
Expand Down Expand Up @@ -898,6 +905,7 @@ def _load_config(self) -> None:
| Setup._DCR_CFG_LT_HEADING_TOLERANCE_LLX
| Setup._DCR_CFG_LT_LIST_BULLET_MIN_ENTRIES
| Setup._DCR_CFG_LT_LIST_BULLET_TOLERANCE_LLX
| Setup._DCR_CFG_LT_LIST_NUMBER_FILE_INCL_REGEXP
| Setup._DCR_CFG_LT_LIST_NUMBER_MIN_ENTRIES
| Setup._DCR_CFG_LT_LIST_NUMBER_TOLERANCE_LLX
| Setup._DCR_CFG_LT_TABLE_FILE_INCL_EMPTY_COLUMNS
Expand Down
29 changes: 17 additions & 12 deletions src/dcr/nlp/cls_line_type_list_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
Entry = dict[str, int | str]
Entries = list[Entry]

List = dict[str, Entries | float | int | str]
List = dict[str, Entries | float | int | object | str]
Lists = list[List]

RuleExtern = tuple[str, str, collections.abc.Callable[[str, str], bool], list[str]]
Expand Down Expand Up @@ -182,18 +182,23 @@ def _finish_list(self) -> None:
# "noEntries": 99,
# "pageNoFrom": 99,
# "pageNoTill": 99,
# "regexp": "xxx",
# "entries": []
# },
self._lists.append(
{
nlp.cls_nlp_core.NLPCore.JSON_NAME_NUMBER: self._rule[0].rstrip(),
nlp.cls_nlp_core.NLPCore.JSON_NAME_LIST_NO: self.no_lists,
nlp.cls_nlp_core.NLPCore.JSON_NAME_NO_ENTRIES: len(entries),
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_FROM: int(self._entries[0][0]) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_TILL: int(self._entries[-1][0]) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_ENTRIES: entries,
}
)
# }
entry = {
nlp.cls_nlp_core.NLPCore.JSON_NAME_NUMBER: self._rule[0].rstrip(),
nlp.cls_nlp_core.NLPCore.JSON_NAME_LIST_NO: self.no_lists,
nlp.cls_nlp_core.NLPCore.JSON_NAME_NO_ENTRIES: len(entries),
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_FROM: int(self._entries[0][0]) + 1,
nlp.cls_nlp_core.NLPCore.JSON_NAME_PAGE_NO_TILL: int(self._entries[-1][0]) + 1,
}

if cfg.glob.setup.is_lt_list_number_file_incl_regexp:
entry[nlp.cls_nlp_core.NLPCore.JSON_NAME_REGEXP] = self._rule[-1]

entry[nlp.cls_nlp_core.NLPCore.JSON_NAME_ENTRIES] = entries

self._lists.append(entry)

self._reset_list()

Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,7 @@ def fxtr_before_any_test():
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_BULLET_MIN_ENTRIES, "2"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_BULLET_RULE_FILE, "data/line_type_list_bullet_rules_test.json"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_BULLET_TOLERANCE_LLX, "5"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_NUMBER_FILE_INCL_REGEXP, "false"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_NUMBER_MIN_ENTRIES, "2"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_NUMBER_RULE_FILE, "data/line_type_list_number_rules_test.json"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_NUMBER_TOLERANCE_LLX, "5"),
Expand Down
1 change: 1 addition & 0 deletions tests/test_nlp_cls_line_type_list_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_line_type_list_number_1(fxtr_rmdir_opt, fxtr_setup_empty_db_and_inbox):
(cfg.cls_setup.Setup._DCR_CFG_LT_HEADING_FILE_INCL_NO_CTX, "3"),
(cfg.cls_setup.Setup._DCR_CFG_LT_HEADING_FILE_INCL_REGEXP, "true"),
(cfg.cls_setup.Setup._DCR_CFG_LT_HEADING_RULE_FILE, "none"),
(cfg.cls_setup.Setup._DCR_CFG_LT_LIST_NUMBER_FILE_INCL_REGEXP, "true"),
(cfg.cls_setup.Setup._DCR_CFG_TETML_PAGE, "true"),
(cfg.cls_setup.Setup._DCR_CFG_TETML_WORD, "true"),
(cfg.cls_setup.Setup._DCR_CFG_VERBOSE_LT_HEADING, "true"),
Expand Down

0 comments on commit 48a084f

Please sign in to comment.