Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
Determining the headings.
Browse files: browse the repository at this point in the history.
  • Loading branch information
walter-weinmann committed Jun 24, 2022
1 parent 0758497 commit 866fc36
Show file tree
Hide file tree
Showing 9 changed files with 550 additions and 1,625 deletions.
168 changes: 85 additions & 83 deletions docs/running_configuration.md

Large diffs are not rendered by default.

22 changes: 10 additions & 12 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ toc_last_page = 5
toc_min_entries = 5
tokenize_2_database = true
tokenize_2_jsonfile = true
tokenize_footers = false
tokenize_headers = false
tokenize_toc = false
verbose = true
verbose_line_type_headers_footers = false
verbose_line_type_heading = false
Expand Down Expand Up @@ -84,9 +81,6 @@ tetml_page = true
tetml_word = true
tokenize_2_database = true
tokenize_2_jsonfile = true
tokenize_footers = false
tokenize_headers = false
tokenize_toc = false
verbose = true
verbose_line_type_headers_footers = false
verbose_line_type_heading = false
Expand Down Expand Up @@ -134,9 +128,6 @@ toc_last_page = 5
toc_min_entries = 5
tokenize_2_database = true
tokenize_2_jsonfile = false
tokenize_footers = false
tokenize_headers = false
tokenize_toc = false
verbose = true
verbose_line_type_headers_footers = false
verbose_line_type_heading = false
Expand All @@ -146,6 +137,13 @@ verbose_parser = none
[dcr.spacy]
spacy_ignore_bracket = true
spacy_ignore_left_punct = true
spacy_ignore_line_type_footer = true
spacy_ignore_line_type_header = true
spacy_ignore_line_type_heading = false
spacy_ignore_line_type_list_bulleted = false
spacy_ignore_line_type_list_numbered = false
spacy_ignore_line_type_table = false
spacy_ignore_line_type_toc = true
spacy_ignore_punct = true
spacy_ignore_quote = true
spacy_ignore_right_punct = true
Expand All @@ -154,7 +152,7 @@ spacy_ignore_stop = true
spacy_tkn_attr_cluster = false
spacy_tkn_attr_dep_ = false
spacy_tkn_attr_doc = false
spacy_tkn_attr_ent_iob_ = true
spacy_tkn_attr_ent_iob_ = false
spacy_tkn_attr_ent_kb_id_ = false
spacy_tkn_attr_ent_type_ = true
spacy_tkn_attr_head = false
Expand All @@ -171,8 +169,8 @@ spacy_tkn_attr_is_oov = true
spacy_tkn_attr_is_punct = true
spacy_tkn_attr_is_quote = false
spacy_tkn_attr_is_right_punct = false
spacy_tkn_attr_is_sent_end = true
spacy_tkn_attr_is_sent_start = true
spacy_tkn_attr_is_sent_end = false
spacy_tkn_attr_is_sent_start = false
spacy_tkn_attr_is_space = false
spacy_tkn_attr_is_stop = true
spacy_tkn_attr_is_title = true
Expand Down
1,284 changes: 433 additions & 851 deletions src/dcr/cfg/cls_setup.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/dcr/db/cls_db_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def _connect_db_user(self) -> tuple[Engine, MetaData]:
cfg.glob.setup.db_connection_prefix
+ cfg.glob.setup.db_host
+ ":"
+ cfg.glob.setup.db_connection_port
+ str(cfg.glob.setup.db_connection_port)
+ "/"
+ self._db_current_database
+ "?user="
Expand Down
14 changes: 11 additions & 3 deletions src/dcr/nlp/cls_tokenizer_spacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,11 +697,19 @@ def _process_page(self) -> None:

if (
line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_FOOTER # pylint: disable=too-many-boolean-expressions
and not cfg.glob.setup.is_tokenize_footers
and cfg.glob.setup.is_spacy_ignore_line_type_footer
or line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_HEADER
and not cfg.glob.setup.is_tokenize_headers
and cfg.glob.setup.is_spacy_ignore_line_type_header
or line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_HEADING
and cfg.glob.setup.is_spacy_ignore_line_type_heading
or line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_LIST_BULLETED
and cfg.glob.setup.is_spacy_ignore_line_type_list_bulleted
or line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_LIST_NUMBERED
and cfg.glob.setup.is_spacy_ignore_line_type_list_numbered
or line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_TABLE
and cfg.glob.setup.is_spacy_ignore_line_type_table
or line_type == cfg.glob.document.DOCUMENT_LINE_TYPE_TOC
and not cfg.glob.setup.is_tokenize_toc
and cfg.glob.setup.is_spacy_ignore_line_type_toc
):
continue

Expand Down
14 changes: 8 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,15 +548,18 @@ def fxtr_before_any_test():
(cfg.cls_setup.Setup._DCR_CFG_DIRECTORY_INBOX, "data/inbox_test"),
(cfg.cls_setup.Setup._DCR_CFG_DIRECTORY_INBOX_ACCEPTED, "data/inbox_test_accepted"),
(cfg.cls_setup.Setup._DCR_CFG_DIRECTORY_INBOX_REJECTED, "data/inbox_test_rejected"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_CREATE, "true"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_INCL_NO_CTX, "3"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_INCL_REGEXP, "false"),
(cfg.cls_setup.Setup._DCR_CFG_DOC_ID_IN_FILE_NAME, "after"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_MAX_LEVEL, "3"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_MIN_PAGES, "2"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_RULE_FILE, "data/line_type_heading_rules_test.json"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_CREATE, "true"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_INCL_NO_CTX, "3"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOC_INCL_REGEXP, "false"),
(cfg.cls_setup.Setup._DCR_CFG_HEADING_TOLERANCE_X, "5"),
(cfg.cls_setup.Setup._DCR_CFG_IGNORE_DUPLICATES, "false"),
(cfg.cls_setup.Setup._DCR_CFG_INITIAL_DATABASE_DATA, "data/initial_database_data_test.json"),
(cfg.cls_setup.Setup._DCR_CFG_JSON_INDENT, "4"),
(cfg.cls_setup.Setup._DCR_CFG_JSON_SORT_KEYS, "false"),
(cfg.cls_setup.Setup._DCR_CFG_LINE_FOOTER_MAX_DISTANCE, "3"),
(cfg.cls_setup.Setup._DCR_CFG_LINE_FOOTER_MAX_LINES, "3"),
(cfg.cls_setup.Setup._DCR_CFG_LINE_HEADER_MAX_DISTANCE, "3"),
Expand All @@ -565,11 +568,10 @@ def fxtr_before_any_test():
(cfg.cls_setup.Setup._DCR_CFG_TESSERACT_TIMEOUT, "30"),
(cfg.cls_setup.Setup._DCR_CFG_TETML_PAGE, "false"),
(cfg.cls_setup.Setup._DCR_CFG_TETML_WORD, "false"),
(cfg.cls_setup.Setup._DCR_CFG_TOC_LAST_PAGE, "5"),
(cfg.cls_setup.Setup._DCR_CFG_TOC_MIN_ENTRIES, "5"),
(cfg.cls_setup.Setup._DCR_CFG_TOKENIZE_2_DATABASE, "true"),
(cfg.cls_setup.Setup._DCR_CFG_TOKENIZE_2_JSONFILE, "false"),
(cfg.cls_setup.Setup._DCR_CFG_TOKENIZE_FOOTERS, "false"),
(cfg.cls_setup.Setup._DCR_CFG_TOKENIZE_HEADERS, "false"),
(cfg.cls_setup.Setup._DCR_CFG_TOKENIZE_TOC, "false"),
(cfg.cls_setup.Setup._DCR_CFG_VERBOSE, "true"),
(cfg.cls_setup.Setup._DCR_CFG_VERBOSE_LINE_TYPE_HEADERS_FOOTERS, "false"),
(cfg.cls_setup.Setup._DCR_CFG_VERBOSE_LINE_TYPE_HEADING, "false"),
Expand Down
Loading

0 comments on commit 866fc36

Please sign in to comment.