Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
Determining the numbered lists.
Browse files: browse the repository at this point in the history
  • Loading branch information
walter-weinmann committed Jul 7, 2022
1 parent b575104 commit db71763
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 4 deletions.
80 changes: 80 additions & 0 deletions data/lt_export_rule_list_number.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@
"1)"
]
},
{
"name": "999.",
"regexp": "\\d+\\.",
"functionIsAsc": "string_integers",
"startValues": [
"1."
]
},
{
"name": "999.999",
"regexp": "\\d+\\.\\d{1,3}$",
Expand All @@ -107,6 +115,14 @@
"A)"
]
},
{
"name": "A.",
"regexp": "[A-Z]\\.",
"functionIsAsc": "uppercase_letters",
"startValues": [
"A."
]
},
{
"name": "ROM)",
"regexp": "M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\)$",
Expand All @@ -115,6 +131,14 @@
"I)"
]
},
{
"name": "ROM.",
"regexp": "M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\.",
"functionIsAsc": "romans",
"startValues": [
"I."
]
},
{
"name": "a)",
"regexp": "[a-z]\\)$",
Expand All @@ -123,13 +147,69 @@
"a)"
]
},
{
"name": "a.",
"regexp": "[a-z]\\.",
"functionIsAsc": "lowercase_letters",
"startValues": [
"a."
]
},
{
"name": "rom)",
"regexp": "m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\\)$",
"functionIsAsc": "romans",
"startValues": [
"i)"
]
},
{
"name": "rom.",
"regexp": "m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\\.",
"functionIsAsc": "romans",
"startValues": [
"i."
]
},
{
"name": "999",
"regexp": "\\d+[ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "string_integers_token",
"startValues": [
"1 "
]
},
{
"name": "A",
"regexp": "[A-Z][ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "uppercase_letters_token",
"startValues": [
"A "
]
},
{
"name": "ROM",
"regexp": "M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})[ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "romans_token",
"startValues": [
"I "
]
},
{
"name": "a",
"regexp": "[a-z][ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "lowercase_letters_token",
"startValues": [
"a "
]
},
{
"name": "rom",
"regexp": "m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})[ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "romans_token",
"startValues": [
"i "
]
}
]
}
80 changes: 80 additions & 0 deletions data/lt_export_rule_list_number_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@
"1)"
]
},
{
"name": "999.",
"regexp": "\\d+\\.",
"functionIsAsc": "string_integers",
"startValues": [
"1."
]
},
{
"name": "999.999",
"regexp": "\\d+\\.\\d{1,3}$",
Expand All @@ -107,6 +115,14 @@
"A)"
]
},
{
"name": "A.",
"regexp": "[A-Z]\\.",
"functionIsAsc": "uppercase_letters",
"startValues": [
"A."
]
},
{
"name": "ROM)",
"regexp": "M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\)$",
Expand All @@ -115,6 +131,14 @@
"I)"
]
},
{
"name": "ROM.",
"regexp": "M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\.",
"functionIsAsc": "romans",
"startValues": [
"I."
]
},
{
"name": "a)",
"regexp": "[a-z]\\)$",
Expand All @@ -123,13 +147,69 @@
"a)"
]
},
{
"name": "a.",
"regexp": "[a-z]\\.",
"functionIsAsc": "lowercase_letters",
"startValues": [
"a."
]
},
{
"name": "rom)",
"regexp": "m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\\)$",
"functionIsAsc": "romans",
"startValues": [
"i)"
]
},
{
"name": "rom.",
"regexp": "m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\\.",
"functionIsAsc": "romans",
"startValues": [
"i."
]
},
{
"name": "999",
"regexp": "\\d+[ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "string_integers_token",
"startValues": [
"1 "
]
},
{
"name": "A",
"regexp": "[A-Z][ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "uppercase_letters_token",
"startValues": [
"A "
]
},
{
"name": "ROM",
"regexp": "M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})[ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "romans_token",
"startValues": [
"I "
]
},
{
"name": "a",
"regexp": "[a-z][ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "lowercase_letters_token",
"startValues": [
"a "
]
},
{
"name": "rom",
"regexp": "m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})[ ]+[A-Z][a-zA-Z]+",
"functionIsAsc": "romans_token",
"startValues": [
"i "
]
}
]
}
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ directory_inbox = data/inbox_dev
directory_inbox_accepted = data/inbox_dev_accepted
directory_inbox_rejected = data/inbox_dev_rejected
lt_heading_file_incl_regexp = true
lt_heading_rule_file = data/lt_export_rule_heading_test.json
lt_list_bullet_rule_file = data/lt_export_rule_list_bullet_test.json
lt_list_number_file_incl_regexp = true
lt_list_number_rule_file = data/lt_export_rule_list_number_test.json
tetml_page = true
tetml_word = true
verbose_lt_heading = true
Expand Down
2 changes: 1 addition & 1 deletion src/dcr/nlp/cls_line_type_list_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def _load_rules_from_json(
with open(lt_list_number_rule_file, "r", encoding=cfg.glob.FILE_ENCODING_DEFAULT) as file_handle:
json_data = json.load(file_handle)

for rule in json_data[nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE_HEADING_RULES]:
for rule in json_data[nlp.cls_nlp_core.NLPCore.JSON_NAME_LINE_TYPE_LIST_NUMBER_RULES]:
rules.append(
(
rule[nlp.cls_nlp_core.NLPCore.JSON_NAME_NAME],
Expand Down
5 changes: 2 additions & 3 deletions src/dcr/nlp/cls_nlp_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,13 +698,12 @@ def get_lt_rules_default_list_number() -> list[tuple[str, str, collections.abc.C

for (
rule_name,
is_first_token,
_,
regexp_str,
function_is_asc,
start_values,
) in NLPCore._get_lt_rules_default_heading_list_number():
if is_first_token:
rules.append((rule_name, regexp_str, function_is_asc, start_values))
rules.append((rule_name, regexp_str, function_is_asc, start_values))

return rules

Expand Down

0 comments on commit db71763

Please sign in to comment.