Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
Determining the headings.
Browse files: browse the repository at this point in the history
  • Loading branch information
walter-weinmann committed Jun 23, 2022
1 parent a1dc9cd commit 9762deb
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 105 deletions.
40 changes: 20 additions & 20 deletions data/line_type_heading_rules.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@
"(A)"
]
},
{
"name": "a.",
"isFirstToken": true,
"regexp": "[a-z]\\.$",
"functionIsAsc": "lowercase_letters",
"startValues": [
"a",
"a."
]
},
{
"name": "A.",
"isFirstToken": true,
"regexp": "[A-Z]\\.$",
"functionIsAsc": "uppercase_letters",
"startValues": [
"A",
"A."
]
},
{
"name": "999.",
"isFirstToken": true,
Expand All @@ -43,26 +63,6 @@
"functionIsAsc": "string_floats",
"startValues": []
},
{
"name": "a.",
"isFirstToken": true,
"regexp": "[a-z]\\.$",
"functionIsAsc": "lowercase_letters",
"startValues": [
"a",
"a."
]
},
{
"name": "A.",
"isFirstToken": true,
"regexp": "[A-Z]\\.$",
"functionIsAsc": "uppercase_letters",
"startValues": [
"A",
"A."
]
},
{
"name": "(rom)",
"isFirstToken": true,
Expand Down
54 changes: 27 additions & 27 deletions data/line_type_heading_rules_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@
"(A)"
]
},
{
"name": "a.",
"isFirstToken": true,
"regexp": "[a-z]\\.$",
"functionIsAsc": "lowercase_letters",
"startValues": [
"a",
"a."
]
},
{
"name": "A.",
"isFirstToken": true,
"regexp": "[A-Z]\\.$",
"functionIsAsc": "uppercase_letters",
"startValues": [
"A",
"A."
]
},
{
"name": "Article 999:",
"isFirstToken": false,
Expand Down Expand Up @@ -58,6 +78,13 @@
"isFirstToken": true,
"regexp": "Riders-\\d$",
"functionIsAsc": "string_integers",
"startValues": ["Riders-1"]
},
{
"name": "Section 999.999.",
"isFirstToken": false,
"regexp": "Section \\d+\\.\\d+\\.",
"functionIsAsc": "string_floats",
"startValues": []
},
{
Expand All @@ -78,40 +105,13 @@
"(1)"
]
},
{
"name": "Section 999.999.",
"isFirstToken": false,
"regexp": "Section \\d+\\.\\d+\\.",
"functionIsAsc": "string_floats",
"startValues": []
},
{
"name": "999.999",
"isFirstToken": true,
"regexp": "\\d+\\.\\d+\\.?$",
"functionIsAsc": "string_floats",
"startValues": []
},
{
"name": "a.",
"isFirstToken": true,
"regexp": "[a-z]\\.$",
"functionIsAsc": "lowercase_letters",
"startValues": [
"a",
"a."
]
},
{
"name": "A.",
"isFirstToken": true,
"regexp": "[A-Z]\\.$",
"functionIsAsc": "uppercase_letters",
"startValues": [
"A",
"A."
]
},
{
"name": "(rom)",
"isFirstToken": true,
Expand Down
67 changes: 9 additions & 58 deletions src/dcr/nlp/cls_line_type_heading.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,46 +259,18 @@ def _init_heading_rules(self) -> list[tuple[str, bool, str, collections.abc.Call
["(A)"],
),
(
"Article 999:",
False,
r"Article \d+:?",
self._is_asc_string_integers,
[],
),
(
"ARTICLE 999 -",
False,
r"ARTICLE \d+ -",
self._is_asc_string_integers,
[],
),
(
"ARTICLE XXX-YYY:",
False,
r"ARTICLE [A-Z]+(-[A-Z]+)?:",
self._is_asc_ignore,
[],
),
(
"EXHIBIT A",
False,
r"EXHIBIT [A-Z]$",
self._is_asc_strings,
[],
),
(
'EXHIBIT "A" ',
False,
r"EXHIBIT \u201c[A-Z]\u201d$",
self._is_asc_strings,
[],
"a.",
True,
r"[a-z]\.$",
self._is_asc_lowercase_letters,
["a", "a."],
),
(
"Riders-9",
"A.",
True,
r"Riders-\d$",
self._is_asc_string_integers,
[],
r"[A-Z]\.$",
self._is_asc_uppercase_letters,
["A", "A."],
),
(
"999.",
Expand All @@ -314,34 +286,13 @@ def _init_heading_rules(self) -> list[tuple[str, bool, str, collections.abc.Call
self._is_asc_string_integers,
["(1)"],
),
(
"Section 999.999.",
False,
r"Section \d+\.\d+\.",
self._is_asc_string_floats,
[],
),
(
"999.999",
True,
r"\d+\.\d+\.?$",
self._is_asc_string_floats,
[],
),
(
"a.",
True,
r"[a-z]\.$",
self._is_asc_lowercase_letters,
["a", "a."],
),
(
"A.",
True,
r"[A-Z]\.$",
self._is_asc_uppercase_letters,
["A", "A."],
),
(
"(rom)",
True,
Expand Down

0 comments on commit 9762deb

Please sign in to comment.