address_features.py
"""
Features extraction for addresses detecting classifier.
"""
import json
import os
import re
import string
from datetime import datetime
from email.utils import parseaddr
from typing import List

import nltk
import pycountry
import us
from dateutil import parser as dateparser

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2018, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.1.6"
__maintainer__ = "LexPredict, LLC"
__email__ = "support@contraxsuite.com"

cwd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')

def _norm(s: str) -> str:
    """Normalize a token for case-insensitive dictionary lookups."""
    return s.upper()


def _load_set_from_lines(fn: str, normalize: bool = False) -> set:
    """Load a data file into a set of stripped (optionally normalized) lines."""
    with open(os.path.join(cwd, fn), 'r') as f:
        lines = (line.strip() for line in f)
        return {_norm(line) for line in lines} if normalize else set(lines)


STREET_SUFFIXES = _load_set_from_lines('street_suffixes.csv', normalize=True)
BUILDING_SUFFIXES = _load_set_from_lines('building_suffixes.csv', normalize=True)
STREET_DIRECTIONS = _load_set_from_lines('street_directions.csv', normalize=True)

DATE_MIN = datetime(1600, 1, 1)
DATE_MAX = datetime(2300, 1, 1)


def is_datetime(word: str) -> bool:
    """Check whether a token parses as a date within a plausible range."""
    if len(word) < 6:
        return False
    try:
        dt = dateparser.parse(word)
        return DATE_MIN <= dt <= DATE_MAX
    except (ValueError, OverflowError):
        return False


URL_REGEX = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IP
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def is_url(word: str) -> bool:
    return URL_REGEX.fullmatch(word) is not None


def is_single_initial(word: str) -> bool:
    return len(word) == 2 and word[0].isupper() and word[1] == '.'


def is_uppercase_char(word: str) -> bool:
    return len(word) == 1 and word[0].isupper()


def is_lowercase_char(word: str) -> bool:
    return len(word) == 1 and word[0].islower()


def is_email(word: str) -> bool:
    # parseaddr() always returns a (name, address) 2-tuple, which is truthy even
    # on failure, so the parsed address component itself must contain '@'.
    return '@' in word and '@' in parseaddr(word)[1]


ZIP_CODE = re.compile(r'^[0-9]{5}(?:-[0-9]{4})?$')


def is_zip_code(s: str) -> bool:
    return bool(ZIP_CODE.fullmatch(s))


def build_country_words():
    """Collect tokens from country names and ISO codes via pycountry."""
    words = set()
    for c in pycountry.countries:
        if hasattr(c, 'alpha_2'):
            words.update(_norm(c.alpha_2).split(' '))
        if hasattr(c, 'alpha_3'):
            words.update(_norm(c.alpha_3).split(' '))
        if hasattr(c, 'name'):
            words.update(_norm(c.name).split(' '))
        if hasattr(c, 'official_name'):
            words.update(_norm(c.official_name).split(' '))
    # Drop empty strings and connector words too common to be informative.
    words.discard('')
    words.discard('AND')
    words.discard('OF')
    return words


def build_provinces_words():
    """Collect tokens from province/region names in the bundled data file."""
    res = set()
    for province in _load_set_from_lines('provinces.txt', normalize=True):
        res.update(province.split(' '))
    res.discard('')
    res.discard(' ')
    res.add('OBLAST')
    return res


POS_TAG_SET_INDEX_FN = os.path.join(cwd, 'nltk_pos_tag_indexes.json')
with open(POS_TAG_SET_INDEX_FN, 'r') as _f:
    POS_TAG_SET_INDEX = json.load(_f)

COUNTRY_WORDS = build_country_words()
PROVINCES_WORDS = build_provinces_words()

FEATURE_WORD_LEN = 23
ZERO_FEATURES = [0] * FEATURE_WORD_LEN


def get_word_features(word: str, part_of_speech: str) -> List[int]:
    """Build the numeric feature vector for a single token."""
    if not word:
        # Return a copy so callers cannot mutate the shared zero template.
        return list(ZERO_FEATURES)
    word_norm = _norm(word)
    word_no_dots = word_norm.strip('.')
    res = [
        POS_TAG_SET_INDEX.get(part_of_speech or '', 0),  # part of speech
        int(word.istitle()),  # init_cap
        int(word.isupper()),  # all_caps
        int(any(ch.isdigit() for ch in word)),  # contains_digits
        int(all(ch.isdigit() for ch in word)),  # all_digits
        int(all(ch == '.' or ch.isupper() for ch in word)),  # acronym
        int(all(ch in string.punctuation for ch in word)),  # punctuation
        int(is_datetime(word)),  # datetime
        int(is_url(word)),  # url
        int("'" in word),  # contraction
        int(is_single_initial(word)),  # single_initial
        int(is_uppercase_char(word)),  # uppercase_char
        int(is_lowercase_char(word)),  # lowercase_char
        int('-' in word),  # contains_dash
        int(len(word) > 5 and all(ch.isdigit() or ch in ' -()' for ch in word)),  # phone
        int(is_email(word)),  # email
        int(bool(us.states.lookup(word))),  # us_state
        int(word_no_dots in STREET_SUFFIXES),  # street_suffix
        int(word_no_dots in BUILDING_SUFFIXES),  # building_suffix
        int(word_norm in STREET_DIRECTIONS),  # street_direction
        int(is_zip_code(word)),  # zip_code
        int(word_norm in COUNTRY_WORDS),  # country_word
        int(word_norm in PROVINCES_WORDS),  # province_word
    ]
    return res
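

# A minimal usage sketch (an illustrative addition, not part of the original
# module): POS-tag a tokenized sentence with nltk and build one feature vector
# per token. The sample address is hypothetical, and nltk's 'punkt' and
# 'averaged_perceptron_tagger' data must already be downloaded.
def _example_usage():
    tokens = nltk.word_tokenize('1750 Pennsylvania Ave NW, Washington, DC 20006')
    for token, pos in nltk.pos_tag(tokens):
        print(token, get_word_features(token, pos))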


def prepare_pos_tagset_index_file():
    # Building the tagset directly from nltk requires manual steps to download
    # data files, so the index is cached in a JSON file instead.
    tagset = {k: i + 1 for i, k in
              enumerate(sorted(nltk.help.load('help/tagsets/upenn_tagset.pickle').keys()))}
    with open(POS_TAG_SET_INDEX_FN, 'w') as f:
        json.dump(tagset, f)


if __name__ == '__main__':
    prepare_pos_tagset_index_file()