#! -*- coding: utf-8 -*-
# normalize module #
from JapaneseTokenizer.common.text_preprocess import normalize_text, denormalize_text
# datamodels #
from MeCab import Node
# typing #
from typing import List, Union, Any, Tuple, Dict, Callable, Optional
from future.utils import text_type, string_types
import sys
import six
__author__ = 'kensuke-mi'
python_version = sys.version_info


def __is_sotpwords(token, stopwords):
    # type: (text_type, List[text_type]) -> bool
    """This function filters out stopwords. If the token is in the stopwords list, it returns True; else it returns False."""
    if token in stopwords:
        return True
    else:
        return False


def __is_valid_pos(pos_tuple, valid_pos):
    # type: (Tuple[text_type,...], List[Tuple[text_type,...]]) -> bool
    """This function checks whether the token's POS is within the POS set that the user specified.
    If the token meets any of the conditions, it returns True; else it returns False.
    """
    def is_valid_pos(valid_pos_tuple):
        # type: (Tuple[text_type,...]) -> bool
        length_valid_pos_tuple = len(valid_pos_tuple)
        if valid_pos_tuple == pos_tuple[:length_valid_pos_tuple]:
            return True
        else:
            return False

    seq_bool_flags = [is_valid_pos(valid_pos_tuple) for valid_pos_tuple in valid_pos]
    if True in set(seq_bool_flags):
        return True
    else:
        return False
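
# A minimal sketch of the prefix matching done by __is_valid_pos above
# (the POS tuples are assumed example values, not fixed by this module):
#
#   >>> __is_valid_pos(('名詞', '固有名詞', '人名'), [('名詞',)])
#   True
#   >>> __is_valid_pos(('動詞', '自立'), [('名詞', '固有名詞')])
#   False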


def filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='stem'):
    # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type], text_type) -> FilteredObject
    """This function filters out tokens that the user does not want to keep.
    The filtering conditions are stopwords and POS tags.

    * Input
        - valid_pos
            - List of tuples whose POS elements should be kept.
            - Keep in mind that each tokenizer has a different POS structure.
            >>> [('名詞', '固有名詞'), ('動詞',)]
        - stopwords
            - List of strings which you would like to remove.
            >>> ['残念', '今日']
    """
    assert isinstance(tokenized_obj, TokenizedSenetence)
    assert isinstance(valid_pos, list)
    assert isinstance(stopwords, list)
    filtered_tokens = []
    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if check_field_name == 'stem':
            res_stopwords = __is_sotpwords(token_obj.word_stem, stopwords)
        else:
            res_stopwords = __is_sotpwords(token_obj.word_surface, stopwords)
        res_pos_condition = __is_valid_pos(token_obj.tuple_pos, valid_pos)
        # case 1: only POS filtering is ON
        if valid_pos != [] and stopwords == []:
            if res_pos_condition:
                filtered_tokens.append(token_obj)
        # case 2: only stopword filtering is ON
        if valid_pos == [] and stopwords != []:
            if res_stopwords is False:
                filtered_tokens.append(token_obj)
        # case 3: both conditions are ON
        if valid_pos != [] and stopwords != []:
            if res_stopwords is False and res_pos_condition:
                filtered_tokens.append(token_obj)

    filtered_object = FilteredObject(
        sentence=tokenized_obj.sentence,
        tokenized_objects=filtered_tokens,
        pos_condition=valid_pos,
        stopwords=stopwords
    )
    return filtered_object
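
# A minimal sketch of calling filter_words directly (normally it is reached via
# TokenizedSenetence.filter; the tokenized_obj is assumed to be a
# TokenizedSenetence built elsewhere, and the conditions are example values):
#
#   >>> filtered = filter_words(tokenized_obj=sentence,
#   ...                         valid_pos=[('名詞',)],
#   ...                         stopwords=['これ'])
#   >>> isinstance(filtered, FilteredObject)
#   True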


class TokenizedResult(object):
    def __init__(self,
                 node_obj,
                 tuple_pos,
                 word_stem,
                 word_surface,
                 is_feature=True,
                 is_surface=False,
                 misc_info=None,
                 analyzed_line=None):
        # type: (Optional[Node], Tuple[text_type, ...], str, str, bool, bool, Optional[Dict[str, Any]], str) -> None
        assert isinstance(node_obj, (Node, type(None)))
        assert isinstance(tuple_pos, (string_types, tuple))
        assert isinstance(word_stem, string_types)
        assert isinstance(word_surface, text_type)
        assert isinstance(misc_info, (type(None), dict))
        self.node_obj = node_obj
        self.word_stem = word_stem
        self.word_surface = word_surface
        self.is_surface = is_surface
        self.is_feature = is_feature
        self.misc_info = misc_info
        self.analyzed_line = analyzed_line
        if isinstance(tuple_pos, tuple):
            self.tuple_pos = tuple_pos
        elif isinstance(tuple_pos, string_types):
            self.tuple_pos = ('*', )
        else:
            raise Exception('Error while parsing feature object. {}'.format(tuple_pos))
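
# A minimal construction sketch (the field values are assumed for illustration;
# in practice instances are built by the tokenizer wrappers, not by hand):
#
#   >>> t = TokenizedResult(node_obj=None, tuple_pos=('名詞', '一般'),
#   ...                     word_stem='猫', word_surface='猫')
#   >>> t.tuple_pos
#   ('名詞', '一般')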


class TokenizedSenetence(object):
    def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):
        # type: (text_type, List[TokenizedResult], text_type) -> None
        """* Parameters
            - sentence: the original sentence
            - tokenized_objects: list of TokenizedResult objects
            - string_encoding: encoding of str objects. This option is used only under python2.x.
        """
        assert isinstance(sentence, text_type)
        assert isinstance(tokenized_objects, list)
        self.sentence = sentence
        self.tokenized_objects = tokenized_objects
        self.string_encoding = string_encoding

    def __extend_token_object(self, token_object,
                              is_denormalize=True,
                              func_denormalizer=denormalize_text):
        # type: (TokenizedResult, bool, Callable[[str], str]) -> Union[str, Tuple[str, Tuple[str, ...]]]
        """This method creates a string, or a (string, POS tuple) pair, from a token object."""
        assert isinstance(token_object, TokenizedResult)
        if is_denormalize:
            if token_object.is_feature:
                if token_object.is_surface:
                    token = (func_denormalizer(token_object.word_surface), token_object.tuple_pos)
                else:
                    token = (func_denormalizer(token_object.word_stem), token_object.tuple_pos)
            else:
                if token_object.is_surface:
                    token = func_denormalizer(token_object.word_surface)
                else:
                    token = func_denormalizer(token_object.word_stem)
        else:
            if token_object.is_feature:
                if token_object.is_surface:
                    token = (token_object.word_surface, token_object.tuple_pos)
                else:
                    token = (token_object.word_stem, token_object.tuple_pos)
            else:
                if token_object.is_surface:
                    token = token_object.word_surface
                else:
                    token = token_object.word_stem
        return token

    def convert_list_object(self,
                            is_denormalize=True,
                            func_denormalizer=denormalize_text):
        # type: (bool, Callable[[str], str]) -> List[Union[str, Tuple[str, ...]]]
        """* What you can do
            - You extract string objects from the TokenizedResult objects.

        * Args
            - is_denormalize: boolean flag. If True, the strings are de-normalized.
            - func_denormalizer: callable object. The de-normalization function.
        """
        sentence_in_list_obj = [
            self.__extend_token_object(token_object, is_denormalize, func_denormalizer)
            for token_object
            in self.tokenized_objects
        ]
        return sentence_in_list_obj
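
    # A minimal usage sketch (the token values below are assumed; a real
    # TokenizedSenetence normally comes from one of the tokenizer wrappers):
    #
    #   >>> token = TokenizedResult(node_obj=None, tuple_pos=('名詞', '一般'),
    #   ...                         word_stem='犬', word_surface='犬', is_feature=False)
    #   >>> sentence = TokenizedSenetence(sentence='犬が走る', tokenized_objects=[token])
    #   >>> sentence.convert_list_object(is_denormalize=False)
    #   ['犬']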

    def __convert_string_type(self, p_c_tuple):
        # type: (Tuple[text_type,...]) -> Tuple[text_type, ...]
        """* What you can do
            - It normalizes the string type of each POS element into text_type (unicode under python2.x).
        """
        if not isinstance(p_c_tuple, tuple):
            raise Exception('Pos condition expects tuple of string. However = {}'.format(p_c_tuple))
        # pre-allocate a list of the right length; each slot is overwritten below
        converted = [text_type] * len(p_c_tuple)
        for i, pos_element in enumerate(p_c_tuple):
            if six.PY2 and isinstance(pos_element, str):
                # str into unicode under python2.x
                converted[i] = pos_element.decode(self.string_encoding)
            elif six.PY2 and isinstance(pos_element, text_type):
                converted[i] = pos_element
            elif six.PY3:
                converted[i] = pos_element
            else:
                raise Exception()
        return tuple(converted)

    def __check_pos_condition(self, pos_condistion):
        # type: (List[Tuple[text_type, ...]]) -> List[Tuple[text_type, ...]]
        """* What you can do
            - It checks your POS condition.
            - It converts the character type into unicode if the python version is 2.x.
        """
        assert isinstance(pos_condistion, list)
        return [self.__convert_string_type(p_c_tuple) for p_c_tuple in pos_condistion]

    def filter(self,
               pos_condition=None,
               stopwords=None,
               is_normalize=True,
               func_normalizer=normalize_text,
               check_field_name='stem'):
        # type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type], text_type) -> FilteredObject
        """* What you can do
            - It filters out tokens which do NOT meet the conditions (stopwords & part-of-speech tags).
            - Under python2.x, pos_condition & stopwords are converted into unicode type.

        * Parameters
            - pos_condition: list of part-of-speech (POS) conditions. Each POS condition is a tuple of variable length.
              You can specify a hierarchical POS condition with a variable-length tuple.
              The hierarchy of the POS condition follows the definition of the dictionary.
                - For example, in mecab you can take words with 名詞 if ('名詞',)
                - For example, in mecab you can take words with 名詞-固有名詞 if ('名詞', '固有名詞')
            - stopwords: list of words which you would like to remove.
            - is_normalize: boolean flag for normalizing the stopwords.
            - func_normalizer: function object for normalization. It must be the same function as the one used during tokenization.
            - check_field_name: field name to check against the stopwords. Kytea does not have a stem form of a word, so put 'surface' instead.

        * Example
            >>> pos_condition = [('名詞', '一般'), ('形容詞', '自立'), ('助詞', '格助詞', '一般')]
            >>> stopwords = ['これ', 'それ']
        """
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))
        if stopwords is None:
            s_words = []
        elif six.PY2 and all((isinstance(s, str) for s in stopwords)):
            # under python2.x, convert from str into unicode
            if is_normalize:
                s_words = [func_normalizer(s.decode(self.string_encoding)) for s in stopwords]
            else:
                s_words = [s.decode(self.string_encoding) for s in stopwords]
        else:
            if is_normalize:
                s_words = [func_normalizer(s) for s in stopwords]
            else:
                s_words = stopwords
        if pos_condition is None:
            p_condition = []
        else:
            p_condition = self.__check_pos_condition(pos_condition)

        filtered_object = filter_words(
            tokenized_obj=self,
            valid_pos=p_condition,
            stopwords=s_words,
            check_field_name=check_field_name
        )
        assert isinstance(filtered_object, FilteredObject)
        return filtered_object
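
    # A minimal filtering sketch (continuing the assumed sentence above; a real
    # TokenizedSenetence would come from a tokenizer wrapper such as MecabWrapper):
    #
    #   >>> filtered = sentence.filter(pos_condition=[('名詞',)], stopwords=['それ'])
    #   >>> filtered.convert_list_object(is_denormalize=False)
    #   ['犬']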


class FilteredObject(TokenizedSenetence):
    def __init__(self, sentence, tokenized_objects, pos_condition, stopwords):
        # type: (text_type, List[TokenizedResult], List[Tuple[text_type, ...]], List[text_type]) -> None
        super(FilteredObject, self).__init__(
            sentence=sentence,
            tokenized_objects=tokenized_objects
        )
        self.pos_condition = pos_condition
        self.stopwords = stopwords
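
# FilteredObject also keeps the conditions that produced it, so downstream code
# can inspect them. A minimal sketch (values assumed from the examples above):
#
#   >>> filtered.pos_condition
#   [('名詞',)]
#   >>> filtered.stopwords
#   ['それ']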