Skip to content

Commit

Permalink
Remove WordPreprocessor
Browse files Browse the repository at this point in the history
  • Loading branch information
Hironsan committed Mar 2, 2018
1 parent ad03a9b commit a90b589
Showing 1 changed file with 10 additions and 134 deletions.
144 changes: 10 additions & 134 deletions anago/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ def inverse_transform(self, docs):
def _get_char_ids(self, word):
    """Return the list of character ids for ``word``.

    Each character is looked up in ``self.char_dic``; characters that are
    not present map to the id stored under ``UNK``.
    """
    lookup = self.char_dic
    ids = []
    for ch in word:
        # The fallback is resolved per character, so an empty word never
        # touches the UNK entry.
        ids.append(lookup.get(ch, lookup[UNK]))
    return ids

def save(self, file_path):
    """Serialize this object to ``file_path`` with joblib."""
    joblib.dump(value=self, filename=file_path)

@classmethod
def load(cls, file_path):
    """Load and return the object stored at ``file_path`` by joblib.

    The loaded object is returned as-is; it is not checked against ``cls``.
    """
    # NOTE: joblib persistence is pickle-based, so only load files that
    # come from a trusted source.
    return joblib.load(file_path)


class DynamicPreprocessor(BaseEstimator, TransformerMixin):

Expand All @@ -109,146 +118,13 @@ def transform(self, X, y=None):

return sents, y


class WordPreprocessor(BaseEstimator, TransformerMixin):
    """Convert tokenized sentences and tag sequences into integer ids.

    ``fit`` builds three vocabularies (words, characters, tags) and
    ``transform`` uses them to map sentences, and optionally tags, to ids.
    """

    def __init__(self,
                 lowercase=True,
                 num_norm=True,
                 char_feature=True,
                 vocab_init=None,
                 padding=True,
                 return_lengths=True):
        """Store the preprocessing options.

        Args:
            lowercase: lowercase words before the word-vocabulary lookup.
            num_norm: replace every digit with ``0`` before the lookup.
            char_feature: also build and emit character ids.
            vocab_init: optional iterable of extra words to add to the
                vocabularies during ``fit``.
            padding: pad the outputs in ``transform``.
            return_lengths: append the sentence lengths to the output of
                ``transform``.
        """
        self.lowercase = lowercase
        self.num_norm = num_norm
        self.char_feature = char_feature
        self.padding = padding
        self.return_lengths = return_lengths
        # The vocabularies are populated by fit().
        self.vocab_word = None
        self.vocab_char = None
        self.vocab_tag = None
        self.vocab_init = vocab_init or {}

    def fit(self, X, y):
        """Build the word, character and tag vocabularies.

        Args:
            X: list of list of words.
            y: list of list of tags.

        Returns:
            self
        """
        words = {PAD: 0, UNK: 1}
        chars = {PAD: 0, UNK: 1}
        tags = {PAD: 0}

        for w in set(itertools.chain.from_iterable(X)) | set(self.vocab_init):
            # Only the character vocabulary depends on char_feature; the
            # word vocabulary must be built either way. (Previously a
            # `continue` here skipped the word update as well, leaving the
            # word vocabulary empty when char_feature was False.)
            if self.char_feature:
                for c in w:
                    if c not in chars:
                        chars[c] = len(chars)

            w = self._normalize_num(self._lower(w))
            if w not in words:
                words[w] = len(words)

        for t in itertools.chain.from_iterable(y):
            if t not in tags:
                tags[t] = len(tags)

        self.vocab_word = words
        self.vocab_char = chars
        self.vocab_tag = tags

        return self

    def transform(self, X, y=None):
        """Transform sentences (and optionally tags) into id sequences.

        Args:
            X: list of list of words.
            y: list of list of tags, or None.

        Returns:
            ``sents`` when ``y`` is None, otherwise the tuple ``(sents, y)``.
            ``sents`` is the list ``[words, chars]``, followed by an int32
            array of sentence lengths with shape ``(len(X), 1)`` when
            ``return_lengths`` is true. When ``padding`` is true, the
            outputs are passed through ``pad_sequences`` / ``pad_char`` and
            ``y`` is additionally encoded with ``to_categorical``.

        Examples:
            >>> X = [['President', 'Obama', 'is', 'speaking']]
            >>> print(self.transform(X))
            [
                [
                    [1999, 1037, 22123, 48388],  # word ids
                ],
                [
                    [
                        [1, 2, 3, 4, 5, 6, 7, 8, 9],  # list of char ids
                        [1, 2, 3, 4, 5, 0, 0, 0, 0],  # 0 is a pad
                        [1, 2, 0, 0, 0, 0, 0, 0, 0],
                        [1, 2, 3, 4, 5, 6, 7, 8, 0]
                    ]
                ]
            ]
        """
        words = []
        chars = []
        lengths = []
        for sent in X:
            word_ids = []
            char_ids = []
            lengths.append(len(sent))
            for w in sent:
                # Character ids are taken from the raw word, before
                # lowercasing and digit normalization.
                if self.char_feature:
                    char_ids.append(self._get_char_ids(w))

                w = self._normalize_num(self._lower(w))
                word_ids.append(self.vocab_word.get(w, self.vocab_word[UNK]))

            words.append(word_ids)
            if self.char_feature:
                chars.append(char_ids)

        if y is not None:
            y = [[self.vocab_tag[t] for t in sent] for sent in y]

        if self.padding:
            words = pad_sequences(words, padding='post')
            # NOTE(review): chars is an empty list when char_feature is
            # False; confirm pad_char accepts that.
            chars = pad_char(chars)
            # Tags are optional: padding/encoding None would either fail or
            # defeat the `y is not None` check in the return below.
            if y is not None:
                y = pad_sequences(y, padding='post')
                y = to_categorical(y, len(self.vocab_tag))
        sents = [words, chars]

        if self.return_lengths:
            lengths = np.asarray(lengths, dtype=np.int32)
            lengths = lengths.reshape((lengths.shape[0], 1))
            sents.append(lengths)

        return (sents, y) if y is not None else sents

    def inverse_transform(self, y):
        """Map a sequence of tag ids back to their tags."""
        indice_tag = {i: t for t, i in self.vocab_tag.items()}
        return [indice_tag[y_] for y_ in y]

    def _get_char_ids(self, word):
        """Return character ids for ``word``; unknown characters map to UNK."""
        return [self.vocab_char.get(c, self.vocab_char[UNK]) for c in word]

    def _lower(self, word):
        """Lowercase ``word`` if the ``lowercase`` option is enabled."""
        return word.lower() if self.lowercase else word

    def _normalize_num(self, word):
        """Replace every digit in ``word`` with ``0`` if ``num_norm`` is set."""
        if self.num_norm:
            # NOTE(review): the character class lists the ASCII digits
            # twice; the second run may originally have been another digit
            # set lost in transcription -- confirm against the repository.
            return re.sub(r'[0-90123456789]', r'0', word)
        else:
            return word

    def save(self, file_path):
        """Serialize this preprocessor to ``file_path`` with joblib."""
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        """Load and return the object stored at ``file_path`` by joblib."""
        # NOTE: joblib persistence is pickle-based, so only load files that
        # come from a trusted source.
        p = joblib.load(file_path)

        return p


Expand Down

0 comments on commit a90b589

Please sign in to comment.