Remove WordPreprocessor

Hironsan · Mar 2, 2018 · a90b589
1 parent ad03a9b
commit a90b589
Showing 1 changed file with 10 additions and 134 deletions.
diff --git a/anago/preprocess.py b/anago/preprocess.py
@@ -90,6 +90,15 @@ def inverse_transform(self, docs):
     def _get_char_ids(self, word):
         return [self.char_dic.get(c, self.char_dic[UNK]) for c in word]
 
+    def save(self, file_path):
+        joblib.dump(self, file_path)
+
+    @classmethod
+    def load(cls, file_path):
+        p = joblib.load(file_path)
+
+        return p
+
 
 class DynamicPreprocessor(BaseEstimator, TransformerMixin):
 
@@ -109,146 +118,13 @@ def transform(self, X, y=None):
 
         return sents, y
 
-
-class WordPreprocessor(BaseEstimator, TransformerMixin):
-
-    def __init__(self,
-                 lowercase=True,
-                 num_norm=True,
-                 char_feature=True,
-                 vocab_init=None,
-                 padding=True,
-                 return_lengths=True):
-
-        self.lowercase = lowercase
-        self.num_norm = num_norm
-        self.char_feature = char_feature
-        self.padding = padding
-        self.return_lengths = return_lengths
-        self.vocab_word = None
-        self.vocab_char = None
-        self.vocab_tag  = None
-        self.vocab_init = vocab_init or {}
-
-    def fit(self, X, y):
-        words = {PAD: 0, UNK: 1}
-        chars = {PAD: 0, UNK: 1}
-        tags  = {PAD: 0}
-
-        for w in set(itertools.chain(*X)) | set(self.vocab_init):
-            if not self.char_feature:
-                continue
-            for c in w:
-                if c not in chars:
-                    chars[c] = len(chars)
-
-            w = self._lower(w)
-            w = self._normalize_num(w)
-            if w not in words:
-                words[w] = len(words)
-
-        for t in itertools.chain(*y):
-            if t not in tags:
-                tags[t] = len(tags)
-
-        self.vocab_word = words
-        self.vocab_char = chars
-        self.vocab_tag  = tags
-
-        return self
-
-    def transform(self, X, y=None):
-        """transforms input(s)
-
-        Args:
-            X: list of list of words
-            y: list of list of tags
-
-        Returns:
-            numpy array: sentences
-            numpy array: tags
-
-        Examples:
-            >>> X = [['President', 'Obama', 'is', 'speaking']]
-            >>> print(self.transform(X))
-            [
-                [
-                    [1999, 1037, 22123, 48388],       # word ids
-                ],
-                [
-                    [
-                        [1, 2, 3, 4, 5, 6, 7, 8, 9],  # list of char ids
-                        [1, 2, 3, 4, 5, 0, 0, 0, 0],  # 0 is a pad
-                        [1, 2, 0, 0, 0, 0, 0, 0, 0],
-                        [1, 2, 3, 4, 5, 6, 7, 8, 0]
-                    ]
-                ]
-            ]
-        """
-        words = []
-        chars = []
-        lengths = []
-        for sent in X:
-            word_ids = []
-            char_ids = []
-            lengths.append(len(sent))
-            for w in sent:
-                if self.char_feature:
-                    char_ids.append(self._get_char_ids(w))
-
-                w = self._lower(w)
-                w = self._normalize_num(w)
-                if w in self.vocab_word:
-                    word_id = self.vocab_word[w]
-                else:
-                    word_id = self.vocab_word[UNK]
-                word_ids.append(word_id)
-
-            words.append(word_ids)
-            if self.char_feature:
-                chars.append(char_ids)
-
-        if y is not None:
-            y = [[self.vocab_tag[t] for t in sent] for sent in y]
-
-        if self.padding:
-            words = pad_sequences(words, padding='post')
-            chars = pad_char(chars)
-            y = pad_sequences(y, padding='post')
-            y = to_categorical(y, len(self.vocab_tag))
-            sents = [words, chars]
-        else:
-            sents = [words, chars]
-
-        if self.return_lengths:
-            lengths = np.asarray(lengths, dtype=np.int32)
-            lengths = lengths.reshape((lengths.shape[0], 1))
-            sents.append(lengths)
-
-        return (sents, y) if y is not None else sents
-
-    def inverse_transform(self, y):
-        indice_tag = {i: t for t, i in self.vocab_tag.items()}
-        return [indice_tag[y_] for y_ in y]
-
-    def _get_char_ids(self, word):
-        return [self.vocab_char.get(c, self.vocab_char[UNK]) for c in word]
-
-    def _lower(self, word):
-        return word.lower() if self.lowercase else word
-
-    def _normalize_num(self, word):
-        if self.num_norm:
-            return re.sub(r'[0-9０１２３４５６７８９]', r'0', word)
-        else:
-            return word
-
     def save(self, file_path):
         joblib.dump(self, file_path)
 
     @classmethod
     def load(cls, file_path):
         p = joblib.load(file_path)
+
         return p