Skip to content

Commit

Permalink
Update preprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
Hironsan committed Mar 6, 2018
1 parent 3bb18db commit 5ba3568
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 54 deletions.
3 changes: 3 additions & 0 deletions anago/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ def transform(self, X, y=None):

return (inputs, y) if y is not None else inputs

def fit_transform(self, X, y=None, **fit_params):
    """Fit the preprocessor on the data, then transform that same data.

    Equivalent to calling ``fit(X, y)`` and then ``transform(X, y)`` on the
    object that ``fit`` returns.

    Args:
        X: Input handed unchanged to both ``fit`` and ``transform``.
        y: Optional labels handed unchanged to both calls.
        **fit_params: Accepted in the signature but not forwarded to
            ``fit``.

    Returns:
        Whatever ``transform(X, y)`` returns.
    """
    fitted = self.fit(X, y)
    return fitted.transform(X, y)

def inverse_transform(self, docs):
id2label = {i: t for t, i in self.label_dic.items()}

Expand Down
81 changes: 27 additions & 54 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,90 @@
import os
import time
import unittest

import numpy as np

from anago.utils import load_data_and_labels
from anago.preprocess import StaticPreprocessor, DynamicPreprocessor, UNK, pad_char
from anago.preprocess import StaticPreprocessor


class TestStaticPreprocessor(unittest.TestCase):

def setUp(self):
self.filename = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner/test.txt')
self.p = StaticPreprocessor()

def test_preprocessor(self):
    """Fit and transform the data file, then check the element types."""
    X, y = load_data_and_labels(self.filename)
    p = StaticPreprocessor().fit(X, y)
    (words, chars), tags = p.transform(X, y)

    # The first word id, character id and tag id must each be an int.
    for value in (words[0][0], chars[0][0][0], tags[0][0]):
        self.assertIsInstance(value, int)

    # inverse_transform must give back a list of lists of str.
    self.assertIsInstance(p.inverse_transform(tags), list)
    self.assertIsInstance(p.inverse_transform(tags)[0], list)
    self.assertIsInstance(p.inverse_transform(tags)[0][0], str)


class WordPreprocessorTest(unittest.TestCase):

def setUp(self):
self.filename = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner/test.txt')
@classmethod
def setUpClass(cls):
    """Load the data file once and keep it on the class as ``X`` and ``y``."""
    here = os.path.dirname(__file__)
    data_path = os.path.join(here, '../data/conll2003/en/ner/test.txt')
    cls.X, cls.y = load_data_and_labels(data_path)

def test_preprocessor(self):
X, y = load_data_and_labels(self.filename)
preprocessor = StaticPreprocessor()
p = preprocessor.fit(X, y)
X, y = p.transform(X, y)
X, y = self.p.fit_transform(self.X, self.y)
words, chars = X
char, word = chars[0][0][0], words[0][0]
tag = y[0][0]
self.assertIsInstance(word, int)
self.assertIsInstance(char, int)
self.assertIsInstance(tag, int)
self.assertIsInstance(p.inverse_transform(y), list)
self.assertIsInstance(self.p.inverse_transform(y), list)
self.assertIsInstance(self.p.inverse_transform(y)[0], list)
self.assertIsInstance(self.p.inverse_transform(y)[0][0], str)

def test_transform_only_words(self):
X, y = load_data_and_labels(self.filename)
preprocessor = StaticPreprocessor()
p = preprocessor.fit(X, y)
X = p.transform(X)
self.p.fit(self.X, self.y)
X = self.p.transform(self.X)
words, chars = X
char, word = chars[0][0][0], words[0][0]
self.assertIsInstance(word, int)
self.assertIsInstance(char, int)

def test_unknown_word(self):
X, y = load_data_and_labels(self.filename)
preprocessor = StaticPreprocessor()
p = preprocessor.fit(X, y)
self.p = StaticPreprocessor()
self.p.fit(self.X, self.y)
X = [['$unknownword$', 'あ']]
y = [['O', 'O']]
X, y = p.transform(X, y)
X, y = self.p.transform(X, y)
print(X)

def test_vocab_init(self):
X, y = load_data_and_labels(self.filename)
unknown_word = 'unknownword'
X_test, y_test = [[unknown_word]], [['O']]

preprocessor = StaticPreprocessor()
p = preprocessor.fit(X, y)
X_pred, _ = p.transform(X_test, y_test)
words = X_pred[0]
self.assertEqual(words, [p.word_dic[UNK]])
self.p.fit(self.X, self.y)
x_pred = self.p.transform(X_test)
words = x_pred[0]
self.assertEqual(words, [self.p.word_dic[UNK]])

vocab_init = {unknown_word}
preprocessor = StaticPreprocessor(vocab_init=vocab_init)
p = preprocessor.fit(X, y)
X_pred, _ = p.transform(X_test, y_test)
p = StaticPreprocessor(vocab_init=vocab_init)
p.fit(self.X, self.y)
X_pred = p.transform(X_test)
words = X_pred[0]
self.assertNotEqual(words, [p.word_dic[UNK]])

def test_save(self):
preprocessor = StaticPreprocessor()
filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
preprocessor.save(filepath)
self.p.save(filepath)
self.assertTrue(os.path.exists(filepath))
if os.path.exists(filepath):
os.remove(filepath)

def test_load(self):
X, y = load_data_and_labels(self.filename)
p = StaticPreprocessor()
p.fit(X, y)
self.p.fit(self.X, self.y)
filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
p.save(filepath)
self.p.save(filepath)
self.assertTrue(os.path.exists(filepath))

loaded_p = StaticPreprocessor.load(filepath)
x_test1, y_test1 = p.transform(X, y)
x_test2, y_test2 = loaded_p.transform(X, y)
x_test1, y_test1 = self.p.transform(self.X, self.y)
x_test2, y_test2 = loaded_p.transform(self.X, self.y)
np.testing.assert_array_equal(x_test1[0], x_test2[0]) # word
np.testing.assert_array_equal(x_test1[1], x_test2[1]) # char
np.testing.assert_array_equal(y_test1, y_test2)
if os.path.exists(filepath):
os.remove(filepath)


class PreprocessTest(unittest.TestCase):
class TestPreprocess(unittest.TestCase):

def test_pad_char(self):
sequences = [[[1, 2, 3, 4], [1, 2], [1], [1, 2, 3]],
Expand Down

0 comments on commit 5ba3568

Please sign in to comment.