Commit 09f0f46: Update trainer

Hironsan committed May 31, 2018 (1 parent: 63dbacc)
Showing 8 changed files with 224 additions and 154 deletions.
anago/__init__.py (2 changes: 2 additions & 0 deletions)

@@ -1 +1,3 @@
+from anago.tagger import Tagger
+from anago.trainer import Trainer
 from anago.wrapper import Sequence
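With the new exports, the trainer and the tagger are importable from the package root alongside the existing wrapper. A minimal sketch (assuming this revision of anago is installed):

    import anago

    anago.Sequence   # high-level wrapper, as before
    anago.Trainer    # low-level training loop, now public
    anago.Tagger     # inference interface, now public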
anago/trainer.py (40 changes: 15 additions & 25 deletions)

@@ -2,45 +2,35 @@
 Model Trainer.
 """
 from anago.utils import batch_iter
-from anago.callbacks import get_callbacks
+from anago.callbacks import F1score


 class Trainer(object):

-    def __init__(self, model, loss='categorical_crossentropy', optimizer='adam',
-                 max_epoch=15, batch_size=32, checkpoint_path=None,
-                 log_dir=None, preprocessor=None, early_stopping=False, inverse_transform=None):
+    def __init__(self, model, preprocessor=None, inverse_transform=None):
         self._model = model
-        self._loss = loss
-        self._optimizer = optimizer
-        self._max_epoch = max_epoch
-        self._batch_size = batch_size
-        self._checkpoint_path = checkpoint_path
-        self._log_dir = log_dir
-        self._early_stopping = early_stopping
         self._preprocessor = preprocessor
         self._inverse_transform = inverse_transform

-    def train(self, x_train, y_train, x_valid=None, y_valid=None):
+    def train(self, x_train, y_train, x_valid=None, y_valid=None,
+              epochs=1, batch_size=32, verbose=1, callbacks=None):

         # Prepare training and validation data(steps, generator)
         train_steps, train_batches = batch_iter(x_train, y_train,
-                                                self._batch_size,
+                                                batch_size,
                                                 preprocessor=self._preprocessor)
-        valid_steps, valid_batches = batch_iter(x_valid, y_valid,
-                                                self._batch_size,
-                                                preprocessor=self._preprocessor)
-
-        self._model.compile(loss=self._loss, optimizer=self._optimizer)
-
-        # Prepare callbacks
-        callbacks = get_callbacks(log_dir=self._log_dir,
-                                  checkpoint_dir=self._checkpoint_path,
-                                  early_stopping=self._early_stopping,
-                                  valid=(valid_steps, valid_batches, self._inverse_transform))
+        if x_valid and y_valid:
+            valid_steps, valid_batches = batch_iter(x_valid, y_valid,
+                                                    batch_size,
+                                                    preprocessor=self._preprocessor)
+            f1 = F1score(valid_steps, valid_batches,
+                         preprocessor=self._inverse_transform)
+            callbacks = callbacks + [f1] if callbacks else [f1]

         # Train the model
         self._model.fit_generator(generator=train_batches,
                                   steps_per_epoch=train_steps,
-                                  epochs=self._max_epoch,
-                                  callbacks=callbacks)
+                                  epochs=epochs,
+                                  callbacks=callbacks,
+                                  verbose=verbose)
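After this change, Trainer no longer takes loss, optimizer, epoch, batch-size, checkpoint or early-stopping arguments: the caller compiles the model, and per-run settings move to train(). A minimal sketch of the new calling convention (model, dp and p stand in for the BiLSTMCRF, DynamicPreprocessor and IndexTransformer built elsewhere in anago):

    from anago.trainer import Trainer

    # The caller is now responsible for compiling the Keras model.
    model.compile(loss=model.get_loss(), optimizer='adam')

    trainer = Trainer(model, preprocessor=dp,
                      inverse_transform=p.inverse_transform)
    trainer.train(x_train, y_train, x_valid, y_valid,
                  epochs=15, batch_size=32, verbose=1)
    # When validation data is supplied, train() appends an F1score
    # callback to any user-provided callbacks list.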
anago/wrapper.py (139 changes: 102 additions & 37 deletions)

@@ -1,54 +1,120 @@
-import os
-
-import numpy as np
-
+"""
+Wrapper class.
+"""
 from anago.models import BiLSTMCRF
 from anago.preprocessing import IndexTransformer, DynamicPreprocessor
 from anago.tagger import Tagger
 from anago.trainer import Trainer

+from seqeval.metrics import f1_score
+

-class Sequence(object):
-
-    config_file = 'config.json'
-    weight_file = 'model_weights.h5'
-    preprocessor_file = 'preprocessor.pkl'
+class Sequence(object):

-    def __init__(self, char_emb_size=25, word_emb_size=100, char_lstm_units=25,
-                 word_lstm_units=100, dropout=0.5, char_feature=True, crf=True,
-                 batch_size=20, optimizer='adam', learning_rate=0.001, lr_decay=0.9,
-                 clip_gradients=5.0, max_epoch=15, early_stopping=True, patience=3,
-                 train_embeddings=True, max_checkpoints_to_keep=5, log_dir=None,
-                 embeddings=()):
+    def __init__(self,
+                 word_embedding_dim=100,
+                 char_embedding_dim=25,
+                 word_lstm_size=100,
+                 char_lstm_size=25,
+                 fc_dim=100,
+                 dropout=0.5,
+                 embeddings=None,
+                 use_char=True,
+                 use_crf=True,
+                 initial_vocab=None,
+                 optimizer='adam'):

         self.model = None
         self.p = None
         self.dp = None
-        self.log_dir = log_dir
-        self.embeddings = embeddings
-
-    def train(self, x_train, y_train, x_valid=None, y_valid=None, initial_vocab=None):
-        self.p = IndexTransformer(initial_vocab=initial_vocab)
-        self.p.fit(x_train, y_train)
-        x_train, y_train = self.p.transform(x_train, y_train)
-        x_valid, y_valid = self.p.transform(x_valid, y_valid)
-        self.dp = DynamicPreprocessor(num_labels=self.p.label_size)
+        self.word_embedding_dim = word_embedding_dim
+        self.char_embedding_dim = char_embedding_dim
+        self.word_lstm_size = word_lstm_size
+        self.char_lstm_size = char_lstm_size
+        self.fc_dim = fc_dim
+        self.dropout = dropout
+        self.embeddings = embeddings
+        self.use_char = use_char
+        self.use_crf = use_crf
+        self.initial_vocab = initial_vocab
+        self.optimizer = optimizer

+    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
+            epochs=1, batch_size=32, verbose=1, callbacks=None):
+        """Fit the model according to the given training data.
+        Args:
+            X : {array-like, sparse matrix}, shape (n_samples, n_features)
+                Training vector, where n_samples is the number of samples and
+                n_features is the number of features.
+            y : array-like, shape (n_samples,)
+                Target vector relative to X.
+        Returns:
+            self : object.
+        """
+        # Build preprocessors.
+        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
+        p.fit(x_train, y_train)
+        x_train, y_train = p.transform(x_train, y_train)
+        x_valid, y_valid = p.transform(x_valid, y_valid)
+        dp = DynamicPreprocessor(num_labels=p.label_size)

         # Build a model.
-        self.model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
-                               word_vocab_size=self.p.word_vocab_size,
-                               num_labels=self.p.label_size)
-        self.model.build()
+        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
+                          word_vocab_size=p.word_vocab_size,
+                          num_labels=p.label_size,
+                          word_embedding_dim=self.word_embedding_dim,
+                          char_embedding_dim=self.char_embedding_dim,
+                          word_lstm_size=self.word_lstm_size,
+                          char_lstm_size=self.char_lstm_size,
+                          fc_dim=self.fc_dim,
+                          dropout=self.dropout,
+                          embeddings=self.embeddings,
+                          use_char=self.use_char,
+                          use_crf=self.use_crf)
+        model.build()
+        model.compile(loss=model.get_loss(), optimizer=self.optimizer)

         # Train the model.
-        trainer = Trainer(self.model, self.model.get_loss(), preprocessor=self.dp,
-                          inverse_transform=self.p.inverse_transform)
-        trainer.train(x_train, y_train, x_valid, y_valid)
+        trainer = Trainer(model, preprocessor=dp,
+                          inverse_transform=p.inverse_transform)
+        trainer.train(x_train, y_train, x_valid, y_valid,
+                      epochs=epochs, batch_size=batch_size,
+                      verbose=verbose, callbacks=callbacks)
+
+        self.p = p
+        self.dp = dp
+        self.model = model
+
+        return self

-    def eval(self, x_test, y_test):
+    def score(self, x_test, y_test):
+        """Returns the mean accuracy on the given test data and labels.
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+        Args:
+            X : array-like, shape = (n_samples, n_features)
+                Test samples.
+            y : array-like, shape = (n_samples) or (n_samples, n_outputs)
+                True labels for X.
+        Returns:
+            score : float
+                Mean accuracy of self.predict(X) wrt. y.
+        """
         if self.model:
-            evaluator = Evaluator(self.model, preprocessor=self.p)
-            evaluator.eval(x_test, y_test)
+            x_test = self.p.transform(x_test)
+            x_test = self.dp.transform(x_test)
+            y_pred = self.model.predict(x_test)
+            score = f1_score(y_test, y_pred)
+            return score
         else:
             raise (OSError('Could not find a model. Call load(dir_path).'))

@@ -59,18 +125,17 @@ def analyze(self, words):
         else:
             raise (OSError('Could not find a model. Call load(dir_path).'))

-    def save(self, dir_path):
-        self.p.save(os.path.join(dir_path, self.preprocessor_file))
-        self.model_config.save(os.path.join(dir_path, self.config_file))
-        self.model.save(os.path.join(dir_path, self.weight_file))
+    def save(self, weights_file, params_file, preprocessor_file):
+        self.p.save(preprocessor_file)
+        self.model.save(weights_file, params_file)

     @classmethod
     def load(cls, weights_file, params_file, preprocessor_file):
         self = cls()

         # Load preprocessor
         self.p = IndexTransformer.load(preprocessor_file)
-        self.dp = DynamicPreprocessor(p.label_size)
+        self.dp = DynamicPreprocessor(self.p.label_size)

         # Load the model.
         self.model = BiLSTMCRF.load(weights_file, params_file)
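The wrapper now mirrors the scikit-learn API: train() becomes fit() and returns self, eval() becomes score() and reports entity-level F1 via seqeval, and save()/load() take explicit weights, params and preprocessor file paths instead of a directory. A minimal usage sketch (file names are illustrative):

    import anago

    model = anago.Sequence(word_embedding_dim=100, char_embedding_dim=25,
                           use_char=True, use_crf=True)
    model.fit(x_train, y_train, x_valid, y_valid, epochs=15, batch_size=32)
    print(model.score(x_test, y_test))   # entity-level F1 from seqeval

    # Persistence now takes three explicit paths (names illustrative):
    model.save('weights.h5', 'params.json', 'preprocessor.pkl')
    model = anago.Sequence.load('weights.h5', 'params.json', 'preprocessor.pkl')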
examples/download_model.py (18 changes: 10 additions & 8 deletions)

@@ -3,14 +3,16 @@
 import anago
 from anago.utils import download, load_data_and_labels

-dir_path = 'test_dir'
-url = 'https://storage.googleapis.com/chakki/datasets/public/models.zip'
-DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')
-
-test_path = os.path.join(DATA_ROOT, 'test.txt')
-x_test, y_test = load_data_and_labels(test_path)
-
-download(url, dir_path)
-
-model = anago.Sequence.load(dir_path)
-model.eval(x_test, y_test)
+if __name__ == '__main__':
+    dir_path = 'test_dir'
+    url = 'https://storage.googleapis.com/chakki/datasets/public/models.zip'
+    DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')
+
+    test_path = os.path.join(DATA_ROOT, 'test.txt')
+    x_test, y_test = load_data_and_labels(test_path)
+
+    download(url, dir_path)
+
+    model = anago.Sequence.load(dir_path)
+    model.score(x_test, y_test)
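Besides renaming eval() to score(), the script body moves under a main guard, so importing the module no longer downloads the model or runs the evaluation as a side effect; the same guard is applied to ner_glove.py and ner_word2vec.py below. The idiom, for reference:

    # Runs only when the file is executed directly, not when imported:
    if __name__ == '__main__':
        ...  # script body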
examples/ner_glove.py (27 changes: 14 additions & 13 deletions)

@@ -25,20 +25,21 @@ def load_glove(file):
     return model


-DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')
-EMBEDDING_PATH = 'glove.6B.100d.txt'
-
-train_path = os.path.join(DATA_ROOT, 'train.txt')
-valid_path = os.path.join(DATA_ROOT, 'valid.txt')
-
-print('Loading data...')
-x_train, y_train = load_data_and_labels(train_path)
-x_valid, y_valid = load_data_and_labels(valid_path)
-print(len(x_train), 'train sequences')
-print(len(x_valid), 'valid sequences')
-
-embeddings = load_glove(EMBEDDING_PATH)
-
-# Use pre-trained word embeddings
-model = anago.Sequence(max_epoch=1, embeddings=embeddings)
-model.train(x_train, y_train, x_valid, y_valid)
+if __name__ == '__main__':
+    DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')
+    EMBEDDING_PATH = 'glove.6B.100d.txt'
+
+    train_path = os.path.join(DATA_ROOT, 'train.txt')
+    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
+
+    print('Loading data...')
+    x_train, y_train = load_data_and_labels(train_path)
+    x_valid, y_valid = load_data_and_labels(valid_path)
+    print(len(x_train), 'train sequences')
+    print(len(x_valid), 'valid sequences')
+
+    embeddings = load_glove(EMBEDDING_PATH)
+
+    # Use pre-trained word embeddings
+    model = anago.Sequence(embeddings=embeddings)
+    model.fit(x_train, y_train, x_valid, y_valid)
examples/ner_word2vec.py (28 changes: 15 additions & 13 deletions)

@@ -5,20 +5,22 @@
 import anago
 from anago.utils import load_data_and_labels

-DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')
-EMBEDDING_PATH = 'model.txt'
-
-train_path = os.path.join(DATA_ROOT, 'train.txt')
-valid_path = os.path.join(DATA_ROOT, 'valid.txt')
-
-print('Loading data...')
-x_train, y_train = load_data_and_labels(train_path)
-x_valid, y_valid = load_data_and_labels(valid_path)
-print(len(x_train), 'train sequences')
-print(len(x_valid), 'valid sequences')
-
-embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH).wv
-
-# Use pre-trained word embeddings
-model = anago.Sequence(max_epoch=1, embeddings=embeddings)
-model.train(x_train, y_train, x_valid, y_valid)
+if __name__ == '__main__':
+    DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')
+    EMBEDDING_PATH = 'model.txt'
+
+    train_path = os.path.join(DATA_ROOT, 'train.txt')
+    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
+
+    print('Loading data...')
+    x_train, y_train = load_data_and_labels(train_path)
+    x_valid, y_valid = load_data_and_labels(valid_path)
+    print(len(x_train), 'train sequences')
+    print(len(x_valid), 'valid sequences')
+
+    embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH).wv
+
+    # Use pre-trained word embeddings
+    model = anago.Sequence(embeddings=embeddings)
+    model.fit(x_train, y_train, x_valid, y_valid)
tests/test_trainer.py (25 changes: 17 additions & 8 deletions)

@@ -46,28 +46,37 @@ def setUp(self):
                                word_vocab_size=self.p.word_vocab_size,
                                num_labels=self.p.label_size)
         self.model.build()
+        self.model.compile(loss=self.model.get_loss(), optimizer='adam')

     def test_train(self):
-        # Train the model.
-        trainer = Trainer(self.model, self.model.get_loss(), preprocessor=self.dp,
+        trainer = Trainer(self.model, preprocessor=self.dp,
                           inverse_transform=self.p.inverse_transform)
-        trainer.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
+        trainer.train(self.x_train, self.y_train,
+                      x_valid=self.x_valid, y_valid=self.y_valid)
+
+    def test_train_no_valid(self):
+        trainer = Trainer(self.model, preprocessor=self.dp,
+                          inverse_transform=self.p.inverse_transform)
+        trainer.train(self.x_train, self.y_train)

     def test_train_without_crf(self):
         model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                           word_vocab_size=self.p.word_vocab_size,
                           num_labels=self.p.label_size,
                           use_crf=False)
         model.build()
-        trainer = Trainer(self.model, self.model.get_loss(), preprocessor=self.dp,
+        self.model.compile(loss=self.model.get_loss(), optimizer='adam')
+        trainer = Trainer(self.model, preprocessor=self.dp,
                           inverse_transform=self.p.inverse_transform)
-        trainer.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
+        trainer.train(self.x_train, self.y_train,
+                      x_valid=self.x_valid, y_valid=self.y_valid)

     def test_save(self):
         # Train the model.
-        trainer = Trainer(self.model, self.model.get_loss(), preprocessor=self.dp,
-                          inverse_transform=self.p.inverse_transform, max_epoch=1)
-        trainer.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
+        trainer = Trainer(self.model, preprocessor=self.dp,
+                          inverse_transform=self.p.inverse_transform)
+        trainer.train(self.x_train, self.y_train,
+                      x_valid=self.x_valid, y_valid=self.y_valid)

         # Save the model.
         self.model.save(self.weights_file, self.params_file)
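The tests mirror the new split of responsibilities: the model is compiled once in setUp(), and the added test_train_no_valid covers the path where train() is called without validation data and therefore without the F1score callback. A sketch of the two calling patterns (model, dp and p as in the tests' setUp):

    trainer = Trainer(model, preprocessor=dp,
                      inverse_transform=p.inverse_transform)
    trainer.train(x_train, y_train)                     # no F1score callback
    trainer.train(x_train, y_train,
                  x_valid=x_valid, y_valid=y_valid)     # F1score appended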