In [22]:
#!/usr/bin/env python

# Import standard libraries
import os
import numpy as np
import itertools
from collections import OrderedDict
from pprint import pprint
import re
import scipy.io
import codecs
import cPickle

# Import Tensorflow, and corresponding functions
import tensorflow as tf
from utils import set_values, get_name


# Import custom helper functions
from utils import create_input
import loader
from utils import models_path, evaluate, eval_script, eval_temp
from loader import word_mapping, char_mapping, tag_mapping
from loader import update_tag_scheme, prepare_dataset
from loader import augment_with_pretrained


"""
Data Variables
"""
train = "data/esp.train"     # Path of the training set
dev = "data/esp.testa"       # Path of the development set
test = "data/esp.testb"      # Path of the test set
tag_scheme = "iobes"         # Tagging scheme (IOB or IOBES)
pre_emb = ""                 # Location of pretrained embeddings
all_emb = False              # Load all embeddings


"""
Model Variables
"""
# Pre-processing switches
lower = False                # Lowercase words (this will not affect character inputs)
zeros = False                # Replace digits with 0

# Character-level settings
char_dim = 25                # Char embedding dimension
char_lstm_dim = 25           # Char LSTM hidden layer size
char_bidirect = True         # Use a bidirectional LSTM for chars

# Word-level settings
word_dim = 100               # Token embedding dimension
word_lstm_dim = 100          # Token LSTM hidden layer size
word_bidirect = True         # Use a bidirectional LSTM for words

# Remaining architecture / optimisation settings
cap_dim = 0                  # Capitalization feature dimension (0 to disable)
crf = True                   # Use CRF (False to disable)
dropout = 0.5                # Dropout on the input (0 = no dropout)
lr_method = "sgd-lr_.005"    # Learning method (SGD, Adadelta, Adam..)
reload_model = 0             # Reload the last saved model

"""
Training variables
"""
n_epochs = 100  # number of epochs over the training set
freq_eval = 1000  # evaluate on dev every freq_eval steps
# `singletons` depends on the word mappings, so it is computed further down,
# right after those mappings are created. Computing it here raised a
# NameError on a fresh kernel because `word_to_id` and `dico_words_train`
# did not exist yet.


# Parse parameters (insertion order is preserved by the OrderedDict)
parameters = OrderedDict([
    ('tag_scheme', tag_scheme),
    ('lower', lower),
    ('zeros', zeros),
    ('char_dim', char_dim),
    ('char_lstm_dim', char_lstm_dim),
    ('char_bidirect', char_bidirect),
    ('word_dim', word_dim),
    ('word_lstm_dim', word_lstm_dim),
    ('word_bidirect', word_bidirect),
    ('pre_emb', pre_emb),
    ('all_emb', all_emb),
    ('cap_dim', cap_dim),
    ('crf', crf),
    ('dropout', dropout),
    ('lr_method', lr_method),
])

# Check parameters validity
assert os.path.isfile(train)
assert os.path.isfile(dev)
assert os.path.isfile(test)
# At least one of the character / word representations must be enabled
assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
assert 0. <= parameters['dropout'] < 1.0
assert parameters['tag_scheme'] in ['iob', 'iobes']
# `all_emb` only makes sense when a pretrained embedding file is given
assert not parameters['all_emb'] or parameters['pre_emb']
# Pretrained embeddings require word embeddings and an existing file
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

# Check evaluation script / folders
if not os.path.isfile(eval_script):
    raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
if not os.path.exists(eval_temp):
    os.makedirs(eval_temp)
if not os.path.exists(models_path):
    os.makedirs(models_path)


# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(train, lower, zeros)
dev_sentences = loader.load_sentences(dev, lower, zeros)
test_sentences = loader.load_sentences(test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        # Unless all embeddings are requested, pass the dev + test words
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Ids of the words whose count in the training dictionary is exactly 1.
# They are handed to create_input when building training inputs.
singletons = set(word_to_id[k] for k, v
                 in dico_words_train.items() if v == 1)

# Create a dictionary and a mapping for characters / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data: convert each split with the word / char / tag mappings
train_data = prepare_dataset(train_sentences,
                             word_to_id, char_to_id, tag_to_id, lower)
dev_data = prepare_dataset(dev_sentences,
                           word_to_id, char_to_id, tag_to_id, lower)
test_data = prepare_dataset(test_sentences,
                            word_to_id, char_to_id, tag_to_id, lower)

# Report how many sentences each split contains
print("%i / %i / %i sentences in train / dev / test."
      % (len(train_data), len(dev_data), len(test_data)))

Found 26100 unique words (264715 in total)
Found 91 unique characters
Found 17 unique named entity tags
8323 / 1915 / 1517 sentences in train / dev / test.


In [29]:
# Show the first indexed training sentence, then every array create_input
# produces for it (third argument True: presumably "include labels" - TODO
# confirm against utils.create_input).
pprint(train_data[0])
for array in create_input(train_data[0], parameters, True, singletons):
    print(array)

{'caps': [2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0],
 'chars': [[36, 0, 7, 15, 2, 11, 5, 3, 0],
           [47],
           [27, 11, 4, 9, 5, 1, 7, 6, 1],
           [48],
           [14],
           [38, 56],
           [13, 1, 20],
           [47],
           [22, 42, 22],
           [48],
           [18]],
 'str_words': [u'Melbourne',
               u'(',
               u'Australia',
               u')',
               u',',
               u'25',
               u'may',
               u'(',
               u'EFE',
               u')',
               u'.'],
 'tags': [2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0],
 'words': [5666, 17, 2104, 16, 2, 193, 33, 17, 28, 16, 5]}
[5666, 17, 2104, 16, 2, 193, 33, 17, 28, 16, 5]
[[36, 0, 7, 15, 2, 11, 5, 3, 0], [47, 0, 0, 0, 0, 0, 0, 0, 0], [27, 11, 4, 9, 5, 1, 7, 6, 1], [48, 0, 0, 0, 0, 0, 0, 0, 0], [14, 0, 0, 0, 0, 0, 0, 0, 0], [38, 56, 0, 0, 0, 0, 0, 0, 0], [13, 1, 20, 0, 0, 0, 0, 0, 0], [47, 0, 0, 0, 0, 0, 0, 0, 0], [22, 42, 22, 0, 0, 0, 0, 0, 0], [48, 0, 0, 0, 0,

In [None]:

class Model(object):
    """
    Network architecture.

    Keeps the hyper-parameters, the id -> word / char / tag mappings and a
    registry of named components, and knows how to persist them to (and
    restore them from) a model directory on disk.
    """
    def __init__(self, parameters=None, models_path=None, model_path=None):
        """
        Initialize the model. We either provide the parameters and a path where
        we store the models, or the location of a trained model.

        parameters:  hyper-parameter mapping (new model only).
        models_path: directory under which the new model folder is created
                     (new model only).
        model_path:  folder of an already saved model; when given,
                     `parameters` and `models_path` must both be None.
        """
        # If there is no existing model create it
        if model_path is None:
            assert parameters and models_path
            # Create a name based on the parameters
            self.parameters = parameters
            self.name = get_name(parameters)
            # Model location
            model_path = os.path.join(models_path, self.name)
            self.model_path = model_path
            self.parameters_path = os.path.join(model_path, 'parameters.pkl')
            self.mappings_path = os.path.join(model_path, 'mappings.pkl')
            # Create directory for the model if it does not exist
            if not os.path.exists(self.model_path):
                os.makedirs(self.model_path)
            # Save the parameters to disk
            with open(self.parameters_path, 'wb') as f:
                cPickle.dump(parameters, f)
        # Else load an existing model into memory
        else:
            assert parameters is None and models_path is None
            # Model location
            self.model_path = model_path
            self.parameters_path = os.path.join(model_path, 'parameters.pkl')
            self.mappings_path = os.path.join(model_path, 'mappings.pkl')
            # Load the parameters and the mappings from disk.
            # NOTE(security): unpickling can execute arbitrary code, so only
            # load model folders that come from a trusted source.
            with open(self.parameters_path, 'rb') as f:
                self.parameters = cPickle.load(f)
            self.reload_mappings()
        # Registry of named network components, filled by add_component
        self.components = {}

    def save_mappings(self, id_to_word, id_to_char, id_to_tag):
        """
        We need to save the mappings if we want to use the model later.

        Stores the three mappings on the instance and pickles them to
        `self.mappings_path`.
        """
        self.id_to_word = id_to_word
        self.id_to_char = id_to_char
        self.id_to_tag = id_to_tag
        with open(self.mappings_path, 'wb') as f:
            mappings = {
                'id_to_word': self.id_to_word,
                'id_to_char': self.id_to_char,
                'id_to_tag': self.id_to_tag,
            }
            cPickle.dump(mappings, f)

    def reload_mappings(self):
        """
        Load mappings from disk.

        Reads `self.mappings_path` and restores `id_to_word`, `id_to_char`
        and `id_to_tag` on the instance.
        """
        # NOTE(security): see __init__ - only unpickle trusted files.
        with open(self.mappings_path, 'rb') as f:
            mappings = cPickle.load(f)
        self.id_to_word = mappings['id_to_word']
        self.id_to_char = mappings['id_to_char']
        self.id_to_tag = mappings['id_to_tag']

    def add_component(self, param):
        """
        Add a new parameter to the network.

        The component is registered under `param.name`; registering the same
        name twice raises an Exception.
        """
        if param.name in self.components:
            raise Exception('The network already has a parameter "%s"!'
                            % param.name)
        self.components[param.name] = param

    def save(self):
        """
        Write components values to disk.

        Each component is written to "<model_path>/<name>.mat". A component
        exposing a `params` attribute is saved as one entry per sub-parameter,
        otherwise as a single entry keyed by the component name.
        """
        # NOTE(review): relies on components (or their params) providing
        # get_value(); confirm the objects registered in this TensorFlow port
        # actually offer that method.
        for name, param in self.components.items():
            param_path = os.path.join(self.model_path, "%s.mat" % name)
            if hasattr(param, 'params'):
                param_values = {p.name: p.get_value() for p in param.params}
            else:
                param_values = {name: param.get_value()}
            scipy.io.savemat(param_path, param_values)

    def reload(self):
        """
        Load components values from disk.

        Reads "<model_path>/<name>.mat" for every registered component and
        pushes the stored values back through `set_values`.
        """
        for name, param in self.components.items():
            param_path = os.path.join(self.model_path, "%s.mat" % name)
            param_values = scipy.io.loadmat(param_path)
            if hasattr(param, 'params'):
                for p in param.params:
                    set_values(p.name, p, param_values[p.name])
            else:
                set_values(name, param, param_values[name])

    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.

        This method is still incomplete: at the moment it only reads the
        vocabulary sizes from the mappings and creates the input
        placeholders. It requires the mappings (`id_to_word`, `id_to_char`,
        `id_to_tag`) to be set on the instance beforehand.
        """
        # Vocabulary sizes (not consumed yet in this partial build)
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network inputs. Every placeholder is named after the variable that
        # holds it, so each graph name is unique and matches the code.
        word_ids = tf.placeholder(tf.int32, name='word_ids')
        char_for_ids = tf.placeholder(tf.int32, name='char_for_ids')
        char_rev_ids = tf.placeholder(tf.int32, name='char_rev_ids')
        char_pos_ids = tf.placeholder(tf.int32, name='char_pos_ids')
        tag_ids = tf.placeholder(tf.int32, name='tag_ids')

        # Boolean placeholder, presumably meant to switch training-only
        # behaviour such as dropout - not consumed yet, TODO confirm.
        is_training_pl = tf.placeholder(tf.bool, name="is_training_pl")


In [None]:
"""
Start training the model
"""

# NOTE(review): this loop is still a skeleton. The training function call is
# disabled, so every step records a cost of 0, and neither `f_eval` nor
# `model` is defined in the visible notebook - the evaluation branch will
# raise a NameError once `count` reaches `freq_eval` unless they are created
# in a cell above.
best_dev = -np.inf
best_test = -np.inf
count = 0  # number of training steps performed across all epochs
for epoch in xrange(n_epochs):
    epoch_costs = []
    print("Starting epoch %i..." % epoch)
    # Visit the training sentences in a fresh random order every epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        # Named `train_input` so the builtin `input` is not shadowed
        train_input = create_input(train_data[index], parameters,
                                   True, singletons)
        # TODO: new_cost = f_train(*train_input) once f_train is available
        new_cost = 0
        epoch_costs.append(new_cost)
        # Report the running average every 50 sentences (skipping i == 0)
        if i > 0 and i % 50 == 0:
            print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])))
        if count % freq_eval == 0:
            dev_score = evaluate(parameters, f_eval, dev_sentences,
                                 dev_data, id_to_tag, dico_tags)
            test_score = evaluate(parameters, f_eval, test_sentences,
                                  test_data, id_to_tag, dico_tags)
            print("Score on dev: %.5f" % dev_score)
            print("Score on test: %.5f" % test_score)
            # Only a new best dev score triggers a save
            if dev_score > best_dev:
                best_dev = dev_score
                print("New best score on dev.")
                print("Saving model to disk...")
                model.save()
            if test_score > best_test:
                best_test = test_score
                print("New best score on test.")
    print("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))