# BiLSTM sequence labeling in Tensorflow

This notebook shows how to implement a simple bidirectional LSTM (BiLSTM) sequence labeling model with dropout in TensorFlow.

# Prerequisites

In [9]:
import tensorflow as tf
import numpy as np

In [10]:
import os
def write_graph(name):
    """Snapshot the current default graph to ``log/<name>`` for TensorBoard.

    Args:
        name: Sub-directory of ``log`` that receives the event file.
    """
    logdir = os.path.join('log', name)
    # Passing `graph` makes the writer record it immediately; no Session is
    # needed just to reach the default graph.
    file_writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())
    # Close explicitly so the event file is flushed to disk and its handle is
    # released; previously every call left a writer open.
    file_writer.close()

# Hyperparams

Here we define some random embeddings and hyperparameters just to have an example to work with.

In [11]:
# Vocabulary / embedding table dimensions.
vocab_size = 1000
embedding_size = 50

# Network architecture.
num_layers = 2      # number of stacked BLSTM layers
state_size = 100    # LSTM units per direction
n_classes = 10      # size of the label set

# Probability of keeping a unit when dropout is applied.
keep_prob = 0.5

# Stand-in for pre-trained embeddings: uniform random values in [0, 1).
embedding_vectors = np.random.random_sample(
    size=(vocab_size, embedding_size)
).astype(np.float32)

`embedding_vectors`: [vocab_size x embedding_size]


# Input placeholders and embeddings

In [13]:
# `x` holds a padded batch of token ids; `seqlen` holds one length per sequence
# in the batch. Both batch size and padded length are left dynamic (None).
x = tf.placeholder(tf.int32, [None, None], name='x')
seqlen = tf.placeholder(tf.int32, [None], name='seqlen')

# Embedding table initialised from `embedding_vectors`; trainable=False marks
# it as not trainable, so the optimizer leaves it untouched.
word_embeddings = tf.Variable(
    initial_value=embedding_vectors,
    trainable=False,
    name="word_embeddings")
# Replace every token id in `x` with its embedding vector.
rnn_inputs = tf.nn.embedding_lookup(word_embeddings, x)

# Snapshot the graph built so far for inspection in TensorBoard.
write_graph("step_1")

`rnn_inputs`: [batch_size x max_length x embedding_size]


# BLSTM model


## Defining the LSTM cell

In [14]:
def lstm_cell_with_dropout(state_size, keep_prob):
    """Build a basic LSTM cell wrapped with variational recurrent dropout.

    Args:
        state_size: Number of units in the LSTM cell.
        keep_prob: Keep probability applied to both the cell outputs and the
            recurrent state.

    Returns:
        The `DropoutWrapper` around a fresh `BasicLSTMCell`.
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(num_units=state_size)
    dropout_options = dict(
        output_keep_prob=keep_prob,
        state_keep_prob=keep_prob,
        variational_recurrent=True,
        dtype=tf.float32,
    )
    return tf.contrib.rnn.DropoutWrapper(cell=base_cell, **dropout_options)

## Defining a BLSTM layer

In [15]:
def blstm_layer_with_dropout(inputs, seqlen, state_size, keep_prob, scope):
    """Run one bidirectional LSTM layer with dropout over `inputs`.

    Args:
        inputs: Input tensor fed to both directions of the RNN.
        seqlen: Per-example sequence lengths, passed as `sequence_length`.
        state_size: Number of LSTM units per direction.
        keep_prob: Dropout keep probability for cell outputs and states.
        scope: Variable scope (or scope name) under which the layer is built.

    Returns:
        Forward and backward outputs concatenated along the last axis, so the
        final dimension is ``2 * state_size``.
    """
    # Each direction needs its own cell object. Passing a single instance as
    # both cell_fw and cell_bw makes the forward and backward passes reuse the
    # same LSTM weights and the same variational dropout masks (older TF
    # releases reject the reuse outright), so the layer is not a true BLSTM.
    cell_fw = lstm_cell_with_dropout(state_size, keep_prob)
    cell_bw = lstm_cell_with_dropout(state_size, keep_prob)
    (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=inputs,
        sequence_length=seqlen,
        dtype=tf.float32,
        scope=scope)
    return tf.concat([output_fw, output_bw], axis=-1)

## Adding layers to the graph

In [16]:
# Stack `num_layers` BLSTM layers: each layer consumes the previous layer's
# output. NOTE: `rnn_inputs` is rebound here, so re-running this cell without
# re-running the embedding cell stacks additional layers on top.
for layer_idx in range(num_layers):
    layer_name = "BLSTM-%d" % layer_idx
    with tf.name_scope(layer_name) as layer_scope:
        rnn_inputs = blstm_layer_with_dropout(
            inputs=rnn_inputs,
            seqlen=seqlen,
            state_size=state_size,
            keep_prob=keep_prob,
            scope=layer_scope)

# Snapshot the graph including the recurrent layers.
write_graph("step_2")


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


# Making predictions

In [17]:
# Dynamic padded length of the current batch (second dimension of `x`).
max_length = tf.shape(x)[1]

with tf.name_scope('logits'):
    # Flatten batch and time into one axis so a single dense layer scores
    # every position; the last dimension is 2 * state_size because forward and
    # backward outputs were concatenated.
    logit_inputs = tf.reshape(rnn_inputs, [-1, 2 * state_size])
    logits = tf.layers.dense(logit_inputs, n_classes)
    # Restore the per-sequence layout: one row of class scores per position.
    logits = tf.reshape(logits, [-1, max_length, n_classes])
# Index of the highest-scoring class at every position.
predictions = tf.argmax(logits, axis=-1, name="predictions")

# Snapshot the graph including the output layer.
write_graph("step_3")

Instructions for updating:
Use keras.layers.dense instead.


# Defining a loss function and optimizer

In order to actually train the network we need a loss function to optimize and an optimization method. You can either carefully choose the optimizer and learning rate, or you can use Adam with the Karpathy constant (3e-4) as the learning rate, like we do here. (JK, this is important to tune.)

In [18]:
# Gold label ids, laid out like `x`: one label per (padded) position.
labels = tf.placeholder(tf.int32, [None, None], name='labels')

with tf.name_scope('loss'):
    # Cross-entropy for every position, padding included.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, 
        logits=logits, 
        name="cross_entropy")
    # Boolean mask that is True only for positions before each sequence's
    # length, so padded positions can be dropped from the loss.
    seqlen_mask = tf.sequence_mask(
        lengths=seqlen, 
        maxlen=max_length, 
        name='sequence_mask')
    loss = tf.boolean_mask(loss, mask=seqlen_mask)
    # Average over the remaining (real) positions only.
    loss = tf.reduce_mean(loss, name="mean_loss")

with tf.name_scope('train_step'):
    # Adam with a fixed learning rate of 3e-4.
    train_step = tf.train.AdamOptimizer(learning_rate=3e-4).minimize(loss)

# Snapshot the graph including the loss and optimizer ops.
write_graph("step_4")

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# Monitoring the training loss

In [19]:
# Scalar summary of the mean masked cross-entropy, written to TensorBoard
# during training. Only the loss is tracked here; no accuracy op is defined.
train_summ = tf.summary.scalar("cross_entropy", loss)

# Training loop

Here we make some fake data to demo a training loop.

In [20]:
n_examples = 1000
min_sequence_length = 5
max_sequence_length = 40
batch_size = 100

# np.random.randint excludes its upper bound, so add 1 to let sequence lengths
# cover the full inclusive range [min_sequence_length, max_sequence_length].
# Without the +1 no example ever fills the last column of the padded arrays.
sequence_lengths = np.random.randint(
    min_sequence_length, max_sequence_length + 1, size=n_examples)

# Zero-padded token ids and labels, one row per example.
X = np.zeros([n_examples, max_sequence_length], dtype=np.int32)
train_labels = np.zeros([n_examples, max_sequence_length], dtype=np.int32)

# Fill only the first `length` positions of each row with random tokens and
# labels; the remaining positions stay 0 and are masked out of the loss.
for i, length in enumerate(sequence_lengths):
    X[i, :length] = np.random.randint(vocab_size, size=length)
    train_labels[i, :length] = np.random.randint(n_classes, size=length)

There is no connection between input and labels, since they are both random, but the network is happy to just memorize the data.

In [22]:
# Number of full passes over the fake training set.
n_epochs = 2

def data_gen():
    """Yield consecutive mini-batches of the fake training data.

    Yields:
        Tuples ``(X_batch, lengths, labels_batch)`` holding up to `batch_size`
        rows of `X`, `sequence_lengths` and `train_labels`. The final batch is
        shorter when the number of examples is not a multiple of `batch_size`.

    Raises:
        ValueError: If `batch_size` is 0 (raised by `range`), instead of
            yielding empty batches forever.
    """
    # A stepped range replaces the hand-maintained batch counter and offset.
    for start in range(0, len(sequence_lengths), batch_size):
        batch = slice(start, start + batch_size)
        yield (X[batch], sequence_lengths[batch], train_labels[batch])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter(os.path.join('log', 'train'), sess.graph)
    # try/finally guarantees the event file is flushed and closed even when a
    # training step raises; previously an exception skipped close().
    try:
        step = 0  # global step used as the x-axis of the TensorBoard summary
        for epoch in range(n_epochs):
            for (X_batch, lengths, labels_batch) in data_gen():
                feed_dict = {x: X_batch, seqlen: lengths, labels: labels_batch}
                # One optimizer update plus the loss summary for this batch.
                _, summ = sess.run([train_step, train_summ], feed_dict=feed_dict)
                train_writer.add_summary(summ, step)
                step += 1
    finally:
        train_writer.close()

In [28]:
# Load a predictor from an exported SavedModel directory.
# NOTE(review): the export directory is an empty string and no visible cell
# exports a SavedModel — TODO export the trained model and point this at the
# real export directory before running this cell.
predictor = tf.contrib.predictor.from_saved_model("")

In [None]:
# Run the predictor on a single example from the fake data set.
# The original cell did not parse (`input = ` had no right-hand side), shadowed
# the builtin `input`, and referenced an `input_dict` that was commented out.
example_idx = 0
example_len = int(sequence_lengths[example_idx])
# NOTE(review): the keys presumably must match the input names of the exported
# signature; 'x' and 'seqlen' mirror the placeholder names — confirm against
# the SavedModel actually loaded into `predictor`.
input_dict = {
    'x': X[example_idx:example_idx + 1, :example_len],  # batch of one, unpadded
    'seqlen': [example_len],
}
output = predictor(input_dict)