In [7]:
import tensorflow as tf
import numpy as np
import functools
import random
import argparse
from multiprocessing import Pool
from input import get_train_data,get_test_data,get_final_data
tf.logging.set_verbosity(tf.logging.INFO)

WORD_DIM = 313
MAX_SEQ_LEN = 30
NUM_CLASSES = 9
BATCH_SIZE = 256
NUM_HIDDEN = 256
NUM_LAYERS = 2
NUM_EPOCH = 2000


In [9]:
"""
Load data into memory 
"""
print "loading data into memory"
pool = Pool(processes=3)
train_result = pool.apply_async(get_train_data)
test_a_result = pool.apply_async(get_test_data)
test_b_result = pool.apply_async(get_final_data)

test_inp, test_out = test_a_result.get()
print "test_a data loaded"

final_inp, final_out = test_b_result.get()
print "test_b data loaded"

train_inp, train_out = train_result.get()
print "train data loaded"

loading data into memory
test_a data loaded
test_b data loaded
train data loaded


In [10]:
"""
Analyse the data properly
"""
print([len(f) for f in train_out[:10]])

no_of_batches = (len(train_inp) + BATCH_SIZE - 1) / BATCH_SIZE

[30, 30, 30, 30, 30, 30, 30, 30, 30, 30]


In [None]:
"""
Create the tensorflow model used to train the NER reacogniser
"""
def lazy_property(function):
    attribute = '_' + function.__name__
    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper


class Model():

    def __init__(self, data, target, dropout, num_hidden, num_layers):
        self.data = data
        self.target = target
        self.dropout = dropout
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def prediction(self):
        rnn_cell = tf.nn.rnn_cell
        # Try: LSTMBlock cell or GruBlock cell
        fw_cell = rnn_cell.LSTMCell(self._num_hidden, state_is_tuple=True)
        bw_cell = rnn_cell.LSTMCell(self._num_hidden, state_is_tuple=True)

        if self._num_layers > 1:
            fw_cell = rnn_cell.MultiRNNCell([fw_cell] * self._num_layers, state_is_tuple=True)
            fw_cell = rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=self.dropout)
            bw_cell = rnn_cell.MultiRNNCell([bw_cell] * self._num_layers, state_is_tuple=True)
            bw_cell = rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=self.dropout)
        else:
            fw_cell = rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=self.dropout)
            bw_cell = rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=self.dropout)

        # Try: Dynamic Bidirectional RNN
        output, _, _ = tf.nn.bidirectional_rnn(fw_cell, 
                                               bw_cell, 
                                               tf.unpack(tf.transpose(self.data, perm=[1, 0, 2])), 
                                               dtype=tf.float32, 
                                               sequence_length=self.length)


        max_length = int(self.target.get_shape()[1])
        num_classes = int(self.target.get_shape()[2])
        weight, bias = self._weight_and_bias(2*self._num_hidden, num_classes)
        output = tf.reshape(tf.transpose(tf.pack(output), perm=[1, 0, 2]), [-1, 2 * self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
        prediction = tf.reshape(prediction, [-1, max_length, num_classes])
        return prediction


    @lazy_property
    def length(self):
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length


    @lazy_property
    def cost(self):
        cross_entropy = self.target * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2) # Summarize the values of 2 axis
        
        # Check if the maximum value on the secondary axis is positive or negative
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2)) 
        cross_entropy *= mask # Ensure the cross_entropy is positive (by multiplying either with -1 or 1)
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1) # Summarize the values on the primary axis
        cross_entropy /= tf.cast(self.length, tf.float32) # Convert all dimensions of the vector to 32float.
        return tf.reduce_mean(cross_entropy) # Reduce the vector to the mean value on all dimensions


    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(0.003)
        return optimizer.minimize(self.cost)


    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))

        mistakes = tf.cast(mistakes, tf.float32)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        mistakes *= mask
        # Average over actual sequence lengths.
        mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
        mistakes /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(mistakes)

    @staticmethod
    def _weight_and_bias(in_size,out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

    @lazy_property
    def getpredf1(self):
        return self.prediction, self.length
 

In [17]:
from pprint import pprint
"""
Define the methods needed to train the model
"""
def f1(prediction,target,length):
    tp=np.array([0]*(NUM_CLASSES+1))
    fp=np.array([0]*(NUM_CLASSES+1))
    fn=np.array([0]*(NUM_CLASSES+1))

    target = np.argmax(target, 2)
    prediction = np.argmax(prediction, 2)


    for i in range(len(target)):
        for j in range(length[i]):
            if target[i][j] == prediction[i][j]:
                tp[target[i][j]] += 1
            else:
                fp[target[i][j]] += 1
                fn[prediction[i][j]] += 1

    NON_NAMED_ENTITY = 11
    for i in range(NUM_CLASSES):
        if i != NON_NAMED_ENTITY:
            tp[NUM_CLASSES] += tp[i]
            fp[NUM_CLASSES] += fp[i]
            fn[NUM_CLASSES] += fn[i]

    precision = []
    recall = []
    fscore = []
    for i in range(NUM_CLASSES+1):
        precision.append(tp[i]*1.0/(tp[i]+fp[i]))
        recall.append(tp[i]*1.0/(tp[i]+ fn[i]))
        fscore.append(2.0*precision[i]*recall[i]/(precision[i]+recall[i]))

    print "precision = {}".format(["{:10.4f}%".format(f) for f in precision])
    print "recall = {}".format(["{:10.4f}%".format(f) for f in recall])
    print "f1score = {}".format(["{:10.4f}%".format(f) for f in fscore])
                            
    return fscore[NUM_CLASSES]


def train():
    with tf.Graph().as_default():
        print("defining variables")
        data = tf.placeholder(tf.float32,[None, MAX_SEQ_LEN, WORD_DIM])
        target = tf.placeholder(tf.float32, [None, MAX_SEQ_LEN, NUM_CLASSES])
        dropout = tf.placeholder(tf.float32)
        model = Model(data,target,dropout,NUM_HIDDEN,NUM_LAYERS)
        maximum = 0

        with tf.Session() as sess:
            print("initializing variables")
            sess.run(tf.initialize_all_variables())
            saver = tf.train.Saver()
            
            print("starting training")
            for epoch in range(200):
                ptr=0
                for batch_number in range(no_of_batches):
                    batch_inp, batch_out = train_inp[ptr:ptr+BATCH_SIZE], train_out[ptr:ptr+BATCH_SIZE]
                    ptr += BATCH_SIZE
                    sess.run(model.optimize,{data: np.array(batch_inp),
                                             target: np.array(batch_out),
                                             dropout: 0.5})
                if epoch % 10 == 0:
                    save_path = saver.save(sess, "model/model.ckpt")
                    print("Model saved in file: %s" % save_path)
                pred = sess.run(model.prediction, {data: test_inp, target: test_out, dropout: 1})
                pred,length = sess.run(model.getpredf1, {data: test_inp, target: test_out, dropout: 1})
                print "Epoch:" + str(epoch), "TestA score,"
                m = f1(pred,test_out,length)
                if m > maximum:
                    maximum = m
                    save_path = saver.save(sess, "model/model_max.ckpt")
                    print("Max Model saved in file: %s" % save_path)
                    pred = sess.run(model.prediction, {data: final_inp, target: final_out, dropout: 1})
                    pred,length = sess.run(model.getpredf1, {data: final_inp, target: final_out, dropout: 1})
                    print "TestB score,"
                    f1(pred,final_out,length)
                    print"\n\n"

In [18]:
"""
Train the model
"""
train()

defining variables
initializing variables
starting training
Model saved in file: model/model.ckpt
Epoch:0 TestA score,
precision =  ['    0.0000%', '    0.0000%', '    1.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.9015%']
recall =  ['       nan%', '       nan%', '    0.9015%', '       nan%', '       nan%', '       nan%', '       nan%', '       nan%', '       nan%', '    0.9015%']
f1score =  ['       nan%', '       nan%', '    0.9482%', '       nan%', '       nan%', '       nan%', '       nan%', '       nan%', '       nan%', '    0.9015%']
Max Model saved in file: model/model_max.ckpt
TestB score,
precision =  ['    0.0000%', '    0.0000%', '    1.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.0000%', '    0.9164%']
recall =  ['       nan%', '       nan%', '    0.9164%', '       nan%', '       nan%', '       nan%', '       nan%', '       nan%', '       nan%', '    0.9164%']
f1score = 

KeyboardInterrupt: 