In [0]:
import tensorflow as tf
import gensim
import numpy as np
import math

In [0]:
corpus = open('train.txt',encoding="utf8").readlines()
text_corpus = open('valid.txt',encoding="utf8").readlines()

In [0]:
def word2vec(corpus):
  sentence = []
  point = []
  for line in corpus:
      stripped_line = line.strip().split(' ')
      point.append(stripped_line)
      if line == '\n':
          sentence.append(point[:-1])
          point = []
  sentence = sentence[:-1]
  text_sentence = [[c[0] for c in x1]for x1 in sentence]
  tags = [[c[-1] for c in y] for y in sentence]
  list_sequence_length = [len(s) for s in sentence]
  #build word2vec model
  model = gensim.models.Word2Vec(text_sentence,size=150,window=10,min_count=1,workers=10)
  model.train(text_sentence, total_examples=len(text_sentence), epochs=10)
  return model,text_sentence,tags,list_sequence_length

In [0]:
def encode_tag(tag):
    if 'B-MISC' in tag:
        return 1
    elif 'I-MISC' in tag:
        return 2
    elif 'B-PER' in tag:
        return 3
    elif 'I-PER' in tag:
        return 4
    elif 'B-LOC' in tag:
        return 5
    elif 'I-LOC' in tag:
        return 6
    elif 'B-ORG' in tag:
        return 7
    elif 'I-ORG' in tag:
        return 8
    else:
        return 0

In [0]:
def pad(sentence, max_length,dim,isTg = False):
    pad_len = max_length - len(sentence)
    padding = np.zeros(pad_len)
    if isTg == False:
        padding = [np.zeros(dim) for i in range(0,pad_len)]
    return np.concatenate((sentence, padding))

In [0]:
len(list_sequence_length_train)

14986

In [0]:
def batch(data,labels,batch_size,sequence_length):
    n_batch = int(math.ceil(14986/batch_size)) # 14986 is number of sentence in X_trains
    index = 0
    for _ in range(n_batch):
        batch_sequence_lengths = np.array(sequence_length[index:index+batch_size])
        if(batch_sequence_lengths.size > 0):
            max_sequence_length_in_batch = max(batch_sequence_lengths)
        batch_data = np.array([x for x in data[index:index+batch_size]])
        batch_labels = [y for y in labels[index:index+batch_size]]
        index += batch_size
        #batch_data = batch_data.reshape(-1, max_sequence_length_in_batch, input_size)
        yield batch_data,batch_labels,max_sequence_length_in_batch,batch_sequence_lengths
    

In [0]:
model,text_sentence,tags,list_sequence_length_train = word2vec(corpus)
test_model,test_text_sentence,test_tag,list_sequence_length_test = word2vec(text_corpus)
en_tag = [[encode_tag(t) for t in tag] for tag in tags]
en_test_tag = [[encode_tag(t) for t in tag] for tag in test_tag]

In [0]:
# Bidirectional LSTM + CRF model.
learning_rate = 0.001
training_epochs = 100
input_size = 150
label_size = 9
batch_size = 32
num_units = 128 # the number of units in the LSTM cell
number_of_classes = 9
max_length = 150

x_trains = [pad([model[w] for w in s],max_length,input_size) for s in text_sentence]
x_test = [pad([test_model[w] for w in s],max_length,input_size) for s in test_text_sentence]
y_trains = [pad(t,max_length,input_size,True) for t in en_tag]
y_test = [pad(t,max_length,input_size,True) for t in en_test_tag]

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [0]:
input_data = tf.placeholder(tf.float32, [None, None, input_size], name="input_data") # shape = (batch, batch_seq_len, input_size)
labels = tf.placeholder(tf.int32, shape=[None, None], name="labels") # shape = (batch, sentence)
batch_sequence_length = tf.placeholder(tf.int32) # max sequence length in batch
original_sequence_lengths = tf.placeholder(tf.int32, [None])

# Scope is mandatory to use LSTMCell (https://github.com/tensorflow/tensorflow/issues/799).
with tf.name_scope("BiLSTM"):
    with tf.variable_scope('forward'):
        lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
    with tf.variable_scope('backward'):
        lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
    (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell, 
                                                                     cell_bw=lstm_bw_cell, 
                                                                     inputs=input_data,
                                                                     sequence_length=original_sequence_lengths, 
                                                                     dtype=tf.float32,
                                                                     scope="BiLSTM")

# As we have a Bi-LSTM, we have two outputs which are not connected, so we need to merge them.
outputs = tf.concat([output_fw, output_bw], axis=2)

# Fully connected layer.
W = tf.get_variable(name="W", shape=[2 * num_units, number_of_classes],
                dtype=tf.float32)

b = tf.get_variable(name="b", shape=[number_of_classes], dtype=tf.float32,
                initializer=tf.zeros_initializer())

outputs_flat = tf.reshape(outputs, [-1, 2 * num_units])
pred = tf.matmul(outputs_flat, W) + b
#scores = tf.reshape(pred, [-1, batch_sequence_length, number_of_classes])
scores = tf.reshape(pred, [-1, tf.shape(outputs)[1], number_of_classes])

In [0]:
# Linear-CRF.
#dense_y = tf.argmax(labels, -1, output_type=tf.int32)
log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(scores, labels, original_sequence_lengths)

loss = tf.reduce_mean(-log_likelihood)

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=scores, labels=labels)
# shape = (batch, sentence, nclasses)
mask = tf.sequence_mask(original_sequence_lengths)
# apply mask
losses = tf.boolean_mask(losses, mask)

loss = tf.reduce_mean(losses)
# Compute the viterbi sequence and score (used for prediction and test time).
viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(scores, transition_params, original_sequence_lengths)

# Training ops.
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [0]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(training_epochs):
        for batch_data,batch_labels,max_sequence_length,batch_sequence_lengths in batch(data=x_trains,labels=y_trains,
                                                                                        batch_size=batch_size,
                                                                                        sequence_length=list_sequence_length_train):
            if(batch_data.shape != (0,)):
                tf_viterbi_sequence,_ = session.run([viterbi_sequence,train_op],
                                                    feed_dict={input_data:batch_data,
                                                               labels:batch_labels,
                                                               batch_sequence_length:max_sequence_length,
                                                               original_sequence_lengths:batch_sequence_lengths})
        if i% 10 ==0:
                # Create a mask to fix input lengths.
                total_labels = np.sum(batch_sequence_lengths)
                mask = (np.expand_dims(np.arange(150), axis=0) <
                    np.expand_dims(batch_sequence_lengths, axis=1))
                correct_labels = np.sum((batch_labels == tf_viterbi_sequence) * mask)
                accuracy = 100.0 * correct_labels / float(total_labels)
                print("Epoch: %d" % i, "Accuracy: %.2f%%" % accuracy)
        
    print("Test Accuracy: %.2f%%" % )
    
    print("Finished")
            
        

[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 0 Accuracy: 87.65%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 10 Accuracy: 88.34%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 20 Accuracy: 89.37%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 30 Accuracy: 91.08%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 40 Accuracy: 90.05%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 50 Accuracy: 90.39%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 60 Accuracy: 90.39%
[20 31 35 36 19 22 24 32  1  7  2 21 13 10 14 19 23 17 17 16 16 10 21 19
  7  1  8  2  5 39 33 43]
Epoch: 70 Accuracy: 