In [8]:
from __future__ import print_function
import numpy as np

import tensorflow as tf
from tensorflow.contrib import rnn

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


# Static LSTM

In [1]:
# Clear the default graph before (re)building the static-LSTM model.
tf.reset_default_graph()

# ---- Training parameters ------------------------------------------------
learning_rate = 0.001   # step size handed to the optimizer
training_steps = 10000  # number of mini-batch updates
batch_size = 128        # examples per mini-batch
display_step = 200      # report loss/accuracy every `display_step` steps

# ---- Network parameters -------------------------------------------------
# Each 28x28 MNIST image is read as a sequence of 28 rows of 28 pixels.
num_input = 28     # features fed to the cell at each time step (one image row)
timesteps = 28     # sequence length (number of image rows)
num_hidden = 128   # size of the LSTM hidden state
num_classes = 10   # MNIST digits 0-9

# ---- Graph inputs -------------------------------------------------------
# X: (batch, timesteps, num_input); Y: one-hot labels (batch, num_classes)
X = tf.placeholder(tf.float32, shape=[None, timesteps, num_input])
Y = tf.placeholder(tf.float32, shape=[None, num_classes])

# ---- Output layer parameters (last hidden state -> class scores) --------
weights = {'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))}
biases = {'out': tf.Variable(tf.random_normal([num_classes]))}


def RNN(x, weights, biases):
    """Run a statically unrolled LSTM over `x` and project its last output to class scores.

    Args:
        x: input tensor; unstacked along axis 1 into `timesteps` slices, so it is
            expected to be (batch_size, timesteps, n_input).
        weights: dict whose 'out' entry is the output projection matrix.
        biases: dict whose 'out' entry is the output projection bias.

    Returns:
        Un-normalized class scores: the last time step's output multiplied by
        weights['out'] plus biases['out'].
    """
    # static_rnn consumes a Python list with one (batch_size, n_input) tensor
    # per time step, so split the time axis (axis 1) into `timesteps` pieces.
    step_inputs = tf.unstack(x, timesteps, 1)

    # One LSTM cell, reused at every time step of the unrolled network.
    cell = rnn.LSTMCell(num_hidden, forget_bias=1.0)

    # `step_outputs` is a list with one output tensor per time step.
    step_outputs, _ = rnn.static_rnn(cell, step_inputs, dtype=tf.float32)

    # Linear read-out of the final time step only.
    last_output = step_outputs[-1]
    return tf.matmul(last_output, weights['out']) + biases['out']

# Logits are the un-normalized class scores produced by the model; a softmax
# turns them into a probability for each possible class.
logits = RNN(X, weights, biases)
prediction = tf.nn.softmax(logits)

# Loss: mean softmax cross-entropy over the batch (computed from the raw logits).
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                           labels=Y)
loss_op = tf.reduce_mean(per_example_loss)

# Plain gradient descent on that loss.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Accuracy: fraction of examples whose most probable class matches the label.
predicted_class = tf.argmax(prediction, 1)
true_class = tf.argmax(Y, 1)
correct_pred = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that assigns every variable its initial value.
init = tf.global_variables_initializer()


# Train the static LSTM, then report accuracy on a slice of the test set.
with tf.Session() as sess:

    # Give every variable its initial value.
    sess.run(init)

    for step in range(1, training_steps + 1):
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        # Turn each flat image into 28 rows of 28 pixels:
        # batch_x becomes an ndarray of shape (batch_size, 28, 28).
        batch_x = batch_x.reshape((batch_size, timesteps, num_input))
        feed = {X: batch_x, Y: batch_y}

        # One optimization step (backprop).
        sess.run(train_op, feed_dict=feed)

        if step == 1 or step % display_step == 0:
            # Loss and accuracy on the mini-batch that was just trained on.
            loss, acc = sess.run([loss_op, accuracy], feed_dict=feed)
            message = ("Step " + str(step) + ", Minibatch Loss= " +
                       "{:.4f}".format(loss) + ", Training Accuracy= " +
                       "{:.3f}".format(acc))
            print(message)

    print("Optimization Finished!")

    # Accuracy on the first 128 MNIST test images.
    test_len = 128
    test_data = mnist.test.images[:test_len]
    test_data = test_data.reshape((-1, timesteps, num_input))
    test_label = mnist.test.labels[:test_len]
    test_accuracy = sess.run(accuracy, feed_dict={X: test_data, Y: test_label})
    print("Testing Accuracy:", test_accuracy)

### Understand:
 * swapping rnn.BasicLSTMCell() for rnn.LSTMCell() also works
 * tf.unstack()
 * an RNN network can be split into two parts:
     * lstm_cell = rnn.LSTMCell(num_hidden, forget_bias=1.0): builds the cell for a single time step (not yet unrolled)
     * outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32): introduces the time steps, i.e. unrolls the cell above over the sequence

 * LSTMCell()
     * num_units
     * num_proj: output projection
     * activation: for inner states: tanh
 * static_rnn:
     * output: list of time step size
     * state: only final state

### Question:
 * how the RNN updates time step by time step; this detail is hidden by the code
 * what if the sentences have different lengths?
     * static_rnn vs. dynamic_rnn
 * 

# Dynamic_LSTM

In [None]:
def dynamicRNN(x, seqlen, weights, biases):
    """LSTM over variable-length sequences; returns scores from each sequence's last valid step.

    Args:
        x: padded input tensor; unstacked along axis 1 into `seq_max_len`
            slices, so it is expected to be (batch_size, seq_max_len, n_input).
        seqlen: per-example sequence lengths, passed to static_rnn as
            `sequence_length` and used to locate each example's last valid output.
        weights: dict whose 'out' entry is the output projection matrix.
        biases: dict whose 'out' entry is the output projection bias.

    Returns:
        Class scores computed from the output at step `seqlen - 1` of every example.
    """
    # static_rnn wants a list of `seq_max_len` tensors of shape (batch_size, n_input).
    step_inputs = tf.unstack(x, seq_max_len, 1)

    cell = tf.contrib.rnn.BasicLSTMCell(n_hidden)

    # Supplying `sequence_length` makes the computation length-aware per example.
    step_outputs, _ = tf.contrib.rnn.static_rnn(cell, step_inputs,
                                                dtype=tf.float32,
                                                sequence_length=seqlen)

    # The output we need sits at a different time step for every example, so it
    # cannot simply be taken from the end of the list. Stack the per-step
    # outputs into one tensor and make it batch-major:
    # (seq_max_len, batch, n_hidden) -> (batch, seq_max_len, n_hidden).
    batch_major = tf.transpose(tf.stack(step_outputs), [1, 0, 2])

    # Indexing trick: flatten to (batch * seq_max_len, n_hidden); the row for
    # example i at step t is then i * seq_max_len + t, with t = seqlen - 1.
    n_examples = tf.shape(batch_major)[0]
    flat_outputs = tf.reshape(batch_major, [-1, n_hidden])
    last_step_rows = tf.range(0, n_examples) * seq_max_len + (seqlen - 1)
    last_outputs = tf.gather(flat_outputs, last_step_rows)

    # Linear read-out of the gathered outputs.
    return tf.matmul(last_outputs, weights['out']) + biases['out']

 * use static_rnn + sequence_length
 * may need a function to extract the last valid output, since the sequences in a batch have different lengths
 * the other part can be the same

In [None]:
def BiRNN(x, weights, biases):
    """Statically unrolled bidirectional LSTM; projects the last step's output to class scores.

    Args:
        x: input tensor; unstacked along axis 1 into `timesteps` slices, so it is
            expected to be (batch_size, timesteps, num_input).
        weights: dict whose 'out' entry is the output projection matrix.
            NOTE(review): the bidirectional outputs presumably concatenate the
            forward and backward states, so this matrix would need
            2 * num_hidden rows - confirm against the caller.
        biases: dict whose 'out' entry is the output projection bias.

    Returns:
        tf.matmul(outputs[-1], weights['out']) + biases['out'].
    """
    # One (batch_size, num_input) tensor per time step.
    step_inputs = tf.unstack(x, timesteps, 1)

    # Separate cells for the forward and the backward pass.
    forward_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
    backward_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

    try:
        outputs, _, _ = rnn.static_bidirectional_rnn(
            forward_cell, backward_cell, step_inputs, dtype=tf.float32)
    except Exception:  # Old TensorFlow version only returns outputs not states
        outputs = rnn.static_bidirectional_rnn(
            forward_cell, backward_cell, step_inputs, dtype=tf.float32)

    # Linear read-out of the last time step.
    last_output = outputs[-1]
    return tf.matmul(last_output, weights['out']) + biases['out']

 * use lstm_fw_cell + lstm_bw_cell + static_bidirectional_rnn
 * the other part can be the same
 * the output format is a list over time steps: [tensor_1, ..., tensor_last_time_step]
 * tensorflow receives the data in batches and computes the whole batch together
 * 

# CRF-BiLSTM
 * need to pad both so that the word-embedding and character-embedding inputs have the same length.
 * put the time dimension on axis=1 for dynamic_rnn

### build concatenated word embedding

 * tf.reshape()
   * x*y*z == x1*y1*z1 == a*b; product stay constant
   * -1, that dimension be computed according
   * [-1] : flatten
   
 * the character-level sub-LSTM outputs the embeddings for all the words in the batch at once
     * dynamic_rnn needs an input where
         * shape[1] is the time axis (one element x_i per time step)
         * shape[0] is the batch size
         * shape[2] is the dimension of x_i
         * the states are then computed for the whole batch together.
 * when coding in TensorFlow, keep track of the tensor shapes at every step, as the comments in the code below do
 * Thanks to the sequence_length argument, for the invalid (padded) time steps dynamic_rnn passes the state through and outputs a vector of zeros.
     * so the output of dynamic_rnn still has the same length (== max_length), with zero vectors at the invalid time steps.
     * the network is fully unrolled for the given (passed in) length(s) of the sequence(s) or completely unrolled if length(s) is not given.
 

In [None]:
# NOTE(review): `embeddings`, `nchars`, `dim_char` and `char_hidden_size` are not
# defined in this cell; they must be provided before it is run.

# ---- pretrained word embeddings for a batch of sentences -------------------
# Padded word ids. shape = (batch size, max length of sentence in batch)
word_ids = tf.placeholder(tf.int32, shape=[None, None])
# True (unpadded) sentence lengths. shape = (batch size)
sequence_lengths = tf.placeholder(tf.int32, shape=[None])
# Load the pretrained embedding matrix as a frozen (non-trainable) lookup table.
L = tf.Variable(embeddings, dtype=tf.float32, trainable=False)
# shape = (batch, sentence, word_vector_size)
pretrained_embeddings = tf.nn.embedding_lookup(L, word_ids)


# ---- untrained character embeddings, per word of each sentence -------------
# Padding is dynamic: sentences and words are padded to the maximum length in
# the batch, so both padded lengths depend on the batch.
# shape = (batch size, max length of sentence, max length of word)
char_ids = tf.placeholder(tf.int32, shape=[None, None, None])
# True (unpadded) word lengths. shape = (batch_size, max_length of sentence)
word_lengths = tf.placeholder(tf.int32, shape=[None, None])
# 1. randomly initialised character embeddings (tf.get_variable's default initializer)
K = tf.get_variable(name="char_embeddings", dtype=tf.float32,
    shape=[nchars, dim_char])
# shape = (batch, sentence, word, dim of char embeddings)
char_embeddings = tf.nn.embedding_lookup(K, char_ids)


# ---- character-level Bi-LSTM: one vector per word ---------------------------
# 2. dynamic_rnn needs a 3-D input with time on axis=1, so merge the batch and
#    sentence axes: every word becomes one sequence of characters.
s = tf.shape(char_embeddings) # store old shape
# shape = (batch x sentence, word, dim of char embeddings)
char_embeddings = tf.reshape(char_embeddings, shape=[-1, s[-2], s[-1]])
# Flatten the lengths the same way. This is a module-level script, so there is
# no `self`; a separate name keeps the `word_lengths` placeholder feedable.
# shape = (batch x sentence)
word_lengths_flat = tf.reshape(word_lengths, shape=[-1])
# 3. bi lstm on chars
cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
# Keep only the second element of each direction's final state tuple (the
# final output h); the per-step outputs are discarded.
_, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw,
    cell_bw, char_embeddings, sequence_length=word_lengths_flat,
    dtype=tf.float32)
# shape = (batch x sentence, 2 x char_hidden_size)
output = tf.concat([output_fw, output_bw], axis=-1)
# Restore the sentence axis. shape = (batch, sentence, 2 x char_hidden_size)
char_rep = tf.reshape(output, shape=[-1, s[1], 2*char_hidden_size])


# ---- final word representation ----------------------------------------------
# shape = (batch, sentence, 2 x char_hidden_size + word_vector_size)
word_embeddings = tf.concat([pretrained_embeddings, char_rep], axis=-1)

## sentence-level LSTM
* in the character BiLSTM we extract only the last state vs. in the word BiLSTM we need the outputs from every time step.
   * output_states: A tuple (output_state_fw, output_state_bw) containing the forward and the backward final states of bidirectional rnn.
   * the output is also the h in the LSTM

In [None]:
# Sentence-level Bi-LSTM over the concatenated word representations.
# Unlike the character-level LSTM, the outputs of every time step are kept.
cell_fw = tf.contrib.rnn.LSTMCell(hidden_size)
cell_bw = tf.contrib.rnn.LSTMCell(hidden_size)

bilstm_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, word_embeddings,
    sequence_length=sequence_lengths, dtype=tf.float32)
output_fw, output_bw = bilstm_outputs

# Concatenate both directions on the feature axis.
# expected shape = (batch, sentence, 2 x hidden_size)
context_rep = tf.concat([output_fw, output_bw], axis=-1)

## CRF 
 * for each word that has a vector representation, use a fully connected layer (FCNN) to compute a vector of scores, one for each **"possible tag"**
      * need to know the number of possible tags (ntags)
 * from here there are two methods for decoding
     * softmax: a local decision for each word
     * linear-chain CRF: makes use of the neighbouring tagging decisions.
 * could use tf.sequence_mask to deal with the invalid padding in the output scores
 * stacking the CRF on top of the LSTM takes essentially one line

In [None]:
# ---- tag scores for every word ----------------------------------------------
# A fully connected layer maps each word's context vector to one score per tag.
# This is a module-level script, so the sizes are the plain names `hidden_size`
# and `ntags` (there is no `self.config` here), matching the reshapes below.
W = tf.get_variable("W", shape=[2*hidden_size, ntags],
                dtype=tf.float32)

b = tf.get_variable("b", shape=[ntags], dtype=tf.float32,
                initializer=tf.zeros_initializer())

# Padded sentence length of the current batch (a scalar tensor).
ntime_steps = tf.shape(context_rep)[1]
# Merge batch and sentence axes so a single matmul scores every word.
# shape = (batch x sentence, 2 x hidden_size)
context_rep_flat = tf.reshape(context_rep, [-1, 2*hidden_size])
# shape = (batch x sentence, ntags)
pred = tf.matmul(context_rep_flat, W) + b
# Restore the sentence axis. shape = (batch, sentence, ntags)
scores = tf.reshape(pred, [-1, ntime_steps, ntags])



# ---- CRF loss ---------------------------------------------------------------
# True labels. shape = (batch, sentence)
labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")

# `transition_params` is returned alongside the log-likelihood and is needed
# again later for prediction (decoding).
log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
    scores, labels, sequence_lengths)
loss = tf.reduce_mean(-log_likelihood)

# possible locally softmax loss #########################################
# losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=scores, labels=labels)
# # shape = (batch, sentence)
# mask = tf.sequence_mask(sequence_lengths)
# # apply mask
# losses = tf.boolean_mask(losses, mask)
# loss = tf.reduce_mean(losses)
#########################################################################


# ---- optimizer --------------------------------------------------------------
# This is a module-level script, so there is no `self`: the learning rate is
# fed through its own scalar placeholder and the loss is the `loss` tensor
# defined above.
lr = tf.placeholder(tf.float32, shape=[], name="lr")
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.minimize(loss)






# find the best sequence
# The original snippet passed an undefined `score` to viterbi_decode, which
# works on one sentence's NumPy score matrix outside the graph. Decoding the
# whole padded batch inside the graph needs only tensors that already exist.
# viterbi_sequence: shape = (batch, sentence), best tag index per word
# viterbi_score:    shape = (batch), score of each best sequence
viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
                                scores, transition_params, sequence_lengths)

# labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)



In [None]:
question:
    1. is the output also padded to the same size? will check whether the later code also uses the "sequence_lengths" parameter
      is ntime_steps a scalar or a vector?
       Yes, sequence_lengths is finally used in both the crf and the softmax loss
        

# NER_MODEL

In [None]:
import numpy as np
import os
import tensorflow as tf

class NER_Model(GENERIC_Model):
    """NER tagger: word + char embeddings -> sentence Bi-LSTM -> tag scores -> CRF or softmax.

    NOTE(review): ``GENERIC_Model`` is not defined in the visible notebook. This
    class relies on its base for ``config``, ``sess``, ``add_train_op`` and
    ``initialize_session`` (the ``BaseModel`` cell defines exactly those) -
    confirm which base class name is intended.
    """

    def __init__(self, config):
        """Store the configuration (via the base class) and build the index -> tag map."""
        super(NER_Model, self).__init__(config)

        # Inverse of config.vocab_tags (tag -> index); used to turn predicted
        # tag indices back into tag strings.
        self.idx_to_tag = {idx: tag for tag, idx in
                           self.config.vocab_tags.items()}

    # ---- NER computational graph -------------------------------------------

    def add_placeholders(self):
        """Create the input placeholders of the graph."""
        # shape = [batch_size, max_length of sentence in this batch]
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None],
                                       name="word_ids")

        # shape = [batch_size]
        self.sentence_lengths = tf.placeholder(tf.int32, shape=[None],
                                               name="sentence_lengths")

        # shape = [batch_size, max_length of sentence, max_length of word]
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None],
                                       name="char_ids")

        # shape = [batch_size, max_length of sentence]
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None],
                                           name="word_lengths")

        # shape = [batch_size, max_length of sentence]
        self.labels = tf.placeholder(tf.int32, shape=[None, None],
                                     name="labels")

        # Hyper-parameters, fed as scalars. `dropout` is passed straight to
        # tf.nn.dropout as its second argument (predict_batch feeds 1.0).
        self.dropout = tf.placeholder(tf.float32, shape=[], name="dropout")
        self.lr = tf.placeholder(tf.float32, shape=[], name="lr")

    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """Pad a batch and build the feed dict for one session run.

        Args:
            words: batch of sentences. With ``config.use_chars`` every sentence
                is unzipped into (char_ids, word_ids); otherwise it is used as
                the list of word ids directly.
            labels: optional batch of tag-id sequences (padded with 0).
            lr: optional learning rate.
            dropout: optional value for the dropout placeholder.

        Returns:
            (feed, sentence_lengths): the feed dict and the lengths returned by
            ``pad_sequences`` for the word ids (presumably the unpadded
            sentence lengths - confirm against ``pad_sequences``).
        """
        if self.config.use_chars:
            char_ids, word_ids = zip(*words)
            word_ids, sentence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                                                   nlevels=2)
        else:
            # Without char features the batch itself is the list of word ids.
            word_ids, sentence_lengths = pad_sequences(words, 0)

        feed = {
            self.word_ids: word_ids,
            self.sentence_lengths: sentence_lengths
        }

        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sentence_lengths

    def add_final_embedding_op(self):
        """Build the per-word input representation.

        Looks up the pretrained word embeddings and, when ``config.use_chars``
        is set, concatenates them with the final states of a character-level
        Bi-LSTM. The result (after dropout) is stored in ``self.word_embeddings``.
        """
        # pre-trained word embeddings
        with tf.variable_scope("words"):
            _word_embeddings_lookup_table = tf.Variable(
                self.config.embeddings,
                name="_word_embeddings_lookup_table",
                dtype=tf.float32,
                trainable=self.config.train_embeddings)

            # shape = [batch_size, max_length of sentence in this batch, word_embedding_size]
            word_embeddings = tf.nn.embedding_lookup(
                _word_embeddings_lookup_table,
                self.word_ids,
                name="word_embeddings")

        # character-level sub-LSTM
        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # randomly initialised character embedding table
                char_embedding_table = tf.get_variable(
                    name="char_embedding_table",
                    dtype=tf.float32,
                    shape=[self.config.nchars, self.config.dim_char])

                # shape = [batch, sentence, word, dim_char]
                char_embeddings = tf.nn.embedding_lookup(
                    char_embedding_table, self.char_ids,
                    name="char_embeddings")

                # bidirectional_dynamic_rnn needs time on axis=1, so merge the
                # batch and sentence axes: each word is one char sequence.
                shape = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(
                    char_embeddings, shape=[-1, shape[-2], shape[-1]])
                word_lengths = tf.reshape(self.word_lengths,
                                          shape=[shape[0]*shape[1]])

                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                                                  state_is_tuple=True,
                                                  name="char_Cell_fw")
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                                                  state_is_tuple=True,
                                                  name="char_Cell_bw")
                _output = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, char_embeddings,
                    sequence_length=word_lengths, dtype=tf.float32)

                # Keep the second element of each direction's final state
                # tuple and concatenate the two directions.
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # shape = [batch size, max sentence length, 2 x char hidden size]
                output = tf.reshape(
                    output,
                    shape=[shape[0], shape[1], 2*self.config.hidden_size_char])

                # final representation = word embedding ++ char representation
                word_embeddings = tf.concat([word_embeddings, output], axis=-1)

        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

    def add_LSTM(self):
        """Run the sentence-level Bi-LSTM and store its outputs in ``self.lstm_output``."""
        with tf.variable_scope("Bi-LSTM"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, self.word_embeddings,
                sequence_length=self.sentence_lengths, dtype=tf.float32)
            output = tf.concat([output_fw, output_bw], axis=-1)
            # Kept on the instance because add_FCNN consumes it.
            self.lstm_output = tf.nn.dropout(output, self.dropout)

    def add_FCNN(self):
        """Project every Bi-LSTM output to one score per tag (``self.scores``)."""
        with tf.variable_scope("FCNN"):
            W = tf.get_variable("W", dtype=tf.float32,
                                shape=[2*self.config.hidden_size_lstm,
                                       self.config.ntags])

            b = tf.get_variable("b", shape=[self.config.ntags],
                                dtype=tf.float32,
                                initializer=tf.zeros_initializer())

            # Padded sentence length of the batch (scalar tensor).
            nsteps = tf.shape(self.lstm_output)[1]
            # Merge batch and sentence axes so one matmul scores every word.
            output = tf.reshape(self.lstm_output,
                                shape=[-1, 2*self.config.hidden_size_lstm])
            scores = tf.matmul(output, W) + b
            # shape = [batch, sentence, ntags]
            self.scores = tf.reshape(scores,
                                     shape=[-1, nsteps, self.config.ntags])

    def add_Softmax(self):
        """Without a CRF, predict each word's tag locally as the argmax of its scores."""
        if not self.config.use_crf:
            self.label_preds = tf.cast(tf.argmax(self.scores, axis=-1),
                                       tf.int32)

    def add_loss_op(self):
        """Define ``self.loss``: CRF negative log-likelihood or masked softmax cross-entropy."""
        if self.config.use_crf:
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                self.scores, self.labels, self.sentence_lengths)
            # Needed again at prediction time for Viterbi decoding.
            self.trans_params = trans_params

            self.loss = tf.reduce_mean(-log_likelihood)
        else:
            # local (per-word) prediction
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.labels)
            # drop the padded positions before averaging
            mask = tf.sequence_mask(self.sentence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        # tensorboard
        tf.summary.scalar("loss", self.loss)

    def build(self):
        """Assemble the whole graph, the training op and the session."""
        self.add_placeholders()
        self.add_final_embedding_op()
        self.add_LSTM()
        self.add_FCNN()
        if not self.config.use_crf:
            self.add_Softmax()
        self.add_loss_op()
        # run_epoch fetches `self.merged`; create it once all summaries exist.
        self.merged = tf.summary.merge_all()

        self.add_train_op(self.config.lr_method, self.lr, self.loss,
                          self.config.clip)
        self.initialize_session()

    def predict_batch(self, words):
        """Predict tag ids for a batch whose words are already converted to ids.

        Returns:
            (predictions, sequence_lengths): with a CRF, a list holding one
            Viterbi tag sequence per sentence; otherwise the array produced by
            ``self.label_preds``. ``sequence_lengths`` comes from get_feed_dict.
        """
        fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0)

        if self.config.use_crf:
            # tag scores and transition matrix as NumPy arrays
            scores, trans_params = self.sess.run(
                [self.scores, self.trans_params], feed_dict=fd)

            # Viterbi-decode each sentence on its unpadded scores only.
            viterbi_sequences = []
            for score, sequence_length in zip(scores, sequence_lengths):
                score = score[:sequence_length]
                viterbi_seq, _ = tf.contrib.crf.viterbi_decode(
                    score, trans_params)
                viterbi_sequences.append(viterbi_seq)
            return viterbi_sequences, sequence_lengths

        # softmax: fetch the tensor itself so the result is indexed per
        # sentence (fetching a one-element list would wrap it in a list).
        label_preds = self.sess.run(self.label_preds, feed_dict=fd)
        return label_preds, sequence_lengths

    def predict(self, words_raw):
        """Tag one raw sentence.

        Each raw word is passed through ``config.processing_word``, the
        sentence is run as a batch of one, and the predicted tag indices are
        mapped back to tags with ``self.idx_to_tag``.
        """
        words = [self.config.processing_word(w) for w in words_raw]
        if isinstance(words[0], tuple):
            # words carry (char_ids, word_id): regroup into parallel sequences
            words = zip(*words)
        pred_ids, _ = self.predict_batch([words])
        preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]

        return preds

    # Backward-compatible alias for the original (misspelled) method name.
    predic = predict

    def run_epoch(self, train_set, eval_set, epochs):
        """Run one pass over ``train_set``, then evaluate on ``eval_set``.

        Args:
            train_set: training data, iterated in mini-batches of (words, labels).
            eval_set: data passed to run_evaluate after the training pass.
            epochs: index of the current epoch (not used in the visible code).

        Returns:
            The "f1" entry of the evaluation metrics.
        """
        batch_size = self.config.batch_size
        #### (steps elided in the original notes)

        # NOTE(review): `minibathes` is not defined in the visible source; it
        # looks like a misspelling of a `minibatches` helper - confirm.
        for i, (words, labels) in enumerate(minibathes(train_set, batch_size)):
            fd, _ = self.get_feed_dict(words, labels, self.config.lr,
                                       self.config.dropout)
            _, train_loss, summary = self.sess.run(
                [self.train_op, self.loss, self.merged], feed_dict=fd)

        #### (steps elided in the original notes)

        metrics = self.run_evaluate(eval_set)

        #### (steps elided in the original notes)
        return metrics["f1"]

    def run_evaluate(self, test):
        """Compute token accuracy and chunk-level F1 (both in percent) on ``test``."""
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in minibathes(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(words)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                # compare only the unpadded part of each sentence
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]

                lab_chunks = set(get_chunks(lab, self.config.vocab_tags))
                lab_pred_chunks = set(get_chunks(lab_pred,
                                                 self.config.vocab_tags))

                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        # With no correct chunk every ratio is defined as 0, which also avoids
        # dividing by zero.
        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        # np.mean of an empty list is NaN (with a warning); report 0 instead.
        acc = np.mean(accs) if accs else 0.0

        return {"acc": 100*acc, "f1": 100*f1}

# Basic_Model

In [None]:
class BaseModel(object):
    """Generic training plumbing shared by models: optimizer setup, session and training loop.

    Subclasses are expected to provide ``run_epoch(train, dev, epoch)``, whose
    return value is used as the score to maximise, and ``run_evaluate(test)``.
    """

    def __init__(self, config):
        """Keep the configuration; the session and saver are created later."""
        self.config = config
        self.sess = None
        self.saver = None

    def add_train_op(self, lr_method, lr, loss, clip=-1):
        """Create ``self.train_op`` for the chosen optimizer.

        Args:
            lr_method: optimizer name, case-insensitive: 'adam', 'adagrad',
                'sgd' or 'rmsprop'.
            lr: learning rate passed to the optimizer.
            loss: tensor to minimise.
            clip: when positive, gradients are clipped to this global norm.

        Raises:
            NotImplementedError: if ``lr_method`` is not one of the names above.
        """
        _lr_m = lr_method.lower()  # make the comparison case-insensitive

        with tf.variable_scope("train_step"):
            if _lr_m == 'adam':
                optimizer = tf.train.AdamOptimizer(lr)
            elif _lr_m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr)
            elif _lr_m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr)
            elif _lr_m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr)
            else:
                raise NotImplementedError("Unknown method {}".format(_lr_m))

            if clip > 0:  # gradient clipping if clip is positive
                grads, vs = zip(*optimizer.compute_gradients(loss))
                grads, gnorm = tf.clip_by_global_norm(grads, clip)
                self.train_op = optimizer.apply_gradients(zip(grads, vs))
            else:
                self.train_op = optimizer.minimize(loss)

    def initialize_session(self):
        """Open a session, initialise all variables and create a saver."""
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def close_session(self):
        """Close the session if one was opened (no-op otherwise)."""
        if self.sess is not None:
            self.sess.close()

    def train(self, train, dev):
        """Train for up to ``config.nepochs`` epochs with early stopping.

        Stops once the score returned by ``run_epoch`` has failed to reach the
        best score so far for ``config.nepoch_no_imprv`` consecutive epochs.
        """
        best_score = 0
        nepoch_no_imprv = 0

        for epoch in range(self.config.nepochs):
            score = self.run_epoch(train, dev, epoch)

            # early stopping: a tie with the best score counts as improvement
            if score >= best_score:
                nepoch_no_imprv = 0
                best_score = score
            else:
                nepoch_no_imprv += 1
                if nepoch_no_imprv >= self.config.nepoch_no_imprv:
                    break

    def evaluate(self, test):
        """Evaluate on ``test`` and return the metrics from ``run_evaluate``."""
        metrics = self.run_evaluate(test)
        return metrics

    # Backward-compatible alias for the original (misspelled) method name.
    evalute = evaluate

# mark

In [None]:
import numpy as np
import os
import tensorflow as tf

class NER_Model(GENERIC_Model):
    """Bi-LSTM sequence tagger for NER with an optional CRF output layer.

    Word embeddings (optionally concatenated with a character-level Bi-LSTM
    encoding of each word) are fed to a sentence-level Bi-LSTM, projected to
    per-tag scores and decoded either with a linear-chain CRF or with a
    per-token argmax, depending on ``config.use_crf``.

    The helpers ``pad_sequences``, ``minibathes`` and ``get_chunks`` are not
    defined in this class and must be available in the notebook namespace.
    """

    def __init__(self, config):
        """Store the configuration and build the tag-index -> tag lookup.

        Args:
            config: configuration object; ``config.vocab_tags`` must map each
                tag to its integer index.
        """
        # setting configuration in the generic base class
        super(NER_Model, self).__init__(config)

        # inverse of config.vocab_tags: tag index -> tag, used by predic()
        self.idx_to_tag = {idx: tag for tag, idx in
                           self.config.vocab_tags.items()}

    # ------------------------------------------------------------------
    # NER computational graph
    # ------------------------------------------------------------------

    def add_placeholders(self):
        """Create the input placeholders of the graph."""

        # shape = [batch_size, max_length of sentence in this batch]
        self.word_ids = tf.placeholder(tf.int32, shape = [None, None], name = "word_ids")

        # shape = [batch_size]
        self.sentence_lengths = tf.placeholder(tf.int32, shape = [None], name = "sentence_lengths")

        # shape = [batch_size, max_length of sentence, max_length of word]
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None], name = "char_ids")

        # shape = [batch_size, max_length of sentence]
        self.word_lengths = tf.placeholder(tf.int32, shape = [None, None], name = "word_lengths")

        # shape = [batch_size, max_length of sentence]
        self.labels = tf.placeholder(tf.int32, shape = [None, None], name = "labels")

        # --- modified: hyper parameters are fed as scalar placeholders ---
        # scalar, passed to tf.nn.dropout (1.0 is fed at prediction time)
        self.dropout = tf.placeholder(tf.float32, shape = [], name = "dropout")
        # scalar learning rate
        self.lr = tf.placeholder(tf.float32, shape = [], name = "lr")

    def get_feed_dict(self, words, labels = None, lr = None, dropout = None):
        """Pad a batch and build the feed dict for the placeholders.

        Args:
            words: batch of sentences. When ``config.use_chars`` is set each
                sentence is a ``(char_ids, word_ids)`` pair, otherwise it is
                a sequence of word ids.
            labels: optional batch of tag-id sequences.
            lr: optional learning rate.
            dropout: optional value for the dropout placeholder.

        Returns:
            ``(feed, sentence_lengths)`` where ``sentence_lengths`` holds the
            un-padded length of every sentence in the batch.
        """
        if self.config.use_chars:
            char_ids, word_ids = zip(*words)
            word_ids, sentence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok = 0, nlevels = 2)
        else:
            # without chars the batch already consists of word-id sequences
            word_ids, sentence_lengths = pad_sequences(words, 0)

        # feed dict
        feed = {
            self.word_ids: word_ids,
            self.sentence_lengths: sentence_lengths
        }

        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sentence_lengths

    def add_final_embedding_op(self):
        """Build ``self.word_embeddings``.

        Looks up the pre-trained word vectors and, when ``config.use_chars``
        is set, runs a character-level Bi-LSTM over every word and
        concatenates its final hidden states to the word vector. Dropout is
        applied to the result.
        """

        # pre-trained word embedding
        with tf.variable_scope("words"):

            _word_embeddings_lookup_table = tf.Variable(
                self.config.embeddings,
                name = "_word_embeddings_lookup_table",
                dtype = tf.float32,
                trainable = self.config.train_embeddings)

            # shape = [batch_size, max_length of sentence in this batch, word_embedding_size]
            word_embeddings = tf.nn.embedding_lookup(
                _word_embeddings_lookup_table,
                self.word_ids,
                name = "word_embeddings")

        # run sub-LSTM to get the char embedding
        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # initialise the char embedding table
                char_embedding_table = tf.get_variable(
                    name = "char_embedding_table",
                    dtype = tf.float32,
                    shape = [self.config.nchars, self.config.dim_char])

                # shape = [batch_size, max sentence length, max word length, dim_char]
                char_embeddings = tf.nn.embedding_lookup(char_embedding_table,
                                                         self.char_ids,
                                                         name = "char_embeddings")

                # Flatten (batch, sentence) into one axis so that time (the
                # characters of a word) is axis 1, as bidirectional_dynamic_rnn
                # expects. The last dimension uses the static dim_char rather
                # than a dynamic shape entry so the input depth stays known
                # to shape inference.
                shape = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(char_embeddings,
                                             shape = [-1, shape[-2], self.config.dim_char])
                word_lengths = tf.reshape(self.word_lengths, shape = [shape[0]*shape[1]])

                # run LSTM on chars
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                                                  state_is_tuple = True, name = "char_Cell_fw")
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                                                  state_is_tuple = True, name = "char_Cell_bw")
                _output = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, char_embeddings,
                                                          sequence_length = word_lengths, dtype = tf.float32)

                # keep only the final hidden state (second element of each
                # state tuple) of both directions and concatenate them
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis = -1)

                # shape = [batch size, max sentence length, 2 * char hidden size]
                output = tf.reshape(output, shape = [shape[0], shape[1], 2*self.config.hidden_size_char])

                # concat the final embedding
                word_embeddings = tf.concat([word_embeddings, output], axis = -1)

        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

    def add_LSTM(self):
        """Run the sentence-level Bi-LSTM and store its output (after
        dropout) in ``self.lstm_output``."""
        with tf.variable_scope("Bi-LSTM"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, self.word_embeddings,
                sequence_length = self.sentence_lengths, dtype = tf.float32)
            output = tf.concat([output_fw, output_bw], axis = -1)
            # kept on self so add_FCNN can read it
            self.lstm_output = tf.nn.dropout(output, self.dropout)

    def add_FCNN(self):
        """Project the Bi-LSTM output to per-tag scores (``self.scores``)."""
        with tf.variable_scope("FCNN"):
            W = tf.get_variable("W", dtype = tf.float32,
                                shape = [2*self.config.hidden_size_lstm, self.config.ntags])

            b = tf.get_variable("b", shape = [self.config.ntags],
                                dtype = tf.float32, initializer = tf.zeros_initializer())

            # flatten to 2-D for the matmul, then restore the time axis
            nsteps = tf.shape(self.lstm_output)[1]
            output = tf.reshape(self.lstm_output, shape = [-1, 2*self.config.hidden_size_lstm])
            scores = tf.matmul(output, W) + b
            self.scores = tf.reshape(scores, shape = [-1, nsteps, self.config.ntags])

    def add_Softmax(self):
        """Local (per-token argmax) prediction, used when the CRF is disabled."""
        if not self.config.use_crf:
            self.label_preds = tf.cast(tf.argmax(self.scores, axis = -1), tf.int32)

    def add_loss_op(self):
        """Define ``self.loss`` (CRF negative log-likelihood, or masked
        cross-entropy when predicting locally) and register a scalar summary."""
        if self.config.use_crf:
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                self.scores, self.labels, self.sentence_lengths)
            self.trans_params = trans_params

            self.loss = tf.reduce_mean(-log_likelihood)
        # if predict locally
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.scores, labels = self.labels)
            # mask the padding
            mask = tf.sequence_mask(self.sentence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        # tensorboard
        tf.summary.scalar("loss", self.loss)

    def build(self):
        """Assemble the whole NER graph and open a session."""
        self.add_placeholders()
        self.add_final_embedding_op()
        self.add_LSTM()
        self.add_FCNN()
        if not self.config.use_crf:
            self.add_Softmax()
        self.add_loss_op()

        # merged summary op fetched by run_epoch
        self.merged = tf.summary.merge_all()

        self.add_train_op(self.config.lr_method, self.lr, self.loss, self.config.clip)
        self.initialize_session()

    def predict_batch(self, words):
        """Predict tag ids for a batch that is already converted to ids.

        Args:
            words: batch in the format accepted by ``get_feed_dict``.

        Returns:
            ``(label_preds, sequence_lengths)``: one predicted tag-id sequence
            per sentence and the un-padded sentence lengths.
        """
        fd, sequence_lengths = self.get_feed_dict(words, dropout = 1.0)

        # if crf enabled
        if self.config.use_crf:
            # get tag scores and transition matrix
            viterbi_sequences = []
            scores, trans_params = self.sess.run(
                [self.scores, self.trans_params], feed_dict = fd)

            # viterbi decode to find the best label sequence
            for score, sentence_length in zip(scores, sequence_lengths):
                score = score[:sentence_length]  # drop the padded steps
                viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                    score, trans_params)
                viterbi_sequences.append(viterbi_seq)
            return viterbi_sequences, sequence_lengths

        # if use softmax: fetch the tensor itself (not a one-element list) so
        # the result can be iterated sentence by sentence
        label_preds = self.sess.run(self.label_preds, feed_dict = fd)
        return label_preds, sequence_lengths

    def predic(self, words_raw):
        """Tag one raw sentence.

        Converts the raw words to ids with ``config.processing_word``, calls
        ``predict_batch`` on a batch of one and maps the predicted tag ids
        back to tags.

        Args:
            words_raw: sequence of raw words of a single sentence.

        Returns:
            List of predicted tags, one per word.
        """
        words = [self.config.processing_word(w) for w in words_raw]
        if isinstance(words[0], tuple):
            # list of (char_ids, word_id) -> (all char_ids, all word_ids)
            words = zip(*words)
        pred_ids, _ = self.predict_batch([words])
        preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]

        return preds

    def run_epoch(self, train_set, eval_set, epochs):
        """Run a complete pass over the train set, then evaluate on the dev set.

        Args:
            train_set: training data, iterated in mini-batches.
            eval_set: data evaluated after the pass.
            epochs: index of the current epoch (currently unused).

        Returns:
            The F1 score obtained on ``eval_set``.
        """
        batch_size = self.config.batch_size

        # NOTE(review): the helper is spelled `minibathes` throughout this
        # class; confirm it matches the name it is defined under.
        for i, (words, labels) in enumerate(minibathes(train_set, batch_size)):
            fd, _ = self.get_feed_dict(words, labels, self.config.lr,
                                       self.config.dropout)
            _, train_loss, summary = self.sess.run([self.train_op, self.loss, self.merged],
                                                   feed_dict = fd)

        metrics = self.run_evaluate(eval_set)

        return metrics["f1"]

    def run_evaluate(self, test):
        """Compute token accuracy and chunk-level F1 on ``test``.

        Args:
            test: dataset iterated in mini-batches of ``config.batch_size``.

        Returns:
            Dict with keys ``"acc"`` and ``"f1"``, both scaled to percent.
        """
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in minibathes(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(words)

            for lab, lab_pred, length in zip(labels, labels_pred, sequence_lengths):
                # ignore the padded positions
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]

                lab_chunks      = set(get_chunks(lab, self.config.vocab_tags))
                lab_pred_chunks = set(get_chunks(lab_pred,
                                                 self.config.vocab_tags))

                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds   += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        p   = correct_preds / total_preds if correct_preds > 0 else 0
        r   = correct_preds / total_correct if correct_preds > 0 else 0
        f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        # an empty evaluation set yields 0 instead of NaN
        acc = np.mean(accs) if accs else 0.

        return {"acc": 100*acc, "f1": 100*f1}

In [None]:
class BaseModel(object):
    """Generic neural-network model plumbing: optimizer, session, training loop.

    Subclasses are expected to provide ``run_epoch(train, dev, epoch)``
    (returning a score) and ``run_evaluate(test)``; both are called here but
    not defined in this class.

    NOTE(review): this appears to duplicate the generic model class defined
    earlier in the notebook -- consider keeping a single definition.
    """

    def __init__(self, config):
        """Store the configuration; session and saver are created later by
        ``initialize_session``."""
        self.config = config
        self.sess = None
        self.saver = None

    def add_train_op(self, lr_method, lr, loss, clip = -1):
        """Create ``self.train_op``, the op that performs one optimisation step.

        Args:
            lr_method: optimizer name, matched case-insensitively; one of
                'adam', 'adagrad', 'sgd' or 'rmsprop'.
            lr: learning rate handed to the optimizer constructor.
            loss: tensor to minimise.
            clip: when positive, gradients are clipped by global norm to this
                value before being applied; otherwise no clipping is done.

        Raises:
            NotImplementedError: if ``lr_method`` is not a supported name.
        """
        _lr_m = lr_method.lower() # lower to make sure

        with tf.variable_scope("train_step"):
            if _lr_m == 'adam':
                optimizer = tf.train.AdamOptimizer(lr)
            elif _lr_m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr)
            elif _lr_m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr)
            elif _lr_m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr)
            else:
                raise NotImplementedError("Unknown method {}".format(_lr_m))

            if clip > 0: # gradient clipping if clip is positive
                grads, vs     = zip(*optimizer.compute_gradients(loss))
                grads, gnorm  = tf.clip_by_global_norm(grads, clip)
                self.train_op = optimizer.apply_gradients(zip(grads, vs))
            else:
                self.train_op = optimizer.minimize(loss)

    def initialize_session(self):
        """Open a session, initialise all global variables and create the saver."""
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def close_session(self):
        """Close the session if one has been opened.

        ``self.sess`` stays ``None`` until ``initialize_session`` has run, so
        the call is a no-op in that case instead of raising ``AttributeError``.
        """
        if self.sess is not None:
            self.sess.close()

    def train(self, train, dev):
        """Run up to ``config.nepochs`` epochs with early stopping.

        Training stops once the score returned by ``run_epoch`` has failed to
        reach the best score seen so far for ``config.nepoch_no_imprv``
        consecutive epochs.

        Args:
            train: training data, passed through to ``run_epoch``.
            dev: development data, passed through to ``run_epoch``.
        """
        best_score = 0
        nepoch_no_imprv = 0

        for epoch in range(self.config.nepochs):
            score = self.run_epoch(train, dev, epoch)

            # early stopping
            if score >= best_score:
                nepoch_no_imprv = 0
                best_score = score
            else:
                nepoch_no_imprv += 1
                if nepoch_no_imprv >= self.config.nepoch_no_imprv:
                    break

    def evalute(self, test):
        """Evaluate on ``test`` and return the result of ``run_evaluate``.

        Previously the metrics were computed and then discarded.
        """
        metrics = self.run_evaluate(test)
        return metrics

# test

#### config

In [1]:
class Config:
    """Plain container of training hyper-parameters (class attributes only)."""

    # optimizer
    lr_method = "adam"
    lr = 0.001
    lr_decay = 0.9
    clip = -1  # if negative, no clipping

    # training schedule
    n_epochs = 15
    n_epoch_no_imprv = 3
    batch_size = 20

    # regularisation / embeddings
    dropout = 0.5
    train_embeddings = False

In [2]:
class Model(object):
    """CRF-BiLSTM NER model (skeleton: only holds its configuration)."""

    def __init__(self, config):
        """Keep a reference to the hyper-parameter container.

        Args:
            config: object exposing the hyper-parameters as attributes; it is
                stored as-is on ``self.config``.
        """
        self.config = config

In [4]:
# Smoke test: the model keeps the config it is given, so the batch size set on
# Config is readable through it. The Config class itself (not an instance) is
# passed, which works because the values are class attributes.
test_model = Model(config = Config)
test_model.config.batch_size

20