In [1]:
import codecs
import pickle
import math
import jieba
jieba.initialize()
import re
import os
import numpy as np
import random
import tensorflow as tf
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.crf import viterbi_decode
from tensorflow.keras.utils import to_categorical
from tensorflow import concat, placeholder, reduce_sum, Variable, expand_dims, reduce_mean
from tensorflow import abs, get_variable, variable_scope, sign, reshape, cast, squeeze, shape
from tensorflow.nn import embedding_lookup as embed
from tensorflow.nn import dropout, atrous_conv2d, conv2d, bias_add, relu, xw_plus_b
from collections import defaultdict, namedtuple
from keras.models import Model as Model_init
from keras.layers import  LSTM, Bidirectional, Input, Embedding, Concatenate, Dropout
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss as loss
from keras_contrib.metrics import crf_accuracy as accuracy
from keras.optimizers import Adam, Adadelta
from keras.utils import plot_model

Building prefix dict from the default dictionary ...
Loading model from cache /scratch/local/jieba.cache
Loading model cost 0.761 seconds.
Prefix dict has been built successfully.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# 1. Raw Data preprocessing

    1) Load data from file. 
    2) Convert IOB tagging into IOBES tagging. 
    3) Split data into training data, testing data and evaluation data.
    4) Creating item-to-sequence and sequence-to-item dictionaries.
    5) Convert chinese characters and tags into sequence.
    6) Divide data into batches with fixed length and padding samples with 0 to maximum length.

In [2]:
# read sentences from file
def load_data(file_path):
    sentences = []
    sent = []
    for line in codecs.open(file_path, 'r', 'utf8'):
        line = line.rstrip() # Remove any white spaces at the end of the string
        if not line:
            if len(sent) > 0: # a line with "\n" is used for spliting sentences
                sentences.append(sent)
                sent = []
        else:
            word_tag = line.split() # split word and tag
            if len(word_tag) == 2:
                sent.append(word_tag)
    return sentences


# convert IOB tags to IOBES tags
def convert_to_iobes_tags(sentences):   
    for index, sent in enumerate(sentences):
        iob_tags = [word_tag[-1] for word_tag in sent] # obtain iob tags of a sentence
        iobes_tags = [] # iobes tags
        for i, tag in enumerate(iob_tags): 
            if tag == 'O': # O tag is unchanged
                iobes_tags.append(tag)
            elif tag.split('-')[0] == 'B':  # B to S if an entity only includes a single word
                if i + 1 < len(iob_tags) and iob_tags[i + 1].split('-')[0] == 'I':
                    iobes_tags.append(tag)
                else:
                    iobes_tags.append(tag.replace('B-', 'S-'))
            elif tag.split('-')[0] == 'I':  # E is used for the last item of an entity (words > 2)
                if i + 1 < len(iob_tags) and iob_tags[i + 1].split('-')[0] == 'I':
                    iobes_tags.append(tag)
                else:
                    iobes_tags.append(tag.replace('I-', 'E-'))
            else:
                print('ERROR: INVALID IOB TAGGING!')  
        for word, iobes_tag in zip(sent, iobes_tags): # replace tags
            word[-1] = iobes_tag
            
            
# split sentences into train, test, dev
def split_data(sentences):
    train_div = int(len(sentences) * 0.7);  # train set divide number
    train_sentences = sentences[:train_div]
    remaining_sentences = sentences[train_div:] 
    test_div = int(len(remaining_sentences)* 0.6)  # test set divide number
    test_sentences = remaining_sentences[:test_div]
    dev_sentences = remaining_sentences[test_div:]
    return train_sentences, test_sentences, dev_sentences


# creating dictionaries from unique chinese characters to unique id
def create_char_id_convert_dict(sentences):
    char_dict = {} # a dictionary of the frequency of unique chinese characters
    chinese_chars = [[word[0] for word in sent] for sent in sentences] # get words from tupe word_tag
    for chars in chinese_chars: # get frequency of unique chinese characters
        for char in chars:
            if char not in char_dict:
                char_dict[char] = 1
            else:
                char_dict[char] += 1
    char_dict["<PAD>"] = 99999 # spacial word for padding, and intial a largest frequency
    char_dict['<UNK>'] = 99998 # spacial word for unkonwn, and intial a second largest frequency
    # sort characters by frequency (highest to samllest)
    sorted_char_dict = sorted(char_dict.items(), key=lambda x: (-x[1], x[0])) 
    # create two dictionaries: find char by id, or find id by char
    id_to_char = {index: value[0] for index, value in enumerate(sorted_char_dict)} 
    char_to_id = {value: key for key, value in id_to_char.items()}
    return id_to_char, char_to_id


# creating dictionaries from unique tag to unique id
def create_tag_id_convert_dict(sentences):
    tag_dict = {} # a dictionary of the frequency of tags
    tags = [[word[1] for word in sent] for sent in sentences]
    for tag in tags: # get frequency of unique chinese characters
        for t in tag:
            if t not in tag_dict:
                tag_dict[t] = 1
            else:
                tag_dict[t] += 1
    # sort characters by frequency (highest to samllest)
    sorted_tag_dict = sorted(tag_dict.items(), key=lambda x: (-x[1], x[0]))
    # create two dictionaries: find tag by id, or find id by tag
    id_to_tag = {index: value[0] for index, value in enumerate(sorted_tag_dict)} 
    tag_to_id = {value: key for key, value in id_to_tag.items()}
    return id_to_tag, tag_to_id


# Generated formated data for training
def get_formated_data(sentences, char_to_id, tag_to_id):
    formated_data = []
    for sent in sentences:
        sent_chars = [word[0] for word in sent] # get chinese chars
        # convert chars to id
        chars_id = [char_to_id[char if char in char_to_id else '<UNK>'] for char in sent_chars] 
        joined_sent = "".join(sent_chars) # joined all the chars into a sentence

        # Tokenize sent with Jieba to get chinese phrase feature (the start, inside, and end of a phrase)
        phrase_feature = []
        for token in jieba.cut(joined_sent):
            if len(token) == 1: # phrase_feature is 0 if a phase only has one Chinese character
                phrase_feature.append(0)
            else:
                phrase_list = [2] * len(token) # phrase_feature of middle characters in a phase is 2
                phrase_list[0] = 1 # phrase_feature of start character in a phase is 1
                phrase_list[-1] = 3 # phrase_feature of end character in a phase is 3
                phrase_feature.extend(phrase_list)

        tags_id = [tag_to_id[word[-1]] for word in sent] # convert tags to id
        formated_data.append([sent_chars, chars_id, phrase_feature, tags_id]) # formated data
    return formated_data


# Divide data into batches and padding each sample
def generate_batch_data_with_padding(data, bcount):
    batches = []
    batch_count = int(math.ceil(len(data)/ bcount)) # calulate number of batches
    # sorted list based on the length of sentences(short to long)
    sorted_len_data = sorted(train_data, key=lambda x: len(x[0]))
    for i in range(batch_count):
        batch = sorted_len_data[(i * bcount) : ((i + 1) * bcount)] # divided data into batches with fixed length
        pad_sentsents = [] # sentsents after padding
        pad_chars = [] # chinese characters after padding
        pad_phrases = [] # pahrase features after padding
        pad_tags = [] # tags after padding
        max_length = max([len(sample[0]) for sample in batch]) # find the max length of sentence in batch
        for sample in batch:
            sent, char, phrase, tag = sample 
            pad_array = [0] * (max_length - len(sent)) # padding with 0 based on the max length
            pad_sentsents.append(sent + pad_array) 
            pad_chars.append(char + pad_array)
            pad_phrases.append(phrase + pad_array)
            pad_tags.append(tag + pad_array)    
        batches.append([pad_sentsents, pad_chars, pad_phrases, pad_tags]) # get batch data
    return batches



# data processing
folder_patch = "./dataset/"  # dataset folder
data_path = folder_patch + "data.txt" # data path

sentences = load_data(data_path) # load data
print(sentences[0]) 

convert_to_iobes_tags(sentences) # convert to iobes tags
print(sentences[0]) 

train_sentences, test_sentences, dev_sentences = split_data(sentences) # split data 
print("The number of sentences of trainning data is", len(train_sentences))
print("The number of sentences of testing data is", len(test_sentences))
print("The number of sentences of development data is", len(dev_sentences))

# creates chinese characters and senquence convertion dictionaries
id_to_char, char_to_id = create_char_id_convert_dict(train_sentences) 
# creates tags and senquence convertion dictionaries
id_to_tag, tag_to_id = create_tag_id_convert_dict(train_sentences)
print("The number of unique Chinese characters is:", len(char_to_id))
print("The number of unique tag characters is:", len(tag_to_id))

train_data = get_formated_data(train_sentences, char_to_id, tag_to_id) # formated training data
test_data = get_formated_data(test_sentences, char_to_id, tag_to_id) # formated testing data
dev_data = get_formated_data(dev_sentences, char_to_id, tag_to_id) # formated edata
print(train_data[0])

with open(folder_patch + 'dict.pkl', "wb") as out_file:  # dump data for eveluation 
    pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], out_file)

# Hyper Parameters:
selected_model = 'Bi-lstm' # load the Bi-lstm model
selected_model = False # uncommented this line will load the Iterated Dilated Convolutions model
learning_rate = 0.001
channel_char = 128 #embedding output dimention for char
channel_phrase =20 #embedding output dimention for phrase
channel_lstm = 256 #input dimention for Bi-lstm
len_tags = len(tag_to_id)
len_char = len(char_to_id)

# generate batches with padding
train_batch_data = generate_batch_data_with_padding(train_data, 20) 
dev_batch_data = generate_batch_data_with_padding(dev_data, 100)
test_batch_data = generate_batch_data_with_padding(test_data, 100)

epoch_iterations = len(train_batch_data) # set the iterations per epoch
print("The number of steps per epoch is", epoch_iterations)


[['因', 'O'], ['此', 'O'], ['，', 'O'], ['这', 'O'], ['次', 'O'], ['政', 'O'], ['府', 'O'], ['危', 'O'], ['机', 'O'], ['终', 'O'], ['于', 'O'], ['得', 'O'], ['到', 'O'], ['化', 'O'], ['解', 'O'], ['，', 'O'], ['对', 'O'], ['俄', 'B-LOC'], ['罗', 'I-LOC'], ['斯', 'I-LOC'], ['来', 'O'], ['说', 'O'], ['是', 'O'], ['值', 'O'], ['得', 'O'], ['庆', 'O'], ['幸', 'O'], ['的', 'O'], ['。', 'O']]
[['因', 'O'], ['此', 'O'], ['，', 'O'], ['这', 'O'], ['次', 'O'], ['政', 'O'], ['府', 'O'], ['危', 'O'], ['机', 'O'], ['终', 'O'], ['于', 'O'], ['得', 'O'], ['到', 'O'], ['化', 'O'], ['解', 'O'], ['，', 'O'], ['对', 'O'], ['俄', 'B-LOC'], ['罗', 'I-LOC'], ['斯', 'E-LOC'], ['来', 'O'], ['说', 'O'], ['是', 'O'], ['值', 'O'], ['得', 'O'], ['庆', 'O'], ['幸', 'O'], ['的', 'O'], ['。', 'O']]
The number of sentences of trainning data is 19472
The number of sentences of testing data is 5007
The number of sentences of development data is 3339
The number of unique Chinese characters is: 4277
The number of unique tag characters is: 13
[['因', '此', '，', '这', '次', '政', '府'

2. Build the model and set up hyperparameters

In [None]:
# Bi-lstm model
# Because we take advantage of two types of features: cn_chars, cn_phrase, there will be two input layers
def Bilstm():
    crf = CRF(len(tag_to_id), sparse_target=True) # define the crf layer at first
    char_input = Input(shape=(channel_lstm,), name="Input-CN-Char") # the shape must be the max size of a sentence, it can be changed for long sentence. However, the too many neurons will slow down the model dramatically
    char_emb = Embedding(len(char_to_id),output_dim=channel_char,trainable=False,mask_zero=True)(char_input) 
    phrase_input = Input(shape=(channel_lstm,), name="Input-CN-Phrase")# the shape must be the max size of a sentence
    phrase_emb = Embedding(input_dim=4,output_dim=20,trainable=False,mask_zero=True)(phrase_input) 
    # concatenate them to makes a single vector
    merged = Concatenate(axis=-1)([char_emb, phrase_emb])
#     dropout = Dropout(0.5)(merged) #prevent from overfitting; if encounter severe overfitting, uncomment this line and change the input layer of the next layer
    lstm = Bidirectional(LSTM(100, return_sequences=True))(merged)
    dropout = Dropout(0.5)(lstm) #prevent from overfitting
    CRF_layer = crf(dropout)
    model = Model_init(inputs=[char_input, phrase_input], outputs=[CRF_layer]) #explicate the input list.
    model.summary() # plot the configure
    plot_model(model, to_file='BI-lstm model.png') #output the figure of model structure
    return model

class Model(object): # <Fast and Accurate Entity Recognition with Iterated Dilated Convolutions>
    def __init__(self):
        self.__main_setup() # model initializing
        
    def __main_setup(self):
        self.__hyper() #set up hyperparameters
        self.__placeholder() #build tensor holder
        self.__parameters() #initializing
        self.__layers() #create model
        self.__opt() #optimizer
        
    def __layers(self):
        self.__embedding() #embedding layers
        self.__dilated() # iterated dilated cnn 
        self.__loss() 
        
    def __hyper(self):
        self.learningR = learning_rate #learning rate 
        self.channel_char = channel_char  # char embedding dimention
        self.channel_phrase = channel_phrase # phrase embedding dimention
        self.len_tags = len_tags # number of tags
        self.len_chars = len_char #unique Chinese char
        self.output_channel = 0
        
    def __placeholder(self):
        self.gt = placeholder(dtype=tf.int32) #GT
        self.f1_evaluate = Variable(dtype=tf.float32,initial_value=0.0, trainable=False) #best f1 score for evaluate data
        self.f1_test = Variable(dtype=tf.float32,initial_value=0.0, trainable=False) #for test data
        self.whole_steps = Variable(dtype=tf.int32,initial_value=0, trainable=False) #steps for training process
        self.cn_char = placeholder(dtype=tf.int32) #input sentence
        self.cn_phrase = placeholder(dtype=tf.int32) #nput Chinese phrase features
        self.dropout = placeholder(dtype=tf.float32) #dropout
        
    def __parameters(self):
        self.output_channel = 0
        self.len_phrase = 4  #phrase features 0,1,2,3
        length = reduce_sum(sign(abs(self.cn_char)), reduction_indices=1)
        self.lengths = cast(length, tf.int32)
        self.batch_size = shape(self.cn_char)[0] #batch_size
        self.num_steps = shape(self.cn_char)[-1] #num_steps: total chars in each sentenc
        self.layers = [1,1,2] #based on the paper, there will be 2 types of dilated rates
        self.flag_drop = 0.5  #prevent from overfitting
        self.channel_cnn = 100 # cnn kernels numbers 
        self.minor = -1000.0
        self.model_training = True
        if self.model_training == False:
            self.flag_drop = 1.0 
        self.filters = 3 
        self.iterations = 4 #iterated 
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        self.channel_embedding = self.channel_char + self.channel_phrase  # char channels + phrase channels
        
    def __embedding(self): # initializing for two features
        char_embeddings = get_variable('Embeddings_1',shape=[self.len_chars, self.channel_char],initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None),
            dtype=tf.float32,trainable=True)
        feature_1 = embed(char_embeddings, self.cn_char)
        phrase_embeddings = get_variable('Embeddings_2',shape=[self.len_phrase, self.channel_phrase],initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None),
            dtype=tf.float32,trainable=True)
        feature_2 = embed(phrase_embeddings, self.cn_phrase)
        self.embedding_cns = concat([feature_1,feature_2], axis=-1)
        self.embedding_cns = dropout(self.embedding_cns, self.dropout)#apply dropout 

    def __dilated(self):# Dilated Convolutions Networks
        nets_input = expand_dims(self.embedding_cns, 1)   
        initialed_weight = get_variable("kernel",shape=[1, self.filters, self.channel_embedding,self.channel_cnn],initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
        nets_input = conv2d(nets_input, initialed_weight, strides=[1, 1, 1, 1],  padding="SAME",name="nets_input")
        output = []
        channels = 0
        for j in range(self.iterations):  
            for i in range(len(self.layers)):# many dilated cnns can cover almost all the features
                dilated_rate = self.layers[i]
                if i == (len(self.layers) - 1):
                    last_layer = True
                else:
                    last_layer = False
                with variable_scope("DilatedConv%d" % i, reuse=tf.AUTO_REUSE):
                    weights = get_variable(name='Weights',shape=[1, self.filters, self.channel_cnn,self.channel_cnn], initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
                    biases = get_variable(name='Biases',shape=[self.channel_cnn]) 
                    c = atrous_conv2d(nets_input,weights, rate=dilated_rate, padding="SAME") # dilated convolution
                    c = bias_add(c, biases)
                    c = relu(c)
                    if last_layer:
                        channels += self.channel_cnn
                        output.append(c)
                    nets_input = c
        output_last = concat(values=output,axis=3) # merge the output of 4 last layers
        output_last = dropout(output_last, self.flag_drop) #add dropout layer 
#             drop dimention: the dimention which contians only one data
        output_last = squeeze(output_last, [1])
        output_last = reshape(output_last, [-1, channels]) # final features done
        self.output_channel = channels
        weight = get_variable("Weight", shape=[self.output_channel, self.len_tags],dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
        bias = get_variable("Bias",  initializer=tf.constant(0.0001, shape=[self.len_tags]))
#                    matmul(x, w) + b.
        result = xw_plus_b(output_last, weight, bias)
        self.result =  reshape(result, [-1, self.num_steps, self.len_tags])  # num_steps: total chars in each sentenc, len_tags: number of tags

    def __loss(self):
        # pad units 
        initial_units = concat([self.minor*tf.ones(shape=[self.batch_size, 1, self.len_tags]), tf.zeros(shape=[self.batch_size, 1, 1])], axis=-1)
        pad_units = cast(self.minor*tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
        temp = concat([self.result, pad_units], axis=-1)
        temp = concat([initial_units, temp], axis=1)
        gt = concat([cast(self.len_tags*tf.ones([self.batch_size, 1]), tf.int32), self.gt], axis=-1)
        self.transition = get_variable("transit",shape=[self.len_tags + 1, self.len_tags + 1],initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
        likelihood, self.transition = crf_log_likelihood(inputs=temp,tag_indices=gt,transition_params=self.transition,sequence_lengths=self.lengths+1)
        self.error = reduce_mean(likelihood*(-1))
            
    def __opt(self):
        self.optimizer = tf.train.AdamOptimizer(self.learningR)
        gradients = self.optimizer.compute_gradients(self.error) 
        limited_gradients = [[tf.clip_by_value(gra, -4, 4), va] for gra, va in gradients] # avoid gradient explosion
        self.optimize = self.optimizer.apply_gradients(limited_gradients, self.whole_steps)
            
    def evaluate(self, sess, batch_data, id_to_tag):
        transition = self.transition.eval()
        report = []
        for batch in batch_data:
            cn_sentences = batch[0]
            tags = batch[-1] #true tag
            lengths, scores = self.each_step(sess, False, batch)
            batch_paths = self.viterbi(scores, lengths, transition)
            for i in range(len(cn_sentences)):
                output = []
                sentence = cn_sentences[i][:lengths[i]]
                gt = convert_iobes_to_iob_tags([id_to_tag[int(x)] for x in tags[i][:lengths[i]]])
                predict = convert_iobes_to_iob_tags([id_to_tag[int(x)] for x in batch_paths[i][:lengths[i]]])
                for cn_char, gt, predict in zip(sentence, gt, predict):
                    output.append(" ".join([cn_char, gt, predict]))
                report.append(output)
        return report
    
    def viterbi(self, units, lengths, array): # viterbi Algorithm
        paths = []
        begin = np.asarray([[self.minor]*self.len_tags +[0]])
        for val, temp_len in zip(units, lengths):
            val = val[:temp_len]
            pad = np.ones([temp_len, 1])*(self.minor)
            units = np.concatenate([val, pad], axis=1)
            units = np.concatenate([begin, units], axis=0)
            path, _ = viterbi_decode(units, array)
            paths.append(path[1:])
        return paths
    
    def each_step(self, sess, training, batch):
        _, cn_char, cn_phrase, tags = batch
        temp_dict = {self.cn_char: np.asarray(cn_char),self.cn_phrase: np.asarray(cn_phrase), self.dropout: 1.0}
        if training:
            temp_dict[self.gt] = np.asarray(tags) #GT
            temp_dict[self.dropout] = 0.5
            whole_steps, error, _ = sess.run([self.whole_steps, self.error, self.optimize], temp_dict)
            return whole_steps, error
        else:
            lengths, units = sess.run([self.lengths, self.result], temp_dict)
            return lengths, units

# Evaluation

In [4]:

# convert IOBES tags to IOB tags
def convert_iobes_to_iob_tags(tags):
    iob_tags = []
    for index, tag in enumerate(tags):
        t = tag.split('-')[0]        
        if t == 'S': iob_tags.append(tag.replace('S-', 'B-'))
        elif t == 'E': iob_tags.append(tag.replace('E-', 'I-'))
        else: iob_tags.append(tag)          
    return iob_tags


# check if a phrase ended between the previous and current character
def check_phrase_end_tag(prev_tag, cur_tag, prev_type, cur_type):
    is_end = False

    if prev_tag == 'E': is_end = True
    if prev_tag == 'S': is_end = True
        
    if prev_tag == 'B' and (cur_tag == 'B' or cur_tag == 'S' or cur_tag == 'O'): 
        is_end = True  
    if prev_tag == 'I' and (cur_tag == 'B' or cur_tag == 'S' or cur_tag == 'O'): 
        is_end = True
        
    if prev_tag != 'O' and prev_tag != '.' and prev_type != cur_type: 
        is_end = True

    return is_end


# check if a phrase started between the previous and current character
def check_phrase_start_tag(prev_tag, cur_tag, prev_type, cur_type):
    is_start = False

    if cur_tag == 'B': chunk_start = True
    if cur_tag == 'S': chunk_start = True
    
    if prev_tag == 'E' and (cur_tag == 'E' or cur_tag == 'I'): 
        is_start = True
    if prev_tag == 'S' and (cur_tag == 'E' or cur_tag == 'I'): 
        is_start = True
    if prev_tag == 'O' and (cur_tag == 'E' or cur_tag == 'I'): 
        is_start = True
    
    if cur_tag != 'O' and cur_tag != '.' and prev_type != cur_type: 
        is_start = True

    return is_start


# calculate the precision, recall and f-score
def get_metrics(correct_count, predict_count, total_count):
    TP = correct_count 
    FP = predict_count - correct_count
    FN = total_count - correct_count

    prec = 0 if (TP + FP == 0) else (1. * TP) / (TP + FP)  # precision
    recall = 0 if (TP + FN == 0) else (1. * TP) / (TP + FN)  # recall 
    fscore = 0 if (prec + recall == 0) else (2 * prec * recall / (prec + recall)) # f-score

    Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
    return Metrics(TP, FP, FN, prec, recall, fscore)


# parse tag into IOBES tags and enetity type
def parse_tages(tag):
    matched = re.match(r'^([^-]*)-(.*)$', tag)
    return matched.groups() if matched else (tag, '')


# print precsion, recall and f-score with format
def print_report(parse_results, correct_entities, found_c_entities, found_g_entities):    
    parsed_report = []        
    metrics = get_metrics(parse_results[0], parse_results[3], parse_results[2])
    
    # all the found entities
    cg_entities = list(found_c_entities) + list(found_g_entities)    
    uniq_tags = set([e for e in cg_entities])  # unique tags

    # get metrics includes precsion, recall and f-score
    entity_metrics = {}
    for tag in uniq_tags:
        entity_metrics[tag] = get_metrics(correct_entities[tag], found_g_entities[tag], found_c_entities[tag])
     
    # print total tokens and phrases count
    result_line = []
    result_line.append('Total tokens is %d and total is phrases %d\n' % (parse_results[4], parse_results[2]))
    result_line.append('Found: %d phrases, correct: %d.\n' % (parse_results[3], parse_results[0]))
    parsed_report.append("".join(result_line))

    # formated result lines 
    if parse_results[4] > 0:
        result_line = []
        result_line.append("Accuracy:%6.2f%%, " % (100. * parse_results[1] / parse_results[4]))
        result_line.append("Precision:%6.2f%%, " % (100.* metrics.prec))
        result_line.append("Recall:%6.2f%%, " % (100. * metrics.rec))
        result_line.append("Fscore:%6.2f\n" % (100. * metrics.fscore))
        parsed_report.append("".join(result_line))

    for index, metric in sorted(entity_metrics.items()):
        result_line = []
        result_line.append('%17s: ' % index)
        result_line.append('Precision:%6.2f%%, ' % (100. * metric.prec))
        result_line.append('Recall:%6.2f%%, ' % (100. * metric.rec))
        result_line.append('Fscore:%6.2f\n' % (100. * metric.fscore))
        parsed_report.append("".join(result_line))
    
    return parsed_report


# parsed the reports
def parse_report(file_name):

    is_correct = False        # if current chunk is correct
    
    prev_ctag = 'O'           # previous correct tag
    prev_ctag_entity = ''     # previous correct entity (LOC, ORG, PER)
    prev_gtag = 'O'           # previous guessed tag
    prev_gtag_entity = ''     # previous guessed entity (LOC, ORG, PER)
    
    # 0: correct entity number, 1: correct tag number, 2: number of phrases  
    # 3: number of guessed phrases 4: number of tokens
    results = [0, 0, 0, 0, 0]

    correct_entities = defaultdict(int)
    found_c_entities = defaultdict(int)
    found_g_entities = defaultdict(int)

    with codecs.open(file_name, "r") as file:   # read file
        for line in file:
            features = line.split() # features list per line  
            if len(features) == 0: 
                features = ['-X-', 'O', 'O'] # for white space

            cur_gtag, cur_gtag_entity = parse_tages(features.pop())  # parse predicted tag
            cur_ctag, cur_ctag_entity = parse_tages(features.pop())  # parse correct tag
            chinese_char = features.pop(0)  # chinese character 

            # check if the phrase is ended between the previous and current character
            is_end_correct = check_phrase_end_tag(prev_ctag, cur_ctag, prev_ctag_entity, cur_ctag_entity)
            is_end_guessed = check_phrase_end_tag(prev_gtag, cur_gtag, prev_gtag_entity, cur_gtag_entity)

            # check if the phrase is started between the previous and current character
            is_start_correct = check_phrase_start_tag(prev_ctag, cur_ctag, prev_ctag_entity, cur_ctag_entity)
            is_start_guessed = check_phrase_start_tag(prev_gtag, cur_gtag, prev_gtag_entity, cur_gtag_entity)

            if is_correct:
                if (is_end_correct and is_end_guessed and prev_gtag_entity == prev_ctag_entity):
                    is_correct = False
                    results[0] += 1
                    correct_entities[prev_ctag_entity] += 1

                elif (is_end_correct != is_end_guessed or cur_gtag_entity != cur_ctag_entity):
                    is_correct = False

            if is_start_correct and is_start_guessed and cur_gtag_entity == cur_ctag_entity:
                is_correct = True

            if is_start_correct:
                results[2] += 1
                found_c_entities[cur_ctag_entity] += 1
            if is_start_guessed:
                results[3] += 1
                found_g_entities[cur_gtag_entity] += 1
            
            if chinese_char != '-X-':  # not empty character
                if cur_ctag == cur_gtag and cur_gtag_entity == cur_ctag_entity:
                    results[1] += 1
                results[4] += 1
            
            # get previous tags 
            prev_gtag = cur_gtag
            prev_ctag = cur_ctag
            prev_gtag_entity = cur_gtag_entity
            prev_ctag_entity = cur_ctag_entity

        if is_correct:
            results[0] += 1
            correct_entities[prev_ctag_entity] += 1

    # get parsed report,includes accuracy, precsion, recall and f-score
    parsed_report = print_report(results, correct_entities, found_c_entities, found_g_entities)   
    return parsed_report

# write predict result and parse report
def evaluate_report(train_results, file_path):
    # file name
    file_name = os.path.join(file_path, "predict_result.txt") 
    # write file
    with open(file_name, "w") as outfile:
        write_context = []
        # write line by line
        for chunk in train_results:
            for line in chunk:
                write_context.append(line + "\n")
            write_context.append("\n")
        outfile.writelines(write_context)
    # parse report
    result_lines = parse_report(file_name)
    return result_lines

# evalute data
def evaluate(tf_sess, model, data, id_to_tag):
    predict_results = model.evaluate(tf_sess, data, id_to_tag)
    parsed_lines = evaluate_report(predict_results, folder_patch)
    for line in parsed_lines:
        print(line)
    f1 = float(parsed_lines[1].strip().split()[-1])
    f1_test = model.f1_evaluate.eval()
    if f1 > f1_test:
        tf.assign(model.f1_evaluate, f1).eval()
        print("Best f1 score: {:>.3f}".format(f1))
    return f1 > f1_test

In [5]:
steps_check = 100
mode = 'training' # 'training' or 'testing'


if mode == 'training':
    with tf.Session() as sess:
        model = Model(mode)
        sess.run(tf.global_variables_initializer())
        loss_holder = []
        for i in range(4):
            for batch in train_batch_data:
                step, temp_loss = model.each_step(sess, True, batch)
                loss_holder.append(temp_loss)
                if step % steps_check == 0:
                    iteration = step // epoch_iterations + 1
                    print("Iteration Num:{} Step Num:{} of {}, ""Model Loss:{:>9.6f}".format(
                        iteration, step % epoch_iterations, epoch_iterations, np.mean(loss_holder)))
                    loss_holder = []

            evaluate(sess, model, dev_batch_data, id_to_tag)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Iteration Num:1 Step Num:100 of 974, Model Loss:11.401937
Iteration Num:1 Step Num:200 of 974, Model Loss: 8.194280
Iteration Num:1 Step Num:300 of 974, Model Loss: 7.719868
Iteration Num:1 Step Num:400 of 974, Model Loss: 7.033868
Iteration Num:1 Step Num:500 of 974, Model Loss: 7.407226
Iteration Num:1 Step Num:600 of 974, Model Loss: 7.303170
Iteration Num:1 Step Num:700 of 974, Model Loss: 7.193491
Iteration Num:1 Step Num:800 of 974, Model Loss: 7.431215
Iteration Num:1 Step Num:900 of 974, Model Loss: 7.760583
Total tokens is 66636 and total is phrases 2159
Found: 1932 phrases, correct: 1249.

Accuracy: 95.45%, Precision: 64.65%, Recall: 57.85%, Fscore: 61.06

              LOC: Precision: 54.40%, Recall: 72.61%, Fscore: 62.20

              ORG: Precision: 78.77%, Recall: 50.41%, Fscore: 61.48

              PER: Precision: 87.22%, Recall: 43.47%, Fscore: 58.02

Best f1 score: 61.060
Iteration Num:2 Step Num:26 of 974, Model Loss:10.478935
Iteration Num:2 Step Num:126 of 974, Mo