import importlib

# importlib.reload() needs the module object to reload; calling it with no
# argument raises TypeError. Reload the project modules this notebook imports
# from (model.config, model.DataSet, model.ner_model) so that edits made to
# them on disk are picked up without restarting the kernel.
# NOTE(review): the list of modules is taken from the import cells below --
# adjust it if other project modules should be refreshed as well.
for _module_name in ("model.config", "model.DataSet", "model.ner_model"):
    importlib.reload(importlib.import_module(_module_name))

In [1]:
import numpy as np
import os

import tensorflow as tf

from model.config import Config


In [2]:
#from model.data_utils import CoNLLDataset
from model.DataSet import DataSet
from model.ner_model import NERModel

## Data Preparation Part


In [5]:
from model.DataSet import build_vocabs, UNK, NUM, \
    build_glove_vocab, write_vocab, load_vocab, \
    export_trimmed_glove_vectors, process_vocab

In [6]:
# load=False: presumably skips loading the vocab/embedding files that the
# cells below generate -- TODO confirm against model.config.
config = Config(load=False)

# One DataSet per split, constructed in the order dev, test, train.
dev, test, train = (
    DataSet(split_path)
    for split_path in (
        config.filename_dev,
        config.filename_test,
        config.filename_train,
    )
)

In [7]:
"""
@1 Build the word and tag vocabularies from the train, dev and test sets
@2 Use & (set intersection) to keep only the words that coincide with the GloVe vocabulary
"""

# Corpus-side vocabularies (words and tags) computed over all three splits.
dataset_splits = [train, dev, test]
vocab_words, vocab_tags = build_vocabs(dataset_splits)

# Vocabulary built from the GloVe file referenced by the config.
vocab_glove = build_glove_vocab(config.filename_glove)

Building vocab...
wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']
- done. 8168 tokens
Building vocab...
- done. 400000 tokens


In [8]:
# Special tokens. UNK and NUM are also imported from model.DataSet in an
# earlier cell; these assignments rebind them in the notebook namespace.
UNK, NUM, NONE = "$UNK$", "$NUM$", "O"

# Keep only the corpus words that also appear in the GloVe vocabulary, then
# make sure both special tokens are part of the result.
vocab = vocab_words & vocab_glove
vocab.update((UNK, NUM))

In [9]:
"""check config.vocab"""
# Display the tag vocabulary returned by build_vocabs (inspection only).
vocab_tags

{'b-art',
 'b-eve',
 'b-geo',
 'b-gpe',
 'b-nat',
 'b-org',
 'b-per',
 'b-tim',
 'i-art',
 'i-eve',
 'i-geo',
 'i-gpe',
 'i-nat',
 'i-org',
 'i-per',
 'i-tim',
 'o'}

In [10]:
"""Write word & tags vocab, file is txt file in data folder"""

# Write each vocabulary to the path the config assigns to it:
# words -> config.filename_words, tags -> config.filename_tags.
for tokens, destination in (
    (vocab, config.filename_words),
    (vocab_tags, config.filename_tags),
):
    write_vocab(tokens, destination)

Writing vocab...
- done. 7936 tokens
Writing vocab...
- done. 17 tokens


In [12]:
# Display the current `vocab` object (inspection only).
# NOTE(review): the saved output below is a token -> index mapping, while this
# cell carries execution count 12 and sits above the cell with count 11 that
# rebinds `vocab` via load_vocab -- presumably it was run after that cell, so
# a top-to-bottom re-run would show the earlier `vocab` value here instead.
vocab

{'maltreating': 0,
 'journalism': 1,
 'planting': 2,
 'scheduled': 3,
 'charging': 3970,
 '1890s': 4,
 'conversion': 3971,
 'scout': 6,
 'quo': 5343,
 'behind': 3972,
 'farris': 3974,
 'forcing': 3976,
 'benedict': 7,
 'dust': 8,
 '100-million': 10,
 'apparently': 3978,
 'shirin': 11,
 'deif': 12,
 'baluchistan': 13,
 '4.6': 3979,
 'rodriguez': 3980,
 'nld': 3981,
 'entrapment': 1348,
 'luis': 14,
 'live': 15,
 'respond': 3982,
 'unmasking': 16,
 'belgium': 17,
 'west': 3984,
 'tolerant': 3985,
 'currently': 3302,
 'capture': 3986,
 'chilean': 3987,
 'truck': 18,
 'administrator': 3989,
 'sandbags': 3990,
 'timothy': 19,
 'counter-revolutionary': 665,
 'downtown': 3304,
 'wrongdoing': 3992,
 'nightclub': 5,
 'garnering': 20,
 'using': 21,
 'chemicals': 3993,
 'barno': 3994,
 'pedro': 3995,
 'site': 23,
 'selling': 3997,
 'al-alam': 6992,
 'heightened': 24,
 'zuma': 3975,
 'smoking': 25,
 'ending': 26,
 'century': 3998,
 'oft': 30,
 'campaign': 29,
 'costello': 4000,
 'follows': 31,
 'm

In [11]:
# Rebind `vocab` to whatever load_vocab reads back from the words file, then
# call export_trimmed_glove_vectors with the GloVe source path, the trimmed
# output path and the word-embedding dimension taken from the config.
vocab = load_vocab(config.filename_words)
export_trimmed_glove_vectors(
    vocab,
    config.filename_glove,
    config.filename_trimmed,
    config.dim_word,
)

## Model Construction Part

In [28]:
"""add place holder"""

# Start from an empty default graph and a freshly constructed Config().
tf.reset_default_graph()
config = Config()

# Integer inputs. The shape notes are the ones recorded with the original
# cell; the word_ids note is an assumption.
# presumably shape = (batch size, max length of sentence in batch) -- TODO confirm
word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="word_ids")

# shape = (batch size)
sequence_lengths = tf.placeholder(
    dtype=tf.int32, shape=[None], name="sequence_lengths")

# shape = (batch size, max length of sentence, max length of word)
char_ids = tf.placeholder(
    dtype=tf.int32, shape=[None, None, None], name="char_ids")

# shape = (batch_size, max_length of sentence)
word_lengths = tf.placeholder(
    dtype=tf.int32, shape=[None, None], name="word_lengths")

# shape = (batch size, max length of sentence in batch)
labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name="labels")

# Scalar float hyper-parameters fed at run time.
dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")

In [29]:
"""add_word_embeddings_op"""
with tf.variable_scope("words"):
    # Embedding matrix initialised from config.embeddings; whether it is
    # updated during training is controlled by config.train_embeddings.
    _word_embeddings = tf.Variable(
        config.embeddings,
        name="_word_embeddings",
        dtype=tf.float32,
        trainable=config.train_embeddings,
    )

    # Look up one embedding row per id in word_ids.
    word_embeddings = tf.nn.embedding_lookup(
        _word_embeddings,
        word_ids,
        name="word_embeddings",
    )

# The `dropout` placeholder is passed as the second positional argument of
# tf.nn.dropout.
# NOTE(review): in TF 1.x that argument is keep_prob, so the fed value is
# presumably a keep probability rather than a drop rate -- confirm.
word_embeddings = tf.nn.dropout(word_embeddings, dropout)


In [30]:
"""logits scope"""
# Hidden size of each LSTM direction, read once from the config.
lstm_units = config.hidden_size_lstm

with tf.variable_scope("bi-lstm"):
    cell_fw = tf.contrib.rnn.LSTMCell(lstm_units)
    cell_bw = tf.contrib.rnn.LSTMCell(lstm_units)
    # Only the per-step outputs are kept; the final states are discarded.
    rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw,
        cell_bw,
        word_embeddings,
        sequence_length=sequence_lengths,
        dtype=tf.float32,
    )
    output_fw, output_bw = rnn_outputs
    # Concatenate forward and backward outputs along the last axis.
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.nn.dropout(output, dropout)

with tf.variable_scope("proj"):
    # Concatenated forward + backward width.
    feature_dim = 2 * lstm_units

    W = tf.get_variable(
        "W", dtype=tf.float32, shape=[feature_dim, config.ntags])
    b = tf.get_variable(
        "b", shape=[config.ntags], dtype=tf.float32,
        initializer=tf.zeros_initializer())

    # Flatten to 2-D for the matmul, then restore the step axis.
    nsteps = tf.shape(output)[1]
    output = tf.reshape(output, [-1, feature_dim])
    pred = tf.matmul(output, W) + b
    logits = tf.reshape(pred, [-1, nsteps, config.ntags])


In [31]:
"""loss scope"""
# CRF log-likelihood of `labels` given `logits`; trans_params is kept in the
# notebook namespace because it needs to be evaluated for decoding.
log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
    logits,
    labels,
    sequence_lengths,
)

# Loss is the mean of the negated log-likelihood.
negative_log_likelihood = -log_likelihood
loss = tf.reduce_mean(negative_log_likelihood)
#tf.summary.scalar("loss", loss) # for tensorboard


In [32]:
"""train scope"""

with tf.variable_scope("train_step"):
    # The optimizer uses config.lr as its learning rate.
    optimizer = tf.train.AdamOptimizer(config.lr)

    if not config.clip > 0:
        # No clipping requested: plain minimisation.
        train_op = optimizer.minimize(loss)
    else:
        # Positive clip value: clip gradients by global norm before applying.
        grads_and_vars = optimizer.compute_gradients(loss)
        grads, vs = zip(*grads_and_vars)
        grads, gnorm = tf.clip_by_global_norm(grads, config.clip)
        train_op = optimizer.apply_gradients(zip(grads, vs))


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [33]:
# Open a session, run the global variable initializer in it, and create a
# Saver.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

saver = tf.train.Saver()

In [34]:
# Display the Graph object attached to the session (inspection only).
sess.graph

<tensorflow.python.framework.ops.Graph at 0x182afa52e8>

## Training Preparation Part

In [4]:
# Clear the default graph before building the packaged NERModel.
tf.reset_default_graph()

file_path = "data/jason.txt"
config = Config()

# build model
model = NERModel(config)
model.build()

# Dataset read from file_path, using the vocabularies held by the config.
train = DataSet(
    file_path,
    vocab_words=config.vocab_words,
    vocab_tags=config.vocab_tags,
)

# NOTE(review): the same DataSet is passed as both arguments of model.train.
# If the second argument is the evaluation set, the acc/f1 figures printed
# below would be measured on the training data -- confirm this is intended.
model.train(train, train)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Initializing tf session
Epoch 1 out of 15


wrong line,line content: ['1786', '77.0', ',', 'O']
  3/150 [..............................] - ETA: 36s - train loss: 57.9129wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 87.44 - f1 28.22
- new best score!
Epoch 2 out of 15


  3/150 [..............................] - ETA: 22s - train loss: 10.9565wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 89.69 - f1 45.87
- new best score!
Epoch 3 out of 15


  3/150 [..............................] - ETA: 21s - train loss: 9.8816 wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 90.53 - f1 50.55
- new best score!
Epoch 4 out of 15


  3/150 [..............................] - ETA: 21s - train loss: 9.2834wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 90.77 - f1 52.44
- new best score!
Epoch 5 out of 15


  3/150 [..............................] - ETA: 23s - train loss: 8.3097wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 91.47 - f1 56.04
- new best score!
Epoch 6 out of 15


  3/150 [..............................] - ETA: 21s - train loss: 7.8470wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 92.20 - f1 59.97
- new best score!
Epoch 7 out of 15


  3/150 [..............................] - ETA: 26s - train loss: 7.5058wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 92.69 - f1 62.67
- new best score!
Epoch 8 out of 15


  3/150 [..............................] - ETA: 23s - train loss: 6.8873wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 93.09 - f1 64.15
- new best score!
Epoch 9 out of 15


  3/150 [..............................] - ETA: 22s - train loss: 7.2025wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 93.41 - f1 65.95
- new best score!
Epoch 10 out of 15


  3/150 [..............................] - ETA: 24s - train loss: 5.9217wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 93.60 - f1 66.57
- new best score!
Epoch 11 out of 15


  3/150 [..............................] - ETA: 22s - train loss: 6.2511wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 93.70 - f1 67.05
- new best score!
Epoch 12 out of 15


  3/150 [..............................] - ETA: 21s - train loss: 6.1915wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 93.85 - f1 67.73
- new best score!
Epoch 13 out of 15


  3/150 [..............................] - ETA: 21s - train loss: 6.5244wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 94.03 - f1 68.44
- new best score!
Epoch 14 out of 15


  3/150 [..............................] - ETA: 22s - train loss: 5.8450wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 94.10 - f1 68.80
- new best score!
Epoch 15 out of 15


  3/150 [..............................] - ETA: 21s - train loss: 6.3077wrong line,line content: ['1786', '77.0', ',', 'O']
wrong line,line content: ['1786', '77.0', ',', 'O']


acc 94.18 - f1 69.19
- new best score!
