# NER Model

In [1]:
from datetime import datetime
import os

from clr_callback import *
import keras
from keras import regularizers
from keras.optimizers import *
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
# from tensorflow.python.keras.optimizers import *
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
import sys
import tensorflow as tf
from tensorflow.python.keras.layers import Flatten, Dense, Embedding, Dropout, Bidirectional, LSTM, Concatenate, Reshape, Lambda, Input, Activation, Masking
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.models import Model, load_model
from tensorflow.python.keras.optimizers import Adam, SGD
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import one_hot
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from keras_contrib.layers import CRF
from model.data_utils import get_trimmed_glove_vectors, load_vocab, get_processing_word, CoNLLDataset, get_trimmed_glove_vectors, load_vocab, get_processing_word, minibatches, get_chunks, pad_sequences
from model.ner_model import NERModel

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Paths to the CoNLL English splits (.iob files): train, dev (testa) and test (testb).
# The data must be downloaded first; source: https://github.com/synalp/NER
train_filename = "data/coNLL/eng/eng.train.iob"
dev_filename = "data/coNLL/eng/eng.testa.iob"
test_filename = "data/coNLL/eng/eng.testb.iob"

In [3]:
# use_chars is forwarded as `chars=` to get_processing_word when the datasets are built.
use_chars = True
# max_iter is forwarded as the last argument of CoNLLDataset.
# NOTE(review): presumably caps the number of sentences read, with None meaning no cap — confirm in model.data_utils.
max_iter = None

In [4]:
# Word -> integer id lookup loaded from data/words.txt; the bare name displays it.
vocab_words = load_vocab("data/words.txt")
vocab_words

{'$NUM$': 13,
 '$UNK$': 9,
 '.': 16,
 'a': 11,
 'actor': 2,
 'american': 4,
 'an': 8,
 'and': 0,
 'economic': 12,
 'european': 22,
 'french': 7,
 'in': 5,
 'is': 1,
 'jean': 10,
 'lives': 21,
 'new': 19,
 'oscar': 20,
 'pierre': 14,
 'political': 17,
 'the': 3,
 'union': 6,
 'won': 18,
 'york': 15}

In [5]:
# Character -> integer id lookup loaded from data/chars.txt; the bare name displays it.
vocab_chars = load_vocab("data/chars.txt")
vocab_chars

{'.': 23,
 'A': 5,
 'E': 17,
 'F': 9,
 'J': 18,
 'N': 8,
 'P': 25,
 'T': 20,
 'U': 21,
 'Y': 3,
 'a': 15,
 'c': 27,
 'd': 11,
 'e': 7,
 'h': 24,
 'i': 10,
 'k': 16,
 'l': 1,
 'm': 26,
 'n': 12,
 'o': 14,
 'p': 22,
 'r': 4,
 's': 13,
 't': 6,
 'u': 19,
 'v': 2,
 'w': 0}

In [6]:
# Tag -> integer id lookup loaded from data/tags.txt.
# NOTE: the id order could be arbitrary; ids lie in the interval [0, len(vocab_tags) - 1].
# ALSO: a null tag and a padded label should stay distinguishable. The padding cells
# fill label positions with pad_tok=9, which is outside the id range displayed below.
vocab_tags = load_vocab("data/tags.txt")
vocab_tags

{'B-LOC': 7,
 'B-MISC': 3,
 'B-ORG': 5,
 'B-PER': 1,
 'I-LOC': 8,
 'I-MISC': 4,
 'I-ORG': 6,
 'I-PER': 2,
 'O': 0}

In [7]:
# Word vocabulary size; used as the input dimension of the word Embedding layer.
n_words = len(vocab_words)

In [8]:
# Character vocabulary size; used as the input dimension of the character Embedding layer.
n_char = len(vocab_chars)
n_char

28

In [9]:
# Number of output classes: every tag in vocab_tags plus one extra class.
# The extra class covers the id 9 that the padding cells write into padded label
# positions (pad_tok=9), which is not one of the tag ids in vocab_tags.
n_tags = (len(vocab_tags)+1) # +1 for the padding label class
n_tags

10

In [10]:
# CoNLL data for validation.
# The first processing function is built from vocab_words and vocab_chars with
# lowercase=True and chars=use_chars; the second from vocab_tags with lowercase=False.
# NOTE(review): allow_unk=False presumably makes an unknown tag an error instead of
# mapping it to an UNK id — confirm in model.data_utils.
dev = CoNLLDataset(dev_filename, get_processing_word(vocab_words, vocab_chars,lowercase=True, chars=use_chars),
                  get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

In [11]:
# CoNLL data for training.
# Built with the same word/char and tag processing settings as the dev and test datasets.
train = CoNLLDataset(train_filename, get_processing_word(vocab_words, vocab_chars,lowercase=True, chars=use_chars),
                  get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

In [12]:
# CoNLL data for the final test evaluation.
# Built with the same word/char and tag processing settings as the train and dev datasets.
test = CoNLLDataset(test_filename, get_processing_word(vocab_words, vocab_chars,lowercase=True, chars=use_chars),
                  get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

In [13]:
def get_glove_vocab(filename):
    """Collect the set of words listed in a GloVe text file.

    Each non-blank line is expected to start with the word, followed by
    space-separated vector components; only that first field is kept.
    Progress messages are printed before and after the scan.

    Args:
        filename: path to the glove vectors
    Returns:
        vocab: set() of strings
    """
    print("Building vocab...")
    vocab = set()
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII tokens load the same way on every OS.
    with open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line carries no token; skip it rather than adding "".
                continue
            vocab.add(line.split(' ', 1)[0])
    print("- done. {} tokens".format(len(vocab)))
    return vocab

In [14]:
# Open the trimmed GloVe archive (.npz); the matrix is read from its "embeddings" key in the next cell.
emb_data = np.load("data/glove.6B.300d.trimmed.npz")

In [15]:
# Pretrained embedding matrix, later passed as the initial weights of the word Embedding layer.
embeddings = emb_data["embeddings"]
type(embeddings)

numpy.ndarray

In [16]:
# Embedding widths for the word and character Embedding layers.
# NOTE(review): dim_word should equal the width of `embeddings` (file name says 300d) — confirm with embeddings.shape.
dim_word = 300 #End to end paper uses 30
dim_char = 100

In [17]:
# Number of units in each LSTM.
hidden_size_char = 100 # per-direction LSTM over the characters of a word
hidden_size_lstm = 300 # LSTM (wrapped in Bidirectional) over the concatenated word + char features

In [18]:
# Training hyperparameters consumed by the optimizer and by model_softmax.fit.
nepochs = 75 # end-to-end paper saw best results at 50 epochs
lr = 0.0105 # previously 0.001; end-to-end paper uses 0.01 for POS tagging and 0.015 for NER, updating lr each epoch with decay rate 0.05
lr_decay = 0.0005 # passed as `decay` to Adam; alternatives noted: lr/nepochs, 0.05 (paper), 0.9 (GG)
batch_size = 10 # end-to-end paper uses 10; 20 was also tried; "eval at 32"

In [19]:
# TODO: make use of minibatches with fit_generator
# for i, (words, labels) in enumerate(minibatches(train, batch_size)):
# Using len(train) as the batch size makes the first minibatch the entire training set.
words, labels = list(minibatches(train, len(train)))[0]  # NOTE: len(train) will return entire dataset!
#GG's version
# Each element of `words` unzips into (char ids, word ids), in that order.
char_ids, word_ids = zip(*words)
# Everything is padded with 9: the same value is used as mask_value in the Masking
# layers, and for labels it is the extra class counted in n_tags.
word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=9)
char_ids, word_lengths = pad_sequences(char_ids, pad_tok=9, nlevels=2)  # nlevels=2: char ids are nested one level deeper than word ids
labels, _ = pad_sequences(labels, pad_tok=9)

In [20]:
# Validation/dev: same extraction and padding (pad_tok=9) as for the training set.
# A batch size of len(dev) makes the first minibatch the entire dev set.
words_dev, labels_dev = list(minibatches(dev, len(dev)))[0]  
char_ids_dev, word_ids_dev = zip(*words_dev)
word_ids_dev, sequence_lengths_dev = pad_sequences(word_ids_dev, pad_tok=9)
char_ids_dev, word_lengths_dev = pad_sequences(char_ids_dev, pad_tok=9, nlevels=2)
labels_dev, _ = pad_sequences(labels_dev, pad_tok=9)

In [21]:
# Test: same extraction and padding (pad_tok=9) as for the training set.
# A batch size of len(test) makes the first minibatch the entire test set.
words_test, labels_test = list(minibatches(test, len(test)))[0]  
char_ids_test, word_ids_test = zip(*words_test)
word_ids_test, sequence_lengths_test = pad_sequences(word_ids_test, pad_tok=9)
char_ids_test, word_lengths_test = pad_sequences(char_ids_test, pad_tok=9, nlevels=2)
labels_test, _ = pad_sequences(labels_test, pad_tok=9)

In [22]:
# Rate shared by every Dropout layer in the model; must be defined before the model cells run (GG also uses 0.5).
dropout = 0.5

In [23]:
# Word branch: one variable-length sequence of word ids per sentence.
word_emb_input = Input((None,))
# 9 is the pad_tok used when the word ids were padded.
# NOTE(review): in the vocab_words displayed earlier, id 9 is '$UNK$', so the pad value
# coincides with a real word id — confirm this is intended.
# NOTE(review): Masking is applied to the raw id tensor and the Embedding below is built
# without mask_zero — confirm that a mask actually reaches the downstream LSTMs.
mask_word = Masking(mask_value=9)(word_emb_input)
# Frozen (trainable=False) embedding table initialised from the pretrained `embeddings` matrix.
word_emb_output = Embedding(n_words, dim_word, weights=[embeddings], trainable=False)(mask_word)
# word_emb_output = Dropout(dropout)(word_emb_output)

In [24]:
# The end-to-end paper claims to apply dropout to the character embeddings before feeding
# them to a CNN, in addition to dropout before both layers of the BLSTM.
# char_emb_input = Input((max_seq_length, max_word_length)) 
# Character branch: char ids arrive indexed by sentence, word and character.
char_emb_input = Input((None, None))
# Flatten the leading axes so each row is the character sequence of one word;
# only the last (character) axis is kept as-is.
char_emb_output = Lambda(lambda x: tf.keras.backend.reshape(x, (-1, tf.keras.backend.shape(x)[-1])))(char_emb_input)
mask_char = Masking(mask_value=9)(char_emb_output)  # 9 is the pad_tok used for char ids. TODO: make the mask value a variable
char_emb_output = Embedding(n_char, dim_char)(mask_char) # no pretrained `weights` are passed here; are they needed?
# The two directions are built by hand so forward and backward can be compared;
# the equivalent Bidirectional wrapper is kept below for reference.
# char_emb_output = Bidirectional(LSTM(hidden_size_char, return_sequences=False))(char_emb_output)
char_emb_output = Dropout(dropout)(char_emb_output)
# return_sequences=False: each LSTM emits a single vector per word.
fw_LSTM = LSTM(hidden_size_char, return_sequences=False)(char_emb_output) #is this right?
bw_LSTM = LSTM(hidden_size_char, return_sequences=False, go_backwards=True)(char_emb_output)
char_emb_output = concatenate([fw_LSTM, bw_LSTM])
char_emb_output = Dropout(dropout)(char_emb_output)
# Restore the per-sentence layout: the word-axis length is read from the dynamic shape of
# word_emb_input, and the feature width is 2 * hidden_size_char (forward + backward).
char_emb_output = Lambda(lambda x, z: tf.keras.backend.reshape(x, (-1, tf.shape(z)[1], 2 * hidden_size_char)), arguments={"z": word_emb_input})(char_emb_output)

In [25]:
# Concatenate the word embedding and the character-level representation of each word.
x = concatenate([word_emb_output, char_emb_output])

In [26]:
# Sentence-level tagger on top of the concatenated word + char features.
x = Dropout(dropout)(x)
# return_sequences=True keeps one output per word, which per-word tagging requires.
x = Bidirectional(LSTM(hidden_size_lstm, return_sequences=True))(x)  # should we turn this into two layers (fw and bw)?
# fw_LSTM_2 = LSTM(hidden_size_lstm, return_sequences=True)(x) #is this right?
# bw_LSTM_2 = LSTM(hidden_size_lstm, return_sequences=True, go_backwards=True)(x)
# x = concatenate([fw_LSTM_2, bw_LSTM_2])
x = Dropout(dropout)(x)
# scores = Dense(n_tags, activity_regularizer=regularizers.l1(0.001))(x) 
# Per-word scores over the n_tags classes, then softmax to turn them into probabilities.
scores = Dense(n_tags)(x) 
softmax = Activation("softmax")(scores)
# The CRF layer is instantiated but never connected to the graph: its call is commented out below.
crf_layer = CRF(n_tags)
# crf = crf_layer(scores) #should we add this to attach to the softmax model? with SGD and gradient clipping of 5.0?

In [27]:
# Two-input model: (word ids, char ids) -> per-word softmax over the n_tags classes.
model_softmax = Model([word_emb_input, char_emb_input], softmax) # inputs are the two Input tensors; should these be input?

In [28]:
# model_crf = Model([word_emb_input, char_emb_input], crf) #should these be input

In [29]:
# Print the layer-by-layer summary of the softmax model.
model_softmax.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, None)   0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None)         0           input_2[0][0]                    
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None)         0           lambda_1[0][0]                   
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    2800        masking_2[0][0]                  
__________________________________________________________________________________________________
dropout_1 

In [31]:
# model_crf.summary()

In [32]:
# Adam configured with lr and lr_decay from the hyperparameter cell.
adam_op = Adam(lr=lr, decay=lr_decay)
# sgd = SGD(lr=lr, momentum=momentum, decay=lr_decay)

In [33]:
# The targets fed to fit() are one-hot encoded (labels_arr_one_hot), hence categorical_crossentropy.
model_softmax.compile(optimizer=adam_op, loss='categorical_crossentropy', metrics=['accuracy'])

In [34]:
# model_crf.compile(loss=crf_layer.loss_function, optimizer=adam_op, metrics=["accuracy"])

In [35]:
#train
char_ids_arr = np.array(char_ids)
word_ids_arr = np.array(word_ids)
labels_arr = np.array(labels)
labels_arr_one_hot = np.eye(10)[labels] #10 if vocab_tags are different

In [36]:
#dev/validation
char_ids_arr_dev = np.array(char_ids_dev)
word_ids_arr_dev = np.array(word_ids_dev)
labels_arr_dev = np.array(labels_dev)
labels_arr_one_hot_dev = np.eye(10)[labels_dev] #10 if vocab_tags are different

In [37]:
#test
char_ids_arr_test = np.array(char_ids_test)
word_ids_arr_test = np.array(word_ids_test)
labels_arr_test = np.array(labels_test)
labels_arr_one_hot_test = np.eye(10)[labels_test] #10 if vocab_tags are different

In [38]:
# date = datetime.strftime(datetime.today(), "%y%m%d_%H%M%S")
# base_dir = f"models/{date}"
# if not os.path.exists(base_dir):
#     os.makedirs(base_dir)
# model_checkpoint = tf.keras.callbacks.ModelCheckpoint(base_dir + "/{val_loss}_{epoch:03d}.hdf5")
# tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')
# callbacks = [model_checkpoint, tb_callback]
# callbacks = [clr]
# print(base_dir)

In [40]:
# TODO: add callbacks:
#   - early stopping and saving the best parameters
#   - learning rate decay
#   - tensorboard
#   - number of epochs without improvement is 0 (for early stopping)
#   - gradient clipping could be added (optional)
# Train on the full padded training arrays and validate on the dev arrays after every epoch.
model_softmax.fit([word_ids_arr, char_ids_arr], labels_arr_one_hot, batch_size=batch_size, epochs=nepochs, validation_data=([word_ids_arr_dev, char_ids_arr_dev], labels_arr_one_hot_dev)) # alternative: validation_split=0.3
# Keras signatures kept for reference:
#fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None)
#fit_generator(self, generator, steps_per_epoch=None, epochs=1, verbose=1, callbacks=None, validation_data=None, validation_steps=None, class_weight=None, max_queue_size=10, workers=1, use_multiprocessing=False, shuffle=True, initial_epoch=0)

# model_softmax.save(f"{base_dir}/train_softmax.hdf5") #final_softmax

Train on 14041 samples, validate on 3250 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7ff89c0290f0>

In [41]:
# Persist the trained weights; they are reloaded into the same model object further down.
model_softmax.save_weights("softmax_test_5_31_1.hdf5")

In [42]:
# model_crf.fit([word_ids_arr, char_ids_arr], labels_arr_one_hot, batch_size=batch_size, epochs=nepochs, validation_split=0.33)

In [43]:
# model_crf.save("crf_with_val.hdf5")

In [44]:
# base_dir  #models/180222_215523

In [45]:
# Reload the weights saved after training, so evaluation can start from this cell without re-running fit().
model_softmax.load_weights("softmax_test_5_31_1.hdf5")
# model_softmax.load_weights(f"{base_dir}/train_softmax.hdf5")#"models/180222_215523/final_softmax.hdf5")#"0.11342436582348703_050.hdf5")

In [46]:
# prediction_softmax = model_softmax.predict([word_ids_arr, char_ids_arr])

In [47]:
# model_crf.load_weights("crf_with_val.hdf5")

In [48]:
# prediction_crf = model_crf.predict([word_ids_arr, char_ids_arr])

In [49]:
# print(prediction_crf)

In [50]:
# evaluate the model
# scores_crf = model_crf.evaluate([word_ids_arr, char_ids_arr], labels_arr_one_hot) #x_test, y_test (when testing)
# print("%s: %.2f%%" % (model_crf.metrics_names[1], scores_crf[1]*100))

In [51]:
# scores_softmax = model_softmax.evaluate([word_ids_arr, char_ids_arr], labels_arr_one_hot) #x_test, y_test (when testing)
# print("%s: %.2f%%" % (model_softmax.metrics_names[1], scores_softmax[1]*100))

In [52]:
#train F1 evaluation

In [53]:
##Evaluate Training##
def extract_data(dataset):
    """Turn a whole CoNLL dataset into padded NumPy arrays.

    The dataset is pulled out as a single minibatch covering every
    sentence; word ids, char ids and labels are then each padded with
    the value 9.

    Args:
      dataset: A CoNLL dataset.

    Returns:
      A 4-tuple of NumPy arrays: padded word ids, padded char ids,
      padded labels, and the sequence lengths that `pad_sequences`
      reports for the word ids.
    """
    # A batch size of len(dataset) makes the first minibatch the full dataset.
    batches = list(minibatches(dataset, len(dataset)))
    sentences, tags = batches[0]
    # Each sentence entry unzips into (char ids, word ids), in that order.
    chars_by_sentence, words_by_sentence = zip(*sentences)

    padded_words, sentence_lengths = pad_sequences(words_by_sentence, pad_tok=9)
    padded_chars, _ = pad_sequences(chars_by_sentence, pad_tok=9, nlevels=2)
    padded_tags, _ = pad_sequences(tags, pad_tok=9)

    # TODO: add one-hot encoding of labels
    return (
        np.array(padded_words),
        np.array(padded_chars),
        np.array(padded_tags),
        np.array(sentence_lengths),
    )


def predict_labels(model, word_ids_arr, char_ids_arr, seq_lens_arr, batch_size=32):
    """Run the model and return the most probable tag id for every position.

    Args:
      model: An object whose `predict` takes `[word ids, char ids]` plus a
        batch size and returns per-position tag probabilities.
      word_ids_arr: NumPy array of padded word ids, one row per sentence.
      char_ids_arr: NumPy array of padded char ids, indexed by sentence,
        word and character.
      seq_lens_arr: Sentence lengths. Accepted for interface symmetry with
        `compute_metrics`; this function does not read it, so padded
        positions are predicted as well.
      batch_size: Batch size handed to `model.predict`.

    Returns:
      A NumPy array with the argmax over the last (tag) axis of the
      model output, i.e. one predicted tag id per padded position.
    """
    model_inputs = [word_ids_arr, char_ids_arr]
    tag_probs = model.predict(model_inputs, batch_size)
    return np.argmax(tag_probs, axis=-1)


def compute_metrics(labels_arr, labels_pred_arr, seq_lens_arr, vocab_tags):
    """Compute token-level accuracy and chunk-level F1.

    Precision, recall and the number of gold chunks are printed as a side
    effect. The input arrays are not modified.

    Args:
      labels_arr: A NumPy array of correct tags, one padded row per sentence.
      labels_pred_arr: A NumPy array of predicted tags, one padded row per
        sentence.
      seq_lens_arr: One actual sentence length per row; used to cut the
        padding off both label rows.
      vocab_tags: Dictionary of tag strings to tag numbers.

    Returns:
      Dictionary with accuracy `acc` and F1 score `f1`, both as
      percentages. Both are 0 when there is nothing to score.
    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.

    for lab, lab_pred, seq_len in zip(labels_arr, labels_pred_arr, seq_lens_arr):
        # NOTE: labels & predictions are padded to the maximum number of words
        # in the batch.  Here, we use the actual sentence lengths to select out
        # the actual labels and corresponding predictions.
        lab = lab[:seq_len]
        lab_pred = np.asarray(lab_pred[:seq_len])
        # Id 9 is the padding label class, not a real tag; a 9 predicted inside
        # a sentence is scored as tag id 0. np.where builds a new array, so the
        # caller's prediction array is no longer overwritten through the slice view.
        lab_pred = np.where(lab_pred == 9, 0, lab_pred)

        accs += [a==b for (a, b) in zip(lab, lab_pred)]

        lab_chunks      = set(get_chunks(lab, vocab_tags))
        lab_pred_chunks = set(get_chunks(lab_pred, vocab_tags))

        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds   += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    p   = correct_preds / total_preds if total_preds > 0 else 0
    r   = correct_preds / total_correct if total_correct > 0 else 0
    f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
    # Guard the empty case like p/r/f1 above; np.mean([]) would yield nan with a warning.
    acc = np.mean(accs) if accs else 0.0

    print ({"precision": p})
    print ({"recall": r})
    print ({"total_correct": total_correct})
    return {"acc": 100*acc, "f1": 100*f1}

In [54]:
# Training-set evaluation: rebuild the padded arrays, predict tags, then score them.
# NOTE: this rebinds word_ids_arr / char_ids_arr / labels_arr from the training-array cell.
word_ids_arr, char_ids_arr, labels_arr, seq_lens_arr = extract_data(train) 
labels_pred_arr = predict_labels(model_softmax, word_ids_arr, char_ids_arr, seq_lens_arr)
metrics = compute_metrics(labels_arr, labels_pred_arr, seq_lens_arr, vocab_tags)
print(metrics)

{'precision': 0.8365480907388863}
{'recall': 0.8176092599685093}
{'total_correct': 23499.0}
{'acc': 97.19920833312871, 'f1': 82.69702578229243}


In [55]:
##Evaluate Dev##
def extract_data(dataset):
    """Extract words and labels from a dataset.

    The dataset is read as a single minibatch covering every sentence;
    word ids, char ids and labels are then each padded with the value 9.

    Args:
      dataset: A CoNLL dataset.

    Returns:
      Word ids, char ids, labels and sequence lengths from the CoNLL
      dataset, all as NumPy arrays.
    """
    # Read from the `dataset` argument, not the notebook-level `dev` variable,
    # so the function returns data for whichever split it is given.
    words, labels = list(minibatches(dataset, len(dataset)))[0]
    # Each element of `words` unzips into (char ids, word ids), in that order.
    char_ids, word_ids = zip(*words)
    word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=9)
    char_ids, _ = pad_sequences(char_ids, pad_tok=9, nlevels=2)
    labels, _ = pad_sequences(labels, pad_tok=9)

    word_ids_arr = np.array(word_ids)
    char_ids_arr = np.array(char_ids)
    labels_arr = np.array(labels)
    # TODO: add one-hot encoding of labels
    seq_lens_arr = np.array(sequence_lengths)
    return word_ids_arr, char_ids_arr, labels_arr, seq_lens_arr


def predict_labels(model, word_ids_arr_dev, char_ids_arr_dev, seq_lens_arr_dev, batch_size=32):
    """Run the model and return the most probable tag id for every position.

    Args:
      model: An object whose `predict` takes `[word ids, char ids]` plus a
        batch size and returns per-position tag probabilities.
      word_ids_arr_dev: NumPy array of padded word ids, one row per sentence.
      char_ids_arr_dev: NumPy array of padded char ids, indexed by sentence,
        word and character.
      seq_lens_arr_dev: Sentence lengths. Accepted for interface symmetry
        with `compute_metrics`; this function does not read it, so padded
        positions are predicted as well.
      batch_size: Batch size handed to `model.predict`.

    Returns:
      A NumPy array with the argmax over the last (tag) axis of the
      model output, i.e. one predicted tag id per padded position.
    """
    model_inputs = [word_ids_arr_dev, char_ids_arr_dev]
    tag_probs = model.predict(model_inputs, batch_size)
    return np.argmax(tag_probs, axis=-1)


def compute_metrics(labels_arr_dev, labels_pred_arr_dev, seq_lens_arr_dev, vocab_tags):
    """Compute token-level accuracy and chunk-level F1.

    Precision, recall and the number of gold chunks are printed as a side
    effect. The input arrays are not modified.

    Args:
      labels_arr_dev: A NumPy array of correct tags, one padded row per
        sentence.
      labels_pred_arr_dev: A NumPy array of predicted tags, one padded row
        per sentence.
      seq_lens_arr_dev: One actual sentence length per row; used to cut the
        padding off both label rows.
      vocab_tags: Dictionary of tag strings to tag numbers.

    Returns:
      Dictionary with accuracy `acc` and F1 score `f1`, both as
      percentages. Both are 0 when there is nothing to score.
    """
    accs_dev = []
    correct_preds_dev, total_correct_dev, total_preds_dev = 0., 0., 0.

    for lab_dev, lab_pred_dev, seq_len_dev in zip(labels_arr_dev, labels_pred_arr_dev, seq_lens_arr_dev):
        # NOTE: labels & predictions are padded to the maximum number of words
        # in the batch.  Here, we use the actual sentence lengths to select out
        # the actual labels and corresponding predictions.
        lab_dev = lab_dev[:seq_len_dev]
        lab_pred_dev = np.asarray(lab_pred_dev[:seq_len_dev])
        # Id 9 is the padding label class, not a real tag; a 9 predicted inside
        # a sentence is scored as tag id 0. np.where builds a new array, so the
        # caller's prediction array is no longer overwritten through the slice view.
        lab_pred_dev = np.where(lab_pred_dev == 9, 0, lab_pred_dev)

        accs_dev += [a==b for (a, b) in zip(lab_dev, lab_pred_dev)]

        lab_chunks_dev = set(get_chunks(lab_dev, vocab_tags))
        lab_pred_chunks_dev = set(get_chunks(lab_pred_dev, vocab_tags))

        correct_preds_dev += len(lab_chunks_dev & lab_pred_chunks_dev)
        total_preds_dev   += len(lab_pred_chunks_dev)
        total_correct_dev += len(lab_chunks_dev)

    p_dev   = correct_preds_dev / total_preds_dev if total_preds_dev > 0 else 0
    r_dev   = correct_preds_dev / total_correct_dev if total_correct_dev > 0 else 0
    f1_dev  = 2 * p_dev * r_dev / (p_dev + r_dev) if correct_preds_dev > 0 else 0
    # Guard the empty case like p/r/f1 above; np.mean([]) would yield nan with a warning.
    acc_dev = np.mean(accs_dev) if accs_dev else 0.0

    print ({"precision": p_dev})
    print ({"recall": r_dev})
    print ({"total_correct": total_correct_dev})
    return {"acc": 100*acc_dev, "f1": 100*f1_dev}

In [56]:
#dev
word_ids_arr_dev, char_ids_arr_dev, labels_arr_dev, seq_lens_arr_dev = extract_data(dev) 
labels_pred_arr_dev = predict_labels(model_softmax, word_ids_arr_dev, char_ids_arr_dev, seq_lens_arr_dev)
metrics = compute_metrics(labels_arr_dev, labels_pred_arr_dev, seq_lens_arr_dev, vocab_tags)
print(metrics)

{'precision': 0.7537740760020822}
{'recall': 0.7310669808145406}
{'total_correct': 5942.0}
{'acc': 95.3175499396441, 'f1': 74.2246903032892}


In [57]:
##Evaluate Test##
def extract_data(dataset):
    """Extract words and labels from a dataset.

    The dataset is read as a single minibatch covering every sentence;
    word ids, char ids and labels are then each padded with the value 9.

    Args:
      dataset: A CoNLL dataset.

    Returns:
      Word ids, char ids, labels and sequence lengths from the CoNLL
      dataset, all as NumPy arrays.
    """
    # Read from the `dataset` argument, not the notebook-level `test` variable,
    # so the function returns data for whichever split it is given.
    words, labels = list(minibatches(dataset, len(dataset)))[0]
    # Each element of `words` unzips into (char ids, word ids), in that order.
    char_ids, word_ids = zip(*words)
    word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=9)
    char_ids, _ = pad_sequences(char_ids, pad_tok=9, nlevels=2)
    labels, _ = pad_sequences(labels, pad_tok=9)

    word_ids_arr = np.array(word_ids)
    char_ids_arr = np.array(char_ids)
    labels_arr = np.array(labels)
    # TODO: add one-hot encoding of labels
    seq_lens_arr = np.array(sequence_lengths)
    return word_ids_arr, char_ids_arr, labels_arr, seq_lens_arr


def predict_labels(model, word_ids_arr_test, char_ids_arr_test, seq_lens_arr_test, batch_size=32):
    """Run the model and return the most probable tag id for every position.

    Args:
      model: An object whose `predict` takes `[word ids, char ids]` plus a
        batch size and returns per-position tag probabilities.
      word_ids_arr_test: NumPy array of padded word ids, one row per sentence.
      char_ids_arr_test: NumPy array of padded char ids, indexed by sentence,
        word and character.
      seq_lens_arr_test: Sentence lengths. Accepted for interface symmetry
        with `compute_metrics`; this function does not read it, so padded
        positions are predicted as well.
      batch_size: Batch size handed to `model.predict`.

    Returns:
      A NumPy array with the argmax over the last (tag) axis of the
      model output, i.e. one predicted tag id per padded position.
    """
    model_inputs = [word_ids_arr_test, char_ids_arr_test]
    tag_probs = model.predict(model_inputs, batch_size)
    return np.argmax(tag_probs, axis=-1)


def compute_metrics(labels_arr_test, labels_pred_arr_test, seq_lens_arr_test, vocab_tags):
    """Compute token-level accuracy and chunk-level F1.

    Precision, recall and the number of gold chunks are printed as a side
    effect. The input arrays are not modified.

    Args:
      labels_arr_test: A NumPy array of correct tags, one padded row per
        sentence.
      labels_pred_arr_test: A NumPy array of predicted tags, one padded row
        per sentence.
      seq_lens_arr_test: One actual sentence length per row; used to cut the
        padding off both label rows.
      vocab_tags: Dictionary of tag strings to tag numbers.

    Returns:
      Dictionary with accuracy `acc` and F1 score `f1`, both as
      percentages. Both are 0 when there is nothing to score.
    """
    accs_test = []
    correct_preds_test, total_correct_test, total_preds_test = 0., 0., 0.

    for lab_test, lab_pred_test, seq_len_test in zip(labels_arr_test, labels_pred_arr_test, seq_lens_arr_test):
        # NOTE: labels & predictions are padded to the maximum number of words
        # in the batch.  Here, we use the actual sentence lengths to select out
        # the actual labels and corresponding predictions.
        lab_test = lab_test[:seq_len_test]
        lab_pred_test = np.asarray(lab_pred_test[:seq_len_test])
        # Id 9 is the padding label class, not a real tag; a 9 predicted inside
        # a sentence is scored as tag id 0. np.where builds a new array, so the
        # caller's prediction array is no longer overwritten through the slice view.
        lab_pred_test = np.where(lab_pred_test == 9, 0, lab_pred_test)

        accs_test += [a==b for (a, b) in zip(lab_test, lab_pred_test)]

        lab_chunks_test = set(get_chunks(lab_test, vocab_tags))
        lab_pred_chunks_test = set(get_chunks(lab_pred_test, vocab_tags))

        correct_preds_test += len(lab_chunks_test & lab_pred_chunks_test)
        total_preds_test   += len(lab_pred_chunks_test)
        total_correct_test += len(lab_chunks_test)

    p_test   = correct_preds_test / total_preds_test if total_preds_test > 0 else 0
    r_test   = correct_preds_test / total_correct_test if total_correct_test > 0 else 0
    f1_test  = 2 * p_test * r_test / (p_test + r_test) if correct_preds_test > 0 else 0
    # Guard the empty case like p/r/f1 above; np.mean([]) would yield nan with a warning.
    acc_test = np.mean(accs_test) if accs_test else 0.0

    print ({"precision": p_test})
    print ({"recall": r_test})
    print ({"total_correct": total_correct_test})
    return {"acc": 100*acc_test, "f1": 100*f1_test}

In [58]:
#test
word_ids_arr_test, char_ids_arr_test, labels_arr_test, seq_lens_arr_test = extract_data(test) 
labels_pred_arr_test = predict_labels(model_softmax, word_ids_arr_test, char_ids_arr_test, seq_lens_arr_test)
metrics = compute_metrics(labels_arr_test, labels_pred_arr_test, seq_lens_arr_test, vocab_tags)
print(metrics)

{'precision': 0.6987951807228916}
{'recall': 0.6880311614730878}
{'total_correct': 5648.0}
{'acc': 94.16819209647895, 'f1': 69.33713979837631}


---

In [59]:
# Count how often each tag id is predicted over the whole padded training prediction array.
# Positions beyond each sentence's length are included, which likely explains the large count for id 9.
# NOTE(review): an earlier note here said the predictor only seems to predict 'O' and 'B-PER';
# the counts displayed below include other ids as well — re-check that observation.
# labels_pred_seq_lens_arr = (labels_pred_arr[:seq_lens_arr])
unique, counts = np.unique(labels_pred_arr, return_counts=True) # labels_pred_arr is the NumPy array returned by predict_labels
dict(zip(unique, counts))

{0: 171982,
 2: 11101,
 3: 7,
 4: 3768,
 5: 23,
 6: 9274,
 7: 15,
 8: 8184,
 9: 1382279}

In [60]:
# Same count of predicted tag ids for the dev set (padded positions included).
unique_dev, counts_dev = np.unique(labels_pred_arr_dev, return_counts=True) # labels_pred_arr_dev is the NumPy array returned by predict_labels
dict(zip(unique_dev, counts_dev))

{0: 43608, 2: 3033, 4: 1007, 6: 1895, 8: 1966, 9: 302741}

In [61]:
# Same count of predicted tag ids for the test set (padded positions included).
unique_test, counts_test = np.unique(labels_pred_arr_test, return_counts=True) # labels_pred_arr_test is the NumPy array returned by predict_labels
dict(zip(unique_test, counts_test))

{0: 39173, 2: 2681, 3: 1, 4: 828, 6: 2345, 8: 1757, 9: 381387}