# NER Model

In [None]:
from datetime import datetime
import os

from clr_callback import *
import keras
from keras import regularizers
from keras.optimizers import *
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
import sys
import tensorflow as tf
from tensorflow.python.keras.layers import Flatten, Dense, Embedding, Dropout, Bidirectional, LSTM, Concatenate, Reshape, Lambda, Input, Activation, Masking
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.models import Model, load_model
from tensorflow.python.keras.optimizers import Adam, SGD
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import one_hot
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from keras_contrib.layers import CRF
from model.data_utils import get_trimmed_glove_vectors, load_vocab, get_processing_word, CoNLLDataset, get_trimmed_glove_vectors, load_vocab, get_processing_word, minibatches, get_chunks, pad_sequences
from model.ner_model import NERModel
from model.config import Config 

## Data and Hyper-Parameters

In [None]:
#download data
#source: https://github.com/synalp/NER
# Paths to the three CoNLL English splits (.iob files): the training split,
# "testa" (used below as the dev/validation set) and "testb" (used below as
# the test set).
train_filename = "data/coNLL/eng/eng.train.iob"
dev_filename = "data/coNLL/eng/eng.testa.iob"
test_filename = "data/coNLL/eng/eng.testb.iob"

In [None]:
# Forwarded as `chars=` to get_processing_word when the datasets are built.
use_chars = True
# Forwarded as the last positional argument of CoNLLDataset; presumably a cap
# on the number of sentences read, with None meaning "no cap" — TODO confirm.
max_iter = None

In [None]:
# NOTE: this order could be arbitrary, with values in the interval [0, num_tags]
# ALSO: there should be a difference between a null tag, and a padded label
# Vocabularies are read from text files by load_vocab; presumably each maps a
# string (tag / character / word) to an integer id — confirm in model.data_utils.
vocab_tags = load_vocab("data/tags.txt")
vocab_chars = load_vocab("data/chars.txt")
vocab_words = load_vocab("data/words.txt")
# Sizes used for the embedding tables and the output layer further down.
n_words = len(vocab_words)
n_char = len(vocab_chars)
# One extra class is reserved for the padding label (labels are padded with 9 below).
n_tags = (len(vocab_tags)+1) #+1 if different vocab_tags (need to add one for padding of value 9)

In [None]:
#coNLL data for validation
# Validation set. The first processing function is applied to words
# (lowercase=True, chars=use_chars); the second is applied to tags
# (lowercase=False, allow_unk=False). max_iter is passed through unchanged.
dev = CoNLLDataset(dev_filename, get_processing_word(vocab_words, vocab_chars,lowercase=True, chars=use_chars),
                  get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

In [None]:
# coNLL data for train
# Training set. The first processing function is applied to words
# (lowercase=True, chars=use_chars); the second is applied to tags
# (lowercase=False, allow_unk=False). max_iter is passed through unchanged.
train = CoNLLDataset(train_filename, get_processing_word(vocab_words, vocab_chars,lowercase=True, chars=use_chars),
                  get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

In [None]:
# coNLL data for test
# Test set. The first processing function is applied to words
# (lowercase=True, chars=use_chars); the second is applied to tags
# (lowercase=False, allow_unk=False). max_iter is passed through unchanged.
test = CoNLLDataset(test_filename, get_processing_word(vocab_words, vocab_chars,lowercase=True, chars=use_chars),
                  get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

In [None]:
def get_glove_vocab(filename):
    """Collect the set of tokens that appear in a GloVe text file.

    Every non-blank line is expected to look like ``<token> <v1> <v2> ...``;
    only the first space-separated field of the stripped line is kept.
    Blank lines are skipped so that they do not add an empty-string token.
    Progress messages are printed to stdout.

    Args:
        filename: path to the glove vectors

    Returns:
        vocab: set() of strings
    """
    print("Building vocab...")
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(filename, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        vocab = {line.split(' ')[0] for line in stripped_lines if line}
    print("- done. {} tokens".format(len(vocab)))
    return vocab

In [None]:
# Open the trimmed GloVe archive (.npz); individual arrays are read from it by key.
emb_data = np.load("data/glove.6B.300d.trimmed.npz")

In [None]:
# Embedding matrix used to initialise the word Embedding layer below;
# presumably one row per entry of vocab_words — TODO confirm.
embeddings = emb_data["embeddings"]

In [None]:
### Hyperparameters
# All values below are read as notebook globals by the model-building,
# optimizer and fit cells.
dim_word = 300 #End to end paper uses 30
dim_char = 100 # size of the trainable character embedding
hidden_size_char = 100 # lstm on chars
hidden_size_lstm = 300 # lstm on word embeddings
nepochs = 85 #End to end paper saw best results at 50 epochs
lr = 0.0105 #End to end uses learning rate of 0.01 for POS tagging and 0.015 for NER where lr is updated on each epoch with decay rate 0.05
lr_decay = 0.0005 #lr/nepochs #0.05 #GG uses 0.9; paper uses 0.05
batch_size = 10 #20 #End to end paper uses 10 #eval at 32
dropout = 0.5 # needs to be set before Dropout function- GG 0.5
### If using SGD instead of Adam:
# momentum=0.005

In [None]:
#train
# TODO: make use of minibatches with fit_generator
# Pull the whole training set out as one batch (batch size == dataset size).
words, labels = list(minibatches(train, len(train)))[0]  # NOTE: len(train) will return entire dataset!
# Each element of `words` is unpacked as a (char ids, word ids) pair.
char_ids, word_ids = zip(*words)
# Pad words, characters (two nesting levels) and labels with the id 9.
# NOTE(review): 9 may also be a real word/char id in the vocabularies — confirm.
word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=9) 
char_ids, word_lengths = pad_sequences(char_ids, pad_tok=9, nlevels=2)
labels, _ = pad_sequences(labels, pad_tok=9)

In [None]:
#validation/dev
# Pull the whole dev set out as one batch (batch size == dataset size).
words_dev, labels_dev = list(minibatches(dev, len(dev)))[0]  
# Each element of `words_dev` is unpacked as a (char ids, word ids) pair.
char_ids_dev, word_ids_dev = zip(*words_dev)
# Pad words, characters (two nesting levels) and labels with the id 9.
word_ids_dev, sequence_lengths_dev = pad_sequences(word_ids_dev, pad_tok=9)
char_ids_dev, word_lengths_dev = pad_sequences(char_ids_dev, pad_tok=9, nlevels=2)
labels_dev, _ = pad_sequences(labels_dev, pad_tok=9)

In [None]:
#test
# Pull the whole test set out as one batch (batch size == dataset size).
words_test, labels_test = list(minibatches(test, len(test)))[0]  
# Each element of `words_test` is unpacked as a (char ids, word ids) pair.
char_ids_test, word_ids_test = zip(*words_test)
# Pad words, characters (two nesting levels) and labels with the id 9.
word_ids_test, sequence_lengths_test = pad_sequences(word_ids_test, pad_tok=9)
char_ids_test, word_lengths_test = pad_sequences(char_ids_test, pad_tok=9, nlevels=2)
labels_test, _ = pad_sequences(labels_test, pad_tok=9)

## Model Architecture

In [None]:
### First 'branch' inputs word embeddings
# One variable-length sequence of word ids per sentence.
word_emb_input = Input((None,))
# 9 is the id the padding cells use for padded positions.
# NOTE(review): Masking is applied to raw ids *before* the Embedding, and the
# Embedding is created without mask_zero — confirm that padded positions are
# actually masked for the downstream LSTM.
mask_word = Masking(mask_value=9)(word_emb_input)
# Frozen (trainable=False) embedding table initialised from `embeddings`.
word_emb_output = Embedding(n_words, dim_word, weights=[embeddings], trainable=False)(mask_word)

In [None]:
### Second 'branch' inputs character embeddings
### Note: end to end paper claims to have applied dropout layer on character embeddings before inputting to a CNN in addition to before both layers of BLSTM
# Two variable-length axes after the batch axis (words per sentence, chars per word).
char_emb_input = Input((None, None))
### Reshape: the input arrives as (sentences, words, characters). The character LSTM should run over
### each word's characters, so sentences and words are merged into one leading axis: (words, characters).
char_emb_output = Lambda(lambda x: tf.keras.backend.reshape(x, (-1, tf.keras.backend.shape(x)[-1])))(char_emb_input)
# NOTE(review): same Masking-before-Embedding pattern as the word branch — confirm it has the intended effect.
mask_char = Masking(mask_value=9)(char_emb_output)  # TODO: make the mask value (9) a variable
# Trainable character embedding, randomly initialised (no pretrained weights are passed).
char_emb_output = Embedding(n_char, dim_char)(mask_char) #need weights here?
char_emb_output = Dropout(dropout)(char_emb_output)
### Use bidirectional LSTM or two layers: one forward LSTM, one backward LSTM. Better results with the two layers.
# return_sequences=False: one vector per word, of width 2 * hidden_size_char after the Bidirectional wrapper.
char_emb_output = Bidirectional(LSTM(hidden_size_char, return_sequences=False))(char_emb_output)
# fw_LSTM = LSTM(hidden_size_char, return_sequences=False)(char_emb_output) 
# bw_LSTM = LSTM(hidden_size_char, return_sequences=False, go_backwards=True)(char_emb_output)
# char_emb_output = concatenate([fw_LSTM, bw_LSTM])
### Use dropout to prevent overfitting (as a regularizer)
char_emb_output = Dropout(dropout)(char_emb_output)
### Reshape back to (sentences, words, 2 * hidden_size_char); the number of words is read from the word-id input `z`.
char_emb_output = Lambda(lambda x, z: tf.keras.backend.reshape(x, (-1, tf.shape(z)[1], 2 * hidden_size_char)), arguments={"z": word_emb_input})(char_emb_output)

In [None]:
# Concatenate the word embedding and the character-LSTM output for each word.
x = concatenate([word_emb_output, char_emb_output])
x = Dropout(dropout)(x)
### Use bidirectional LSTM or two layers: one forward LSTM, one backward LSTM. Better results with bidirectional LSTM here.
# return_sequences=True keeps one output vector per word, as needed for per-word tagging.
x = Bidirectional(LSTM(hidden_size_lstm, return_sequences=True))(x)  #should we turn this into two layers (fw and bw)?
# fw_LSTM_2 = LSTM(hidden_size_lstm, return_sequences=True)(x) #is this right?
# bw_LSTM_2 = LSTM(hidden_size_lstm, return_sequences=True, go_backwards=True)(fw_LSTM_2)
# x = concatenate([fw_LSTM_2, bw_LSTM_2])
### Use dropout to prevent overfitting (as a regularizer)
x = Dropout(dropout)(x)
# One score per tag class (n_tags includes the extra padding class) for every word.
scores = Dense(n_tags)(x)
### Activation Function
softmax = Activation("softmax")(scores)
### If implementing CRF
# crf_layer = CRF(n_tags)
# crf = crf_layer(scores) 

In [None]:
# Two-input model: inputs are given in the order [word ids, char ids]; the output is per-word tag probabilities.
model_softmax = Model([word_emb_input, char_emb_input], softmax) 

In [None]:
# model_crf = Model([word_emb_input, char_emb_input], crf) 

In [None]:
## Optimizers: Adam shows best results
# Uses the `lr` and `lr_decay` hyperparameters defined in the hyperparameter cell.
adam_op = Adam(lr=lr, decay=lr_decay)
# Alternatives that were tried (kept for reference):
# sgd = SGD(lr=lr, momentum=momentum, decay=lr_decay)
# adagrad = Adagrad(lr=0.0105, epsilon=None, decay=0.0005)
# rms = RMSprop(lr=0.0105, rho=0.9, epsilon=None, decay=0.0005)

In [None]:
# categorical_crossentropy expects one-hot targets, which is why the labels are one-hot encoded before fitting.
# NOTE(review): padded positions carry the padding class as a target, so they presumably contribute to the loss and accuracy — confirm.
model_softmax.compile(optimizer=adam_op, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# model_crf.compile(loss=crf_layer.loss_function, optimizer=adam_op, metrics=["accuracy"])

## Change to NumPy Arrays and One Hot Encoding

In [None]:
### Convert the padded training data to NumPy arrays and one-hot encode the labels.
#train
char_ids_arr = np.array(char_ids)
word_ids_arr = np.array(word_ids)
labels_arr = np.array(labels)
# The one-hot width is tied to n_tags (the width of the model's Dense output)
# instead of a hard-coded 10, so targets and predictions cannot drift apart
# when the tag vocabulary changes. Indexing the identity matrix with the
# integer label array yields one one-hot row per label.
labels_arr_one_hot = np.eye(n_tags)[labels_arr]

In [None]:
#dev/validation
# Convert the padded dev data to NumPy arrays and one-hot encode the labels.
char_ids_arr_dev = np.array(char_ids_dev)
word_ids_arr_dev = np.array(word_ids_dev)
labels_arr_dev = np.array(labels_dev)
# The one-hot width is tied to n_tags (the width of the model's Dense output)
# instead of a hard-coded 10, so targets and predictions cannot drift apart
# when the tag vocabulary changes.
labels_arr_one_hot_dev = np.eye(n_tags)[labels_arr_dev]

In [None]:
#test
# Convert the padded test data to NumPy arrays and one-hot encode the labels.
char_ids_arr_test = np.array(char_ids_test)
word_ids_arr_test = np.array(word_ids_test)
labels_arr_test = np.array(labels_test)
# The one-hot width is tied to n_tags (the width of the model's Dense output)
# instead of a hard-coded 10, so targets and predictions cannot drift apart
# when the tag vocabulary changes.
labels_arr_one_hot_test = np.eye(n_tags)[labels_arr_test]

In [None]:
### Optional: Early Stopping and Callbacks
# date = datetime.strftime(datetime.today(), "%y%m%d_%H%M%S")
# base_dir = f"models/{date}"
# if not os.path.exists(base_dir):
#     os.makedirs(base_dir)
# model_checkpoint = tf.keras.callbacks.ModelCheckpoint(base_dir + "/{val_loss}_{epoch:03d}.hdf5")
### Add callbacks to tensorboard
# # tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')
# callbacks = [model_checkpoint] #, tb_callback

## Fit Model and Save Weights

In [None]:
### Fit model.
# Inputs are passed in the same order as the Model definition ([word ids, char ids]);
# the dev arrays are used as validation data after each epoch.
model_softmax.fit([word_ids_arr, char_ids_arr], labels_arr_one_hot, batch_size=batch_size, epochs=nepochs, validation_data=([word_ids_arr_dev, char_ids_arr_dev], labels_arr_one_hot_dev)) # alternative to validation_data: validation_split=0.3
### Parameters for reference:
### fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None)

# model_softmax.save(f"{base_dir}/train_softmax.hdf5") #final_softmax

In [None]:
# Persist only the weights (not the architecture) to the working directory.
model_softmax.save_weights("softmax_test_6_17.hdf5")

In [None]:
# model_crf.fit([word_ids_arr, char_ids_arr], labels_arr_one_hot, batch_size=batch_size, epochs=nepochs, validation_split=0.33)

In [None]:
# model_crf.save("crf.hdf5")

In [None]:
# Reload the weights written by the save_weights cell; the model must already be built with the same architecture.
model_softmax.load_weights("softmax_test_6_17.hdf5")
# model_softmax.load_weights(f"{base_dir}/train_softmax.hdf5")#"models/180222_215523/final_softmax.hdf5")#"0.11342436582348703_050.hdf5")

In [None]:
# model_crf.load_weights("crf_with_val.hdf5")

## Evaluate Model on train, dev and test:

In [None]:
##Evaluate Training##
def extract_data(dataset):
    """Turn a whole dataset into padded NumPy arrays.

    The dataset is read as a single batch whose size equals ``len(dataset)``
    and every sequence is padded with the id 9.

    Args:
      dataset: A CoNLL dataset.

    Returns:
      A 4-tuple of NumPy arrays: word ids, char ids, labels, and the
      sentence lengths reported by ``pad_sequences`` for the word ids.
    """
    # A batch size equal to the dataset size yields everything in the first batch.
    batch_words, batch_labels = list(minibatches(dataset, len(dataset)))[0]
    # Each element of the batch is a (char ids, word ids) pair.
    raw_char_ids, raw_word_ids = zip(*batch_words)

    padded_words, sentence_lengths = pad_sequences(raw_word_ids, pad_tok=9)
    padded_chars, _ = pad_sequences(raw_char_ids, pad_tok=9, nlevels=2)
    padded_labels, _ = pad_sequences(batch_labels, pad_tok=9)

    # TODO: add one-hot encoding of labels
    return (
        np.array(padded_words),
        np.array(padded_chars),
        np.array(padded_labels),
        np.array(sentence_lengths),
    )


def predict_labels(model, word_ids_arr, char_ids_arr, seq_lens_arr, batch_size=32):
    """Return the highest-scoring tag id for every position of every sentence.

    Args:
      model: Object exposing ``predict(inputs, batch_size)``; it is called
        with the two-element list ``[word_ids_arr, char_ids_arr]``.
      word_ids_arr: Word ids, forwarded to the model unchanged.
      char_ids_arr: Char ids, forwarded to the model unchanged.
      seq_lens_arr: Accepted for signature symmetry with the other
        evaluation helpers; not used by this function.
      batch_size: Forwarded positionally to ``model.predict``.

    Returns:
      The arg-max over the last axis of the model output.
    """
    model_inputs = [word_ids_arr, char_ids_arr]
    # presumably (num sentences, max num words, num tags) — confirm against the model
    tag_scores = model.predict(model_inputs, batch_size)
    return np.argmax(tag_scores, axis=-1)


def compute_metrics(labels_arr, labels_pred_arr, seq_lens_arr, vocab_tags):
    """Compute token accuracy and chunk-level F1 over a set of sentences.

    Precision, recall and the number of gold chunks are printed as a side
    effect; only accuracy and F1 are returned.

    Args:
      labels_arr: Gold tag ids, one padded row per sentence.
      labels_pred_arr: Predicted tag ids, one padded row per sentence.
        This argument is not modified.
      seq_lens_arr: One integer per sentence giving its true length; each
        row is cut to that length before scoring.
      vocab_tags: Tag vocabulary, passed through to ``get_chunks``.

    Returns:
      Dictionary with accuracy `acc` and F1 score `f1`, both as
      percentages. Both are 0 when there is nothing to score.
    """
    pad_tag = 9  # id this notebook uses when padding label sequences
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.

    for lab, lab_pred, seq_len in zip(labels_arr, labels_pred_arr, seq_lens_arr):
        # NOTE: labels & predictions are padded to the maximum number of words
        # in the batch, so the true sentence length selects the real entries.
        lab = lab[:seq_len]
        # A predicted padding id is scored as tag 0. A fresh list is built for
        # this: slicing a NumPy row gives a view, so assigning into the slice
        # would silently overwrite the caller's prediction array.
        lab_pred = [0 if tag == pad_tag else tag for tag in lab_pred[:seq_len]]

        accs.extend(a == b for a, b in zip(lab, lab_pred))

        lab_chunks = set(get_chunks(lab, vocab_tags))
        lab_pred_chunks = set(get_chunks(lab_pred, vocab_tags))

        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    p = correct_preds / total_preds if total_preds > 0 else 0
    r = correct_preds / total_correct if total_correct > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    # np.mean([]) would give nan plus a RuntimeWarning; report 0 like p, r and f1 do.
    acc = np.mean(accs) if accs else 0.0

    print ({"precision": p})
    print ({"recall": r})
    print ({"total_correct": total_correct})
    return {"acc": 100*acc, "f1": 100*f1}

In [None]:
# Evaluate on the training set: extract padded arrays, predict tag ids, then score.
# Note: this rebinds the notebook-level word_ids_arr / char_ids_arr / labels_arr names.
word_ids_arr, char_ids_arr, labels_arr, seq_lens_arr = extract_data(train) 
labels_pred_arr = predict_labels(model_softmax, word_ids_arr, char_ids_arr, seq_lens_arr)
metrics = compute_metrics(labels_arr, labels_pred_arr, seq_lens_arr, vocab_tags)
print(metrics)

In [None]:
##Evaluate Dev##
def extract_data(dataset):
    """Turn a whole dataset into padded NumPy arrays.

    The dataset is read as a single batch whose size equals ``len(dataset)``
    and every sequence is padded with the id 9.

    Args:
      dataset: A CoNLL dataset.

    Returns:
      A 4-tuple of NumPy arrays: word ids, char ids, labels, and the
      sentence lengths reported by ``pad_sequences`` for the word ids.
    """
    # Read from the `dataset` argument rather than the notebook-level `dev`
    # global, so the function returns data for whatever dataset it is given.
    words_dev, labels_dev = list(minibatches(dataset, len(dataset)))[0]
    # Each element of the batch is a (char ids, word ids) pair.
    char_ids_dev, word_ids_dev = zip(*words_dev)
    word_ids_dev, sequence_lengths_dev = pad_sequences(word_ids_dev, pad_tok=9)
    char_ids_dev, _ = pad_sequences(char_ids_dev, pad_tok=9, nlevels=2)
    labels_dev, _ = pad_sequences(labels_dev, pad_tok=9)

    word_ids_arr_dev = np.array(word_ids_dev)
    char_ids_arr_dev = np.array(char_ids_dev)
    labels_arr_dev = np.array(labels_dev)
    # TODO: add one-hot encoding of labels
    seq_lens_arr_dev = np.array(sequence_lengths_dev)
    return word_ids_arr_dev, char_ids_arr_dev, labels_arr_dev, seq_lens_arr_dev


def predict_labels(model, word_ids_arr_dev, char_ids_arr_dev, seq_lens_arr_dev, batch_size=32):
    """Return the highest-scoring tag id for every position of every sentence.

    Args:
      model: Object exposing ``predict(inputs, batch_size)``; it is called
        with the two-element list ``[word_ids_arr_dev, char_ids_arr_dev]``.
      word_ids_arr_dev: Word ids, forwarded to the model unchanged.
      char_ids_arr_dev: Char ids, forwarded to the model unchanged.
      seq_lens_arr_dev: Accepted for signature symmetry with the other
        evaluation helpers; not used by this function.
      batch_size: Forwarded positionally to ``model.predict``.

    Returns:
      The arg-max over the last axis of the model output.
    """
    model_inputs = [word_ids_arr_dev, char_ids_arr_dev]
    # presumably (num sentences, max num words, num tags) — confirm against the model
    tag_scores = model.predict(model_inputs, batch_size)
    return np.argmax(tag_scores, axis=-1)


def compute_metrics(labels_arr_dev, labels_pred_arr_dev, seq_lens_arr_dev, vocab_tags):
    """Compute token accuracy and chunk-level F1 over a set of sentences.

    Precision, recall and the number of gold chunks are printed as a side
    effect; only accuracy and F1 are returned.

    Args:
      labels_arr_dev: Gold tag ids, one padded row per sentence.
      labels_pred_arr_dev: Predicted tag ids, one padded row per sentence.
        This argument is not modified.
      seq_lens_arr_dev: One integer per sentence giving its true length;
        each row is cut to that length before scoring.
      vocab_tags: Tag vocabulary, passed through to ``get_chunks``.

    Returns:
      Dictionary with accuracy `acc` and F1 score `f1`, both as
      percentages. Both are 0 when there is nothing to score.
    """
    pad_tag = 9  # id this notebook uses when padding label sequences
    accs_dev = []
    correct_preds_dev, total_correct_dev, total_preds_dev = 0., 0., 0.

    for lab_dev, lab_pred_dev, seq_len_dev in zip(labels_arr_dev, labels_pred_arr_dev, seq_lens_arr_dev):
        # NOTE: labels & predictions are padded to the maximum number of words
        # in the batch, so the true sentence length selects the real entries.
        lab_dev = lab_dev[:seq_len_dev]
        # A predicted padding id is scored as tag 0. A fresh list is built for
        # this: slicing a NumPy row gives a view, so assigning into the slice
        # would silently overwrite the caller's prediction array.
        lab_pred_dev = [0 if tag == pad_tag else tag for tag in lab_pred_dev[:seq_len_dev]]

        accs_dev.extend(a == b for a, b in zip(lab_dev, lab_pred_dev))

        lab_chunks_dev = set(get_chunks(lab_dev, vocab_tags))
        lab_pred_chunks_dev = set(get_chunks(lab_pred_dev, vocab_tags))

        correct_preds_dev += len(lab_chunks_dev & lab_pred_chunks_dev)
        total_preds_dev += len(lab_pred_chunks_dev)
        total_correct_dev += len(lab_chunks_dev)

    p_dev = correct_preds_dev / total_preds_dev if total_preds_dev > 0 else 0
    r_dev = correct_preds_dev / total_correct_dev if total_correct_dev > 0 else 0
    f1_dev = 2 * p_dev * r_dev / (p_dev + r_dev) if correct_preds_dev > 0 else 0
    # np.mean([]) would give nan plus a RuntimeWarning; report 0 like p, r and f1 do.
    acc_dev = np.mean(accs_dev) if accs_dev else 0.0

    print ({"precision": p_dev})
    print ({"recall": r_dev})
    print ({"total_correct": total_correct_dev})
    return {"acc": 100*acc_dev, "f1": 100*f1_dev}

In [None]:
#dev
# Evaluate on the dev set: extract padded arrays, predict tag ids, then score.
word_ids_arr_dev, char_ids_arr_dev, labels_arr_dev, seq_lens_arr_dev = extract_data(dev) 
labels_pred_arr_dev = predict_labels(model_softmax, word_ids_arr_dev, char_ids_arr_dev, seq_lens_arr_dev)
metrics = compute_metrics(labels_arr_dev, labels_pred_arr_dev, seq_lens_arr_dev, vocab_tags)
print(metrics)

In [None]:
##Evaluate Test##
def extract_data(dataset):
    """Turn a whole dataset into padded NumPy arrays.

    The dataset is read as a single batch whose size equals ``len(dataset)``
    and every sequence is padded with the id 9.

    Args:
      dataset: A CoNLL dataset.

    Returns:
      A 4-tuple of NumPy arrays: word ids, char ids, labels, and the
      sentence lengths reported by ``pad_sequences`` for the word ids.
    """
    # Read from the `dataset` argument rather than the notebook-level `test`
    # global, so the function returns data for whatever dataset it is given.
    words_test, labels_test = list(minibatches(dataset, len(dataset)))[0]
    # Each element of the batch is a (char ids, word ids) pair.
    char_ids_test, word_ids_test = zip(*words_test)
    word_ids_test, sequence_lengths_test = pad_sequences(word_ids_test, pad_tok=9)
    char_ids_test, _ = pad_sequences(char_ids_test, pad_tok=9, nlevels=2)
    labels_test, _ = pad_sequences(labels_test, pad_tok=9)

    word_ids_arr_test = np.array(word_ids_test)
    char_ids_arr_test = np.array(char_ids_test)
    labels_arr_test = np.array(labels_test)
    # TODO: add one-hot encoding of labels
    seq_lens_arr_test = np.array(sequence_lengths_test)
    return word_ids_arr_test, char_ids_arr_test, labels_arr_test, seq_lens_arr_test


def predict_labels(model, word_ids_arr_test, char_ids_arr_test, seq_lens_arr_test, batch_size=32):
    """Return the highest-scoring tag id for every position of every sentence.

    Args:
      model: Object exposing ``predict(inputs, batch_size)``; it is called
        with the two-element list ``[word_ids_arr_test, char_ids_arr_test]``.
      word_ids_arr_test: Word ids, forwarded to the model unchanged.
      char_ids_arr_test: Char ids, forwarded to the model unchanged.
      seq_lens_arr_test: Accepted for signature symmetry with the other
        evaluation helpers; not used by this function.
      batch_size: Forwarded positionally to ``model.predict``.

    Returns:
      The arg-max over the last axis of the model output.
    """
    model_inputs = [word_ids_arr_test, char_ids_arr_test]
    # presumably (num sentences, max num words, num tags) — confirm against the model
    tag_scores = model.predict(model_inputs, batch_size)
    return np.argmax(tag_scores, axis=-1)


def compute_metrics(labels_arr_test, labels_pred_arr_test, seq_lens_arr_test, vocab_tags):
    """Compute token accuracy and chunk-level F1 over a set of sentences.

    Precision, recall and the number of gold chunks are printed as a side
    effect; only accuracy and F1 are returned.

    Args:
      labels_arr_test: Gold tag ids, one padded row per sentence.
      labels_pred_arr_test: Predicted tag ids, one padded row per sentence.
        This argument is not modified.
      seq_lens_arr_test: One integer per sentence giving its true length;
        each row is cut to that length before scoring.
      vocab_tags: Tag vocabulary, passed through to ``get_chunks``.

    Returns:
      Dictionary with accuracy `acc` and F1 score `f1`, both as
      percentages. Both are 0 when there is nothing to score.
    """
    pad_tag = 9  # id this notebook uses when padding label sequences
    accs_test = []
    correct_preds_test, total_correct_test, total_preds_test = 0., 0., 0.

    for lab_test, lab_pred_test, seq_len_test in zip(labels_arr_test, labels_pred_arr_test, seq_lens_arr_test):
        # NOTE: labels & predictions are padded to the maximum number of words
        # in the batch, so the true sentence length selects the real entries.
        lab_test = lab_test[:seq_len_test]
        # A predicted padding id is scored as tag 0. A fresh list is built for
        # this: slicing a NumPy row gives a view, so assigning into the slice
        # would silently overwrite the caller's prediction array.
        lab_pred_test = [0 if tag == pad_tag else tag for tag in lab_pred_test[:seq_len_test]]

        accs_test.extend(a == b for a, b in zip(lab_test, lab_pred_test))

        lab_chunks_test = set(get_chunks(lab_test, vocab_tags))
        lab_pred_chunks_test = set(get_chunks(lab_pred_test, vocab_tags))

        correct_preds_test += len(lab_chunks_test & lab_pred_chunks_test)
        total_preds_test += len(lab_pred_chunks_test)
        total_correct_test += len(lab_chunks_test)

    p_test = correct_preds_test / total_preds_test if total_preds_test > 0 else 0
    r_test = correct_preds_test / total_correct_test if total_correct_test > 0 else 0
    f1_test = 2 * p_test * r_test / (p_test + r_test) if correct_preds_test > 0 else 0
    # np.mean([]) would give nan plus a RuntimeWarning; report 0 like p, r and f1 do.
    acc_test = np.mean(accs_test) if accs_test else 0.0

    print ({"precision": p_test})
    print ({"recall": r_test})
    print ({"total_correct": total_correct_test})
    return {"acc": 100*acc_test, "f1": 100*f1_test}

In [None]:
#test
# Evaluate on the test set: extract padded arrays, predict tag ids, then score.
word_ids_arr_test, char_ids_arr_test, labels_arr_test, seq_lens_arr_test = extract_data(test) 
labels_pred_arr_test = predict_labels(model_softmax, word_ids_arr_test, char_ids_arr_test, seq_lens_arr_test)
metrics = compute_metrics(labels_arr_test, labels_pred_arr_test, seq_lens_arr_test, vocab_tags)
print(metrics)

---

## Experimental Section

In [None]:
### TODO: add an interactive shell.
### Experimental attempt at an interactive shell.
# Adapted from evaluate.py in GG's code:
def align_data(data):
    """Given dict with lists, creates column-aligned strings.

    Adapted from Assignment 3 of CS224N.

    Every token is left-justified in a column as wide as the widest token
    found at that position across all sequences, followed by one separating
    space (so each aligned string ends with at least one space).

    Args:
        data: (dict) data["x"] = ["I", "love", "you"]
              (dict) data["y"] = ["O", "O", "O"]
              The number of columns is taken from the first sequence.

    Returns:
        data_aligned: (dict) data_aligned["x"] = "I love you "
                             data_aligned["y"] = "O O    O   "
        An empty dict is returned for an empty input.
    """
    if not data:
        # No sequences means no columns to measure.
        return {}

    n_columns = len(next(iter(data.values())))
    spacings = [max(len(seq[i]) for seq in data.values())
                for i in range(n_columns)]

    # ljust(width + 1) pads to the column width plus the separating space;
    # join avoids repeated string concatenation.
    return {
        key: "".join(token.ljust(spacing + 1)
                     for token, spacing in zip(seq, spacings))
        for key, seq in data.items()
    }



def interactive_shell(model):
    """Run a read-predict-print loop on stdin until the user types 'exit'.

    Each input line is stripped, split on single spaces, and the resulting
    token list is handed to ``model.predict``; the tokens and the returned
    predictions are printed together as a dict.

    Args:
        model: instance of NERModel
    """
    prompt = "input> "

    while True:
        try:
            # Python 2 spelling; the name does not exist on Python 3.
            line = raw_input(prompt)
        except NameError:
            # Python 3 spelling.
            line = input(prompt)

        tokens = line.strip().split(" ")

        # A line consisting solely of the word 'exit' ends the session.
        if tokens == ["exit"]:
            return

        predictions = model.predict(tokens)
        # An aligned display via align_data was tried and is currently disabled;
        # the raw dict is printed instead.
        print({"input": tokens, "output": predictions})


In [None]:
# NOTE(review): interactive_shell documents an NERModel and passes raw word strings to
# model.predict, whereas model_softmax is the Keras model built above, whose inputs are
# [word ids, char ids] — confirm this call works as intended.
interactive_shell(model_softmax)