In [1]:
%matplotlib inline

"""
Import all the necessary packages
"""
from __future__ import division, print_function, absolute_import
import tensorflow as tf
import numpy as np
import functools
import random
import argparse
from multiprocessing import Pool, Process, cpu_count
import random
import pickle as pkl
import os
import codecs
from pprint import pprint
import sys
import re
from gensim.models import Word2Vec
from IPython import display
import pandas as pd 
import seaborn as sn
import matplotlib.pyplot as plt
import codecs 
from tensorflow.python.framework import ops
import tflearn
from tensorflow.python.ops.rnn_cell import MultiRNNCell, BasicLSTMCell, GRUCell
from tensorflow.python.ops import rnn
import numpy as np
from numpy import float32
import os
import gc
from tensorflow.python.framework.ops import reset_default_graph



In [2]:
def analyse_training_file(training_file):
    """
    Analyse the amount of features in the training dataset. The data should
    look like the following:

    Mikkel feat1 feat2 ... B-PERS
    Vilstrup feat1 feat2 ... I-PERS
    is feat1 feat2 ... O
    the feat1 feat2 ... O
    author feat1 feat2 ... O
    ...

    where feat1 feat2 ... are features related to the word on the left
    It is assumed that the features never change position, and that each feature
    is seperated by a space (" ").
    """
    lines = codecs.open(training_file, encoding="utf-8").readlines()
    lines = [l for l in lines if "-DOCSTART-" not in l]

    num_features = 0
    for line in lines:
        num_features = max(num_features, len(line.split()) - 1) # find the maximum amount of features

    features = [{} for i in range(num_features)] # create a dictionary for each feature

    max_length = 0
    current_length = 0
    # Iterate through all the lines to get the categories of each feature
    for line in lines:
        if line in ['\n', '\r\n']:
            # this is the end of a sentence.
            max_length = min(MAX_SENTENCE_LENGTH, max(max_length, current_length))
            current_length = 0
            continue
        else:
            current_length +=1
            words = line.split()[1:] # discard the word on the left

            for index, word in enumerate(words):
                if not re.match("[A-Z]+", word):
                    continue
                if word not in features[index]:
                    features[index][word] = True

    max_categories = 0
    for keys in [f.keys() for f in features]:
        max_categories = max(max_categories, len(keys))

    features = [f.keys() for f in features]
    
    word_dim = WORD_DIM + 4  # We add the capital features initially
    for feature in features[:-1]:
        word_dim += len(feature)
    
    targets = len(features[-1])
    
    return num_features, max_categories, max_length, word_dim, targets, features

In [3]:
"""
Define All data parameters
"""

WORD_DIM = 300
MAX_SENTENCE_LENGTH = 30

TARGET_LANGUAGE="eng" # The language one wishes to train for
DATA_DIR="data" # path to coNLL data set


TRAINING_FILE="{}/{}.train".format(DATA_DIR, TARGET_LANGUAGE)
DEV_FILE="{}/{}.testa".format(DATA_DIR, TARGET_LANGUAGE)
VALIDATION_FILE="{}/{}.testb".format(DATA_DIR, TARGET_LANGUAGE)

EMBEDDING_DIR="embeddings" # path to the location of the word embeddings
EMBEDDING_FILE="{}/{}.bin".format(EMBEDDING_DIR, TARGET_LANGUAGE)

FEATURE_AMOUNT, \
MAX_FEATURE_CATEGORIES, \
MAX_SENTENCE_LENGTH, \
EMBEDDING_LENGTH, \
NUM_TARGETS, \
FEATURES = analyse_training_file(TRAINING_FILE)

"""
Sort the targets to ensure that the predictions vs. the targets is in order and can be reasoned about
"""
for feature in FEATURES:
    feature.sort()

print("""
Word Dimension: {word_dim}            The size of the original word embeddings
MAX_SENTENCE_LENGTH: {sent_len}        The maximum length a sentence is allowed to have
FEATURE_AMOUNT: {feat_am}              The amount of imput features including the word itself
MAX_FEATURE_CATEGORIES: {max_feat_cat}     The size of the max amount of categories within one feature (words not included)
INPUT EMBEDDING LENGTH: {emb_len}    The final size of the embedding (one-hot feature embbedings + word embedding)
NUM_TARGETS: {num_tar}                 The number of possible targets that the model should predict
""".format(word_dim=WORD_DIM, 
           sent_len=MAX_SENTENCE_LENGTH, 
           feat_am=FEATURE_AMOUNT, 
           max_feat_cat=MAX_FEATURE_CATEGORIES,
           emb_len=EMBEDDING_LENGTH,
           num_tar=NUM_TARGETS))

print("\nAvailable Extra Word Features")
for feature in FEATURES[:-1]:
    print("{}\n".format(feature))

    
"""
Ensure that O (The Non Named Entity) is the first element in the feature list
"""
o_index = FEATURES[-1].index('O')
first_element = FEATURES[-1][0]
FEATURES[-1][0] = FEATURES[-1][o_index]
FEATURES[-1][o_index] = first_element


print("\nAvailable Targets")
print(FEATURES[-1], "\n")

"""
Create a dictionary to be able to convert the onehot vectors back to their original label
"""
TARGET_LOCATION = {}
LOCATION_TARGETS = {}
for target in FEATURES[-1]:
    onehot = np.zeros(len(FEATURES[-1]))
    onehot[FEATURES[-1].index(target)] = 1
    LOCATION_TARGETS[np.argmax(onehot)] = target
    TARGET_LOCATION[target] = np.argmax(onehot)


print("Location -> Target")
pprint(TARGET_LOCATION)

print()

print("Target -> Location")
pprint(LOCATION_TARGETS)


Word Dimension: 300            The size of the original word embeddings
MAX_SENTENCE_LENGTH: 30        The maximum length a sentence is allowed to have
FEATURE_AMOUNT: 3              The amount of imput features including the word itself
MAX_FEATURE_CATEGORIES: 37     The size of the max amount of categories within one feature (words not included)
INPUT EMBEDDING LENGTH: 358    The final size of the embedding (one-hot feature embbedings + word embedding)
NUM_TARGETS: 8                 The number of possible targets that the model should predict


Available Extra Word Features
[u'CC', u'CD', u'DT', u'EX', u'FW', u'IN', u'JJ', u'JJR', u'JJS', u'LS', u'MD', u'NN', u'NNP', u'NNPS', u'NNS', u'NN|SYM', u'PDT', u'POS', u'PRP', u'PRP$', u'RB', u'RBR', u'RBS', u'RP', u'SYM', u'TO', u'UH', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ', u'WDT', u'WP', u'WP$', u'WRB']

[u'B-ADJP', u'B-ADVP', u'B-NP', u'B-PP', u'B-SBAR', u'B-VP', u'I-ADJP', u'I-ADVP', u'I-CONJP', u'I-INTJ', u'I-LST', u'I-NP', u'I-

In [4]:
"""
Create word_vectors 
"""

print("Creating the vocabulary from the files")
vocabulary = {}
for _file in [TRAINING_FILE, DEV_FILE, VALIDATION_FILE]:
    for line in codecs.open(_file, "r", encoding="utf-8").readlines():
        if line in ['\n', '\r\n']:
            continue
        word = line.split()[0].strip()
        if not word in vocabulary:
            vocabulary[word] = None

print("Loading in the pretrained language model")
try:
    word_embeddings_file = Word2Vec.load_word2vec_format(EMBEDDING_FILE, binary=True)  # C binary format
except:
    print("\nThere was an error loading the pretrained vectors. All vectors will be random")
    print("To use pretrained word vectors please provide a vector file in location: {}\n".format(EMBEDDING_FILE))
    word_embeddings_file = None
    
print("Generating wordvectors for the entire vocabulary")
pretrained = 0
random_amount = 0
word_vectors = dict()
for word in vocabulary.keys():
    if word not in word_vectors:
        try:
            word_vectors[word] = word_embeddings_file[word]  # raw numpy vector of a word given it exists in the model
            pretrained += 1
        except:
            word_vectors[word] = np.random.uniform(-0.25,0.25,WORD_DIM) # Random numpy vector
            random_amount += 1

print ("""
Done!

Vocabulary Size: {vocab}
Word Vectors: {words}
Pretrained Vectors: {pre_vec} ({pre_perc}%)
Random Vectors: {ran_vec} ({ran_perc}%)

""".format(vocab=len(vocabulary),
           words=len(word_vectors),
           pre_vec=pretrained,
           pre_perc=float(pretrained) / len(word_vectors) * 100,
           ran_vec=random_amount,
           ran_perc=float(random_amount) / len(word_vectors) * 100
          ))

Creating the vocabulary from the files
Loading in the pretrained language model
Generating wordvectors for the entire vocabulary

Done!

Vocabulary Size: 30290
Word Vectors: 30290
Pretrained Vectors: 21917 (72.3572136018%)
Random Vectors: 8373 (27.6427863982%)




In [5]:
"""
Define the methods used to convert the other features to vectors
"""
def get_feature_vector(category, feature_index):
    onehot = np.zeros(len(FEATURES[feature_index]))
    # Assign one element in the vector to one, corresponding to the index
    # of the category in features
    try:
        onehot[FEATURES[feature_index].index(category)] = 1
    except:
        pass
    return onehot

def cap_feature(word):
    """
    Capitalization feature:
    0 = low caps
    1 = all caps
    2 = first letter caps
    3 = one capital (not first letter)
    """
    if word.lower() == word:
        return np.array([1, 0 ,0, 0])
    elif word.upper() == word:
        return np.array([0, 1 ,0, 0])
    elif word[0].upper() == word[0]:
        return np.array([0, 0 ,1, 0])
    else:
        return np.array([0, 0 ,0, 1])




In [6]:
def create_input_vectors(file_name):
    words = []
    features = []
    sentences = []
    sentence_features = []
    sentence_length = MAX_SENTENCE_LENGTH
    current_sentence_length = 0

    lines = codecs.open(file_name, encoding="utf-8").readlines()
    lines = [l for l in lines if "-DOCSTART-" not in l]

    for line in lines:
        if line in ['\n', '\r\n']:
            # end of line. Make sure all sentences are of equal length
            for _ in range(sentence_length - current_sentence_length):
                words.append(np.zeros(EMBEDDING_LENGTH))
                features.append(np.zeros(len(FEATURES[-1])))

            # Add current sentence words to sentences and refresh the lists
            sentences.append(words)
            sentence_features.append(features)
            words = []
            features = []
            current_sentence_length = 0
        else:
            # Make sure all lines have the right amount of features
            assert(len(line.split()) == FEATURE_AMOUNT + 1)

            # make sure no sentence is longer than max_sentence_length
            if current_sentence_length == sentence_length:
                sentences.append(words)
                sentence_features.append(features)
                words = []
                features = []
                current_sentence_length = 0

            # get the vector of the word in first position of each line
            word_and_features = line.split()
            temp = []
            temp = np.append(temp, word_vectors[word_and_features[0]])

            # get the feature vector for each feature of the word
            for index, feature in enumerate(word_and_features[1:-1]):
                temp = np.append(temp, get_feature_vector(feature, index))
            """
            Below are some additional features
            """

            temp = np.append(temp, cap_feature(word_and_features[0]))
            words.append(temp)

            # Add the tag to the tag list
            features.append(get_feature_vector(word_and_features[-1], len(FEATURES) - 1))


            current_sentence_length += 1


    # Check there are features for each sentence
    assert(len(sentences) == len(sentence_features))
    return np.asarray(sentences), np.array(sentence_features)

print("Creating input and target vectors for the dataset")
pool = Pool(processes=3)
train_process = pool.apply_async(create_input_vectors, args=(TRAINING_FILE,))
dev_process = pool.apply_async(create_input_vectors, args=(DEV_FILE,))
val_process = pool.apply_async(create_input_vectors, args=(VALIDATION_FILE,))

train_input, train_target = train_process.get()
dev_input, dev_target = dev_process.get()
validation_input, validation_target = val_process.get()


for name, data_file, target_file in [("Train", train_input, train_target), 
                                     ("Development", dev_input, dev_target),
                                     ("Validation", validation_input, validation_target)]:
    
    print("""
    {name} Data

    Train Sentences: {input_amount}
    Sentence Length: {input_len}
    Embedding Length: {emb_size}
    Input Shape: ({input_len}, {emb_size})

    Number of Targets: {target_amount}
    Target Sentences: {target_len}
    Target Length: {target_size}
    Target Shape: ({target_len}, {target_size})
    """.format(name=name,
               input_amount=len(data_file),
               input_len=len(data_file[0]),
               emb_size=len(data_file[1][2]),
               target_amount=len(target_file),
               target_len=len(target_file[0]),
               target_size=len(target_file[0][0])))

Creating input and target vectors for the dataset

    Train Data

    Train Sentences: 16779
    Sentence Length: 30
    Embedding Length: 358
    Input Shape: (30, 358)

    Number of Targets: 16779
    Target Sentences: 30
    Target Length: 8
    Target Shape: (30, 8)
    

    Development Data

    Train Sentences: 3972
    Sentence Length: 30
    Embedding Length: 358
    Input Shape: (30, 358)

    Number of Targets: 3972
    Target Sentences: 30
    Target Length: 8
    Target Shape: (30, 8)
    

    Validation Data

    Train Sentences: 4072
    Sentence Length: 30
    Embedding Length: 358
    Input Shape: (30, 358)

    Number of Targets: 4072
    Target Sentences: 30
    Target Length: 8
    Target Shape: (30, 8)
    


In [7]:
print(train_target[4])

[[ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0

In [1]:
"""
Create functions used to keep track of the training and get insight into how the performance is developing
"""

                              
def generate_confusion_matrix(prediction, target, epoch):
    def generate_prediction_matrix(predictions, targets):
        matrix = np.zeros((len(TARGET_VECTORS), len(TARGET_VECTORS)), dtype=np.float32)
        order = {}

        target_names = TARGET_VECTORS.values()
        target_names.sort()
        for sent_index, sentence in enumerate(predictions):
            for index, pred in enumerate(sentence):
                try:
                    predicted_target = TARGET_VECTORS[str(pred)]
                    true_target = TARGET_VECTORS[str(targets[sent_index][index])]

                    row = target_names.index(predicted_target)
                    col = target_names.index(true_target)
                    matrix[row][col] = 1
                except:
                    pass
        
        target_names_with_count = []
        for i in range(len(matrix)):
            row_sum = sum(matrix[i])
            target_names_with_count.append("{} ({})".format(target_names[i], int(row_sum)))
            for j in range(len(matrix[i])):
                matrix[i][j] = matrix[i][j] / row_sum if row_sum != 0 else 0
            
        return matrix, target_names_with_count, target_names
    
    matrix, rows, columns = generate_prediction_matrix(prediction, target)
    df_cm = pd.DataFrame(matrix, 
                         index = [i for i in rows],
                         columns = [i for i in columns])

    title = "Epoch: {}".format(epoch)
    plt.figure(figsize = (10,6))
    plt.suptitle(title, fontsize=14, fontweight='bold')
    confusion = sn.heatmap(df_cm, annot=True)
    display.display(plt.gcf())
    display.clear_output(wait=True)
    plt.savefig("confusion_matrix.png")    

def cost(prediction, target):
    target = tf.reshape(target, [-1, MAX_SENTENCE_LENGTH, NUM_TARGETS])
    prediction = tf.reshape(prediction, [-1, MAX_SENTENCE_LENGTH, NUM_TARGETS])
    
    cross_entropy = target * tf.log(prediction)
    cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
    
    mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2))
    
    cross_entropy *= mask
    cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
    cross_entropy /= tf.cast(length(target), tf.float32)
    return tf.reduce_mean(cross_entropy)

def length(target):
    used = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def f1(prediction,target): # not tensors but result values
    # Reshape the big arrays into smaller sizes with shape (MAX_SENTENCE_LENGTH, NUM_TARGETS)
    target = np.reshape(target, (-1, MAX_SENTENCE_LENGTH, NUM_TARGETS))
    prediction = np.reshape(prediction, (-1, MAX_SENTENCE_LENGTH, NUM_TARGETS))
    
   
    
    true_positive=np.asarray([0]*(NUM_TARGETS+2))
    false_positive=np.asarray([0]*(NUM_TARGETS+2))
    false_negative=np.asarray([0]*(NUM_TARGETS+2))

    target = np.argmax(target, 2)
    prediction = np.argmax(prediction, 2)


    for i in range(len(target)):
        for j in range(MAX_SENTENCE_LENGTH):
            if target[i][j] == prediction[i][j]:
                true_positive[target[i][j]] += 1
            else:
                false_positive[target[i][j]] += 1
                false_negative[prediction[i][j]] += 1

    NON_NAMED_ENTITY = TARGET_LOCATION['O']
    for i in range(NUM_TARGETS):
        if i != NON_NAMED_ENTITY:
            true_positive[-2] += true_positive[i]
            false_positive[-2] += false_positive[i]
            false_negative[-2] += false_negative[i]
        else:
            true_positive[-1] += true_positive[i]
            false_positive[-1] += false_positive[i]
            false_negative[-1] += false_negative[i]

    precision = []
    recall = []
    fscore = []
    for i in range(NUM_TARGETS+2):
        division_point = true_positive[i]+false_positive[i]
        if true_positive[i]+false_positive[i] != 0:
            precision.append(true_positive[i]*1.0/true_positive[i]+false_positive[i])
        else:
            precision.append(0.0)
           
        
        if true_positive[i]+false_negative[i] != 0:
            recall.append(true_positive[i]*1.0/true_positive[i]+false_negative[i])
        else:
            recall.append(0.0)
            
        if precision[i]+recall[i] != 0:
            fscore.append(2.0*precision[i]*recall[i]/(precision[i]+recall[i]))
        else:
            fscore.append(0.0)

    print("precision = " ,precision)
    print("recall = " ,recall)
    print("f1score = " ,fscore)
    efs = fscore[-2]
    print("Entity fscore :", efs )
    return efs

In [9]:
"""
Define Model Parameters
"""
# resetting the graph
reset_default_graph()

NUM_HIDDEN = 100
NUM_LAYERS = 3
DROPOUT = 0.5
LEARNING_RATE = 0.003

print("Network building")

net = tflearn.input_data([None, MAX_SENTENCE_LENGTH, EMBEDDING_LENGTH])
net = rnn.bidirectional_rnn(MultiRNNCell([GRUCell(NUM_HIDDEN)]*NUM_LAYERS), MultiRNNCell([GRUCell(NUM_HIDDEN)]*NUM_LAYERS), tf.unpack(tf.transpose(net, perm=[1, 0, 2])), dtype=tf.float32)  #256=num_hidden, 3=num_layers
net = tflearn.dropout(net[0], DROPOUT)
net = tf.transpose(tf.pack(net), perm=[1, 0, 2])

net = tflearn.fully_connected(net, MAX_SENTENCE_LENGTH*NUM_TARGETS, activation='softmax')
net = tflearn.regression(net, optimizer='adam',loss=cost)

model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
print("Network Built")

Network building
Network Built


In [10]:
train_target = np.asarray(train_target).astype(int).reshape(len(train_target),-1)
print(train_target.shape)

dev_target = np.asarray(dev_target).astype(int).reshape(len(dev_target),-1)
print(dev_target.shape)


validation_target = np.asarray(validation_target).astype(int).reshape(len(validation_target),-1)
print(validation_target.shape)

(16779, 240)
(3972, 240)
(4072, 240)


In [12]:
"""
Define Training Parameters
"""
BATCH_SIZE = 256
BATCH_AMOUNT = (len(train_input) + BATCH_SIZE - 1) / BATCH_SIZE
NUM_EPOCH = 200

print("Batches pr. Epoch: {}\n".format(BATCH_AMOUNT))

max_entity = 0
max_batch = 0
batch = 0
reached_max = 0

while True:
    with ops.get_default_graph().as_default():
        model.fit(train_input, train_target,n_epoch=1, validation_set=(dev_input,dev_target), show_metric=False, batch_size=BATCH_SIZE)
        batch += 75
        val_pred =np.asarray(model.predict(validation_input))
        entity_f1 = f1(val_pred, validation_target)
        former_max = max_entity

        if entity_f1 >= max_entity:
            max_entity = entity_f1
            max_batch = batch
            reached_max = 0

        if entity_f1 - former_max < 1:
            reached_max += 1

        print("max entity f1: {}, batch nr: {}, early stop counter: {}".format(max_entity, max_batch, reached_max))
        if reached_max == 10:
            break



Training Step: 505  | total loss: [1m[32mnan[0m[0m
[2K| Adam | epoch: 000 | loss: nan -- iter: 11008/16779


KeyboardInterrupt: 