In [None]:
import numpy as np
import pandas as pd
import json
from future.utils import iteritems
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.layers.merge import add
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import keras
from seqeval.metrics import f1_score, classification_report,accuracy_score
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from keras.models import model_from_json, model_from_yaml
from keras.models import load_model


### Helper functions

In [0]:
class SentenceGetterCoNLL(object):
    """Group a token-per-row CoNLL DataFrame into sentences.

    The frame must provide the columns ``word``, ``pos``, ``tag`` and
    ``sentence_idx``. Each sentence is exposed as a list of
    ``(word, pos, tag)`` tuples.
    """

    def __init__(self, data):
        """Build the per-sentence view of ``data``.

        Args:
            data: pandas DataFrame with one row per token.
        """
        self.n_sent = 1  # 1-based cursor consumed by get_next()
        self.data = data
        self.empty = False
        # One entry per distinct ``sentence_idx`` value, in groupby order.
        self.grouped = self.data.groupby("sentence_idx").apply(self._to_triples)
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as (word, pos, tag) tuples."""
        return list(zip(group["word"].values.tolist(),
                        group["pos"].values.tolist(),
                        group["tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or ``None`` when the lookup fails.

        Sentences are looked up under the label ``"Sentence: <n>"``.
        NOTE(review): this only finds something if ``sentence_idx`` holds
        labels of that form - confirm against the parsed CSV files.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            sentence = self.grouped[key]
        except LookupError:
            # Missing label (KeyError/IndexError): nothing more to hand out.
            # Other errors are no longer silently swallowed.
            return None
        self.n_sent += 1
        return sentence

class SentenceGetterGMB(object):
    """Group a token-per-row GMB DataFrame into sentences.

    The frame must provide the columns ``Word``, ``POS``, ``Tag`` and
    ``Sentence #``. Each sentence is exposed as a list of
    ``(word, pos, tag)`` tuples.
    """

    def __init__(self, data):
        """Build the per-sentence view of ``data``.

        Args:
            data: pandas DataFrame with one row per token.
        """
        self.n_sent = 1  # 1-based cursor consumed by get_next()
        self.data = data
        self.empty = False
        # One entry per distinct ``Sentence #`` value, in groupby order.
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as (word, pos, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or ``None`` when the lookup fails.

        Sentences are looked up under the label ``"Sentence: <n>"`` and the
        cursor only advances after a successful lookup.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            sentence = self.grouped[key]
        except LookupError:
            # Missing label (KeyError/IndexError): nothing more to hand out.
            # Other errors are no longer silently swallowed.
            return None
        self.n_sent += 1
        return sentence

def read_gmb_data():
    """Load the GMB corpus from ``ner_dataset.csv`` as padded token/tag arrays.

    Returns:
        tuple ``(X, y, tags, tag2idx, n_tags, max_len)`` where
        ``X`` is a string array of shape (n_sentences, max_len) padded with
        the literal token ``"PAD"``, ``y`` is the matching array of tag
        indices padded with the index of ``"O"``, ``tags`` is the list of
        distinct tags, ``tag2idx`` maps tag -> index, ``n_tags`` is
        ``len(tags)`` and ``max_len`` is the fixed sequence length.
    """
    data = pd.read_csv("ner_dataset.csv", encoding="latin1")
    # Forward-fill gaps; presumably the sentence id is only present on the
    # first token row of each sentence - TODO confirm against the CSV.
    data = data.ffill()

    # NOTE: the order comes from a set, so the tag -> index assignment can
    # differ between interpreter runs.
    tags = list(set(data["Tag"].values))
    n_tags = len(tags)
    tag2idx = {t: idx for idx, t in enumerate(tags)}

    sentences = SentenceGetterGMB(data).sentences
    # GMB MAX LEN
    max_len = 81

    # Keep the first max_len tokens of each sentence and right-pad with "PAD".
    X = []
    for sent in sentences:
        tokens = [tok[0] for tok in sent][:max_len]
        X.append(tokens + ["PAD"] * (max_len - len(tokens)))

    y = [[tag2idx[tok[2]] for tok in sent] for sent in sentences]
    # truncating="post" keeps the labels aligned with the tokens above; the
    # Keras default ("pre") would drop labels from the front instead.
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      truncating="post", value=tag2idx["O"])

    return np.array(X), np.array(y), tags, tag2idx, n_tags, max_len



def read_conll_data(data_type = "train", tags_input = None, tag2idx_input = None):
    """Load a parsed CoNLL split as padded token/tag arrays.

    Args:
        data_type: split name; the file ``parsed_<data_type>.csv`` is read
            (``"train"`` -> ``parsed_train.csv``, as before).
        tags_input: tag list to reuse; required when ``data_type`` is not
            ``"train"``.
        tag2idx_input: tag -> index mapping to reuse; required when
            ``data_type`` is not ``"train"``.

    Returns:
        tuple ``(X, y, tags, tag2idx, n_tags, max_len)`` where ``X`` is a
        string array of shape (n_sentences, max_len) padded with ``"PAD"``
        and ``y`` holds the tag indices padded with the index of ``"O"``.

    Raises:
        ValueError: a non-training split was requested without the tag
            vocabulary of the training split.
    """
    is_train = data_type == "train"
    if not is_train and (tags_input is None or tag2idx_input is None):
        raise ValueError(
            "tags_input and tag2idx_input are required when data_type != 'train'")

    # Previously only "train" loaded a file, so any other value crashed on an
    # unbound variable.
    data = pd.read_csv("parsed_{}.csv".format(data_type))
    data = data.ffill()

    if is_train:
        # NOTE: the order comes from a set, so the tag -> index assignment
        # can differ between interpreter runs.
        tags = list(set(data["tag"].values))
        tag2idx = {t: idx for idx, t in enumerate(tags)}
    else:
        tags = tags_input
        tag2idx = tag2idx_input
    n_tags = len(tags)

    sentences = SentenceGetterCoNLL(data).sentences
    # CONLL MAX LEN
    max_len = 140

    # Keep the first max_len tokens of each sentence and right-pad with "PAD".
    X = []
    for sent in sentences:
        tokens = [tok[0] for tok in sent][:max_len]
        X.append(tokens + ["PAD"] * (max_len - len(tokens)))

    y = [[tag2idx[tok[2]] for tok in sent] for sent in sentences]
    # truncating="post" keeps the labels aligned with the tokens above; the
    # Keras default ("pre") would drop labels from the front instead.
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      truncating="post", value=tag2idx["O"])

    return np.array(X), np.array(y), tags, tag2idx, n_tags, max_len

def read_conll_data_test(tags, tag2idx):
    """Load ``parsed_testa.csv`` as padded token/tag arrays.

    Args:
        tags: tag list produced for the training split (accepted for
            symmetry with the training reader; only ``tag2idx`` is used to
            encode labels).
        tag2idx: tag -> index mapping produced for the training split.

    Returns:
        tuple ``(X, y)`` where ``X`` is a string array of shape
        (n_sentences, 140) padded with ``"PAD"`` and ``y`` holds the tag
        indices padded with the index of ``"O"``.
    """
    data = pd.read_csv("parsed_testa.csv")
    data = data.ffill()

    sentences = SentenceGetterCoNLL(data).sentences
    # CONLL MAX LEN
    max_len = 140

    # Keep the first max_len tokens of each sentence and right-pad with "PAD".
    X = []
    for sent in sentences:
        tokens = [tok[0] for tok in sent][:max_len]
        X.append(tokens + ["PAD"] * (max_len - len(tokens)))

    y = [[tag2idx[tok[2]] for tok in sent] for sent in sentences]
    # truncating="post" keeps the labels aligned with the tokens above; the
    # Keras default ("pre") would drop labels from the front instead.
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      truncating="post", value=tag2idx["O"])

    return np.array(X), np.array(y)


### Model preparation

In [None]:
def ElmoEmbedding(x):
    """Map a batch of token strings to ELMo vectors (the module's "elmo" output).

    Reads the notebook-level globals ``batch_size`` and ``max_len``: every
    row is declared to be ``max_len`` tokens long and exactly ``batch_size``
    lengths are supplied, so callers must feed full batches of padded rows.
    """
    elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
    token_strings = tf.squeeze(tf.cast(x, tf.string))
    # One length per row of the batch, all equal to the padded length.
    token_counts = tf.constant(batch_size*[max_len])
    elmo_outputs = elmo_model(
        inputs={"tokens": token_strings, "sequence_len": token_counts},
        signature="tokens",
        as_dict=True,
    )
    return elmo_outputs["elmo"]

def prepare_model(max_len, n_tags, lstm_units = 512, dropout=0.2, recurrent_dropout=0.2):
    """Build and compile the ELMo + residual BiLSTM tagger.

    Args:
        max_len: number of tokens per (padded) input sentence.
        n_tags: number of output classes per token.
        lstm_units: units per direction in each LSTM layer.
        dropout: input dropout applied in both LSTM layers.
        recurrent_dropout: recurrent dropout applied in both LSTM layers.

    Returns:
        A compiled Keras ``Model`` taking string inputs of shape
        (batch, max_len) and emitting a softmax over ``n_tags`` per token.
    """
    input_layer = Input(shape=(max_len,), dtype=tf.string)

    embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_layer)

    # Both recurrent layers now honour the dropout arguments; the first one
    # used to hard-code 0.2 regardless of what the caller passed.
    bi_dir = Bidirectional(LSTM(units=lstm_units, return_sequences=True,
                                recurrent_dropout=recurrent_dropout, dropout=dropout))(embedding)

    bi_dir_2 = Bidirectional(LSTM(units=lstm_units, return_sequences=True,
                                  recurrent_dropout=recurrent_dropout, dropout=dropout))(bi_dir)

    # Residual connection around the second BiLSTM (the original referenced
    # an undefined name here).
    x = add([bi_dir, bi_dir_2])
    output_layer = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

    model = Model(input_layer, output_layer)

    # Sparse loss: targets are integer tag ids, not one-hot vectors.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    return model

# Notebook-level ELMo handle. ElmoEmbedding builds its own hub.Module and does
# not read this name; it is rebound again in the session-setup cell below.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

### Execute one of the two following cells to read CONLL or GMB data

In [None]:
# READ CONLL DATA
# The training split also yields the tag list and tag -> index map.
X_tr, y_tr, tags, tag2idx, n_tags, max_len = read_conll_data(data_type = "train")
# The test split is encoded with the training tag -> index map so label ids agree.
X_te, y_te  = read_conll_data_test(tags, tag2idx)

In [0]:
# READ GMB DATA
# X_tr, y_tr, tags, tag2idx, n_tags, max_len = read_gmb_data()
# X_tr, X_te, y_tr, y_te = train_test_split(X_tr, y_tr, test_size=0.1, random_state=2018)

### Load the ELMo embeddings from TensorFlow Hub.

In [249]:
# Create a TF1 session and make it the one Keras uses.
sess = tf.Session()
K.set_session(sess)

# batch_size is read as a global by ElmoEmbedding, so it must be set before
# the model is built.
batch_size = 32
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Initialise the variables and lookup tables that exist in the graph so far.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


### Execute one of the two following cells to build the train/validation split, with sizes that are multiples of the batch size.

In [None]:
# GMB --> THIS WORKS OK

# X_tr, X_val = X_tr[:1213*batch_size], X_tr[-135*batch_size:]
# y_tr, y_val = y_tr[:1213*batch_size], y_tr[-135*batch_size:]
# y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
# y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

In [255]:
# CONLL ---> THIS WORKS OK
# Keep 14944 sentences (= 467 * 32) so the data splits into whole batches.
# NOTE: this cell rebinds X_tr / y_tr, so re-running it without re-reading
# the data splits the already-reduced training set again.
X_tr = X_tr[:14944]
y_tr = y_tr[:14944]
# The fraction is 2144 / 14944: 12800 training rows (400 batches of 32) and
# 2144 validation rows (67 batches of 32).
X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.14346895074, random_state=2018)
# Add a trailing axis of size 1 to the integer labels.
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)


((12800, 140), (12800, 140, 1), (2144, 140), (2144, 140, 1))

In [256]:
# Build the tagger; batch_size and max_len must already exist as globals
# because ElmoEmbedding reads them while the graph is constructed.
model = prepare_model(max_len, n_tags, lstm_units = 512, dropout = 0.2)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [252]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 140)          0                                            
__________________________________________________________________________________________________
lambda_12 (Lambda)              (None, None, 1024)   0           input_12[0][0]                   
__________________________________________________________________________________________________
bidirectional_20 (Bidirectional (None, None, 1024)   6295552     lambda_12[0][0]                  
__________________________________________________________________________________________________
bidirectional_21 (Bidirectional (None, None, 1024)   6295552     bidirectional_20[0][0]           
___________________________________________________________________________________________

In [257]:
# Train for 5 epochs. The train and validation sets were sized as multiples
# of batch_size because ElmoEmbedding assumes full batches.
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),
                    batch_size=batch_size, epochs=5, verbose=1)


Train on 12800 samples, validate on 2144 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Execute one of the following cells to test the model on GMB or CONLL dataset

In [265]:
# TEST CONLL DATASET
# 3456 = 108 * 32, so only whole batches of the test set are predicted.
y_pred = model.predict(np.array(X_te[:3456]), verbose=1)   



In [0]:
# CONLL cont.
# Collapse the per-token class scores to the most likely tag index.
pred_converted = np.argmax(y_pred, axis=-1)
y_te_converted = np.array(y_te[:3456], dtype = np.int64)

# Report on every label except "O". Its index is looked up rather than
# hard-coded, because tag2idx is built from a set and the position of "O"
# can change between runs.
outside_idx = tag2idx["O"]
tag_eval = [idx for idx in tag2idx.values() if idx != outside_idx]

In [0]:
# TEST GMB DATASET
# y_pred = model.predict(np.array(X_te[:4768]), verbose=1) 
# pred_converted = y_pred
# pred_converted = np.argmax(pred_converted, axis=-1)
# y_te_converted = np.array(y_te[:4768], dtype = np.int64)


# tag_eval = tag2idx.copy()
# tag_eval.pop('O')
# tag_eval = tag_eval.values()

### Evaluate the results with F1-score. Print the classification report

In [369]:
# EVAL
from sklearn_crfsuite.metrics import flat_classification_report
# labels=tag_eval restricts the report to the label ids selected earlier.
report = flat_classification_report(y_pred=pred_converted, y_true=y_te_converted, labels = tag_eval)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      0.93      0.95      2093
           1       0.98      0.98      0.98      3145
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.88      0.95      0.91      2078
           6       0.90      0.90      0.90      1263
           7       0.00      0.00      0.00         4

   micro avg       0.94      0.95      0.94      8583
   macro avg       0.53      0.54      0.53      8583
weighted avg       0.94      0.95      0.94      8583



### Save the model and the weights

In [305]:
# Serialize the architecture to YAML; the weights are stored separately below.
model_yaml = model.to_yaml()
with open("elmo_conll_1.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)

# serialize weights to HDF5
model.save_weights("elmo_conll_1.h5")
print("Saved model w/ ELMO to disk!")

Saved model w/ ELMO to disk!


In [368]:
# TRY TO LOAD THE MODEL

# Read the architecture; the context manager closes the file even if reading fails.
with open('elmo_conll_1.yaml', 'r') as yaml_file:
    loaded_model_yaml = yaml_file.read()
# The Lambda layer's function refers to these names, so they are supplied explicitly.
loaded_model = model_from_yaml(loaded_model_yaml, custom_objects={"elmo_model": elmo_model, "tf": tf, "hub":hub, "batch_size": batch_size, "max_len": max_len})
# load weights into new model
loaded_model.load_weights("elmo_conll_1.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

sent = "As Harry rides the Hogwarts Express on his journey back from school after his fourth year and the dire Triwizard Tournament, he dreads disembarking from the train"

# The input must match the padded length the model was built with, so reuse
# max_len instead of repeating the literal 140, and cut over-long input.
len_max = max_len
input_pad = ["PAD"]
words_to_predict = sent.split()[:len_max]
words_to_predict.extend(input_pad * (len_max - len(words_to_predict)))

sentences_to_predict = [words_to_predict]

# This is necessary because the model expects input with fixed length and batch size:
# fill the rest of the batch with all-"PAD" rows (batch_size - 1 of them).
sentences_to_predict.extend(["PAD"] * len_max for _ in range(batch_size - 1))

sentences_to_predict = np.array(sentences_to_predict)

predicted_result = loaded_model.predict(sentences_to_predict)

# Only the first row is the real sentence; the others are padding rows.
sentence_result = predicted_result[0]
sentence_result = np.argmax(sentence_result, axis=-1)
# NOTE(review): these index -> tag tables are hard-coded snapshots. They must
# match the tag2idx mapping the saved weights were trained with, and that
# mapping is built from a set, so it can differ between runs - confirm.
indices_to_tag_gmb = {0: 'B-geo', 1: 'B-tim', 2: 'I-gpe', 3: 'I-art', 4: 'B-per', 5: 'I-eve', 6: 'B-gpe', 7: 'I-geo', 8: 'B-eve', 9: 'I-nat', 10: 'B-nat', 11: 'I-org', 12: 'I-tim', 13: 'I-per', 14: 'B-org', 15: 'B-art', 16: 'O'}
indices_to_tag_conll = {5: 'O', 1: 'I-PER', 4: 'I-ORG', 6: 'I-MISC', 0: 'I-LOC', 3: 'B-ORG', 7: 'B-MISC', 2: 'B-LOC'}


print("{:15} {:5}".format("Word", "Pred"))
print("="*25)
for word, res in zip(words_to_predict, sentence_result):
    if word != "PAD":
        print("{:15}:{:5}".format(word, indices_to_tag_conll[res]))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Loaded model from disk
Word            Pred 
As             :O    
Harry          :I-PER
rides          :O    
the            :O    
Hogwarts       :I-ORG
Express        :I-ORG
on             :O    
his            :O    
journey        :O    
back           :O    
from           :O    
school         :O    
after          :O    
his            :O    
fourth         :O    
year           :O    
and            :O    
the            :O    
dire           :O    
Triwizard      :I-MISC
Tournament,    :I-MISC
he             :O    
dreads         :O    
disembarking   :O    
from           :O    
the            :O    
train          :O    
