In [70]:
import numpy as np
import pandas as pd
import json
from future.utils import iteritems
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.optimizers import Adam
from keras_contrib.layers import CRF
from keras.initializers import he_normal
import keras
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [2]:
def get_sentence_dict(path='sentence_dict.json'):
    """Load the preprocessed sentence dictionary from a JSON file.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to ``'sentence_dict.json'``
        relative to the current working directory.

    Returns
    -------
    object
        Whatever the JSON document decodes to (a ``dict`` when the top-level
        JSON value is an object).

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If the file content is not valid JSON.
    """
    # io.open accepts ``encoding`` on both Python 2 and 3. JSON text is UTF-8,
    # so do not depend on the platform's default encoding.
    import io

    with io.open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [3]:
def get_input_dict(lemma=False, sentence_dict=None):
    """Reshape the sentence dictionary into ``{int_key: [3-tuples]}``.

    Each sentence is a sequence of per-word records. For every record a tuple
    ``(record[0 or 2], record[1], record[3])`` is produced.

    Parameters
    ----------
    lemma : bool, optional
        If true, the first tuple element is taken from position 2 of each
        record instead of position 0 (presumably the lemma rather than the
        surface word -- confirm against the code that wrote the JSON file).
    sentence_dict : dict, optional
        Mapping of sentence key to a list of word records. When omitted it is
        loaded with ``get_sentence_dict()``.

    Returns
    -------
    dict
        Keys are the original keys converted with ``int(float(key))`` (so a
        key such as ``"12.0"`` becomes ``12``); values are lists of 3-tuples.
        If two keys convert to the same integer, the later one wins.
    """
    if sentence_dict is None:
        sentence_dict = get_sentence_dict()

    token_pos = 2 if lemma else 0

    return {
        int(float(key)): [(word[token_pos], word[1], word[3]) for word in sentence]
        for key, sentence in sentence_dict.items()
    }

In [84]:
def convert_predictions(y_pred, index_to_tag=None):
    """Turn per-token score (or one-hot) rows into tag labels.

    Parameters
    ----------
    y_pred : iterable
        One entry per sentence; each entry is an iterable of per-token rows.
        The position of the largest value in a row selects the tag.
    index_to_tag : mapping, optional
        Maps an integer class index to its tag label. When omitted, the
        notebook-level ``indices_to_tag`` is used, which must already be
        defined at call time.

    Returns
    -------
    list of list
        For every sentence, the list of tag labels, one per row.
    """
    if index_to_tag is None:
        # Keep the original behaviour: fall back to the global lookup table.
        index_to_tag = indices_to_tag

    return [
        [index_to_tag[np.argmax(token_scores)] for token_scores in sentence]
        for sentence in y_pred
    ]

In [4]:
# Read the preprocessed NER table and keep the five columns of interest,
# in this order.
columns_to_keep = ['sentence_idx', 'word', 'lemma', 'pos', 'tag']
df = pd.read_csv("ner_first_preprocessing.csv")[columns_to_keep]

### Create the lists and dictionaries used to build the input and output training/test sets

In [30]:
# The input dictionary that contains the preprocessed dataset
input_dict = get_input_dict()

# List with possible tags; 'O' is last so it can be sliced off below.
tags = ['B-geo', 'B-tim', 'I-gpe', 'I-art', 'B-per', 'I-eve', 'B-gpe', 'I-geo', 'B-eve', 'I-nat', 'B-nat', 'I-org', 'I-tim', 'I-per', 'B-org', 'B-art', 'O']
tags_without_O = tags[:-1]

# List with unique words in the dataset.
# Sorted so that every run assigns the same index to the same word: iterating a
# plain set of strings gives a different order from one interpreter session to
# the next, which would make learned embeddings unusable after a restart.
# key=str keeps the sort from failing should the column hold non-string values
# (for example NaN for an empty cell).
words = sorted(set(df["word"].values), key=str)
# Extra token appended at the end of the vocabulary.
words.append("EOL")

# Number of unique words (including the extra "EOL" token)
number_of_words = len(words)
# Number of tags
number_of_tags = len(tags)

# The words and tags are converted to appropriate numerical representation.
word_indices = {w: idx for idx, w in enumerate(words)}
tag_indices = {t: idx for idx, t in enumerate(tags)}
# Reverse lookup: class index -> tag label.
indices_to_tag = {idx: tag for tag, idx in tag_indices.items()}


In [31]:
# Token count of the longest sentence in the dataset
len_max = max(len(sentence) for sentence in input_dict.values())
print(len_max)

81


### Create the input/output sequences. Pad every sentence to the same length, because the model takes fixed-length input (the network is built with `Input(shape=(len_max,))`)

In [32]:
# Encode each sentence twice: as word indices (model input) and as tag
# indices (training target). Tuple layout is (token, _, tag).
input_final = []
output_final = []
for sentence in input_dict.values():
    input_final.append([word_indices[token[0]] for token in sentence])
    output_final.append([tag_indices[token[2]] for token in sentence])

# Right-pad everything to len_max: inputs with the "EOL" index, targets with "O".
input_pad = word_indices["EOL"]
input_final = pad_sequences(
    sequences=input_final, maxlen=len_max, padding="post", value=input_pad
)
output_final = pad_sequences(
    sequences=output_final, maxlen=len_max, padding="post", value=tag_indices["O"]
)

# One-hot encode the tag index of every position, sentence by sentence.
output_final = [to_categorical(row, num_classes=len(tags)) for row in output_final]

In [33]:
# Hold out 20% of the sentences for testing. The shuffle is seeded so that the
# same sentences land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    input_final, output_final, test_size=0.2, random_state=42
)

### Create the model

In [79]:
# Hyper-parameters
embedding_size = 20
lstm_units = 40  # equals embedding_size * 2
dropout = 0.5
recurrent_dropout = 0.5

# Input: one sequence of len_max word indices per sentence
input_layer = Input(shape=(len_max,))

# Embedding layer: word index -> dense vector of size embedding_size
embedded = Embedding(
    input_dim=number_of_words,
    output_dim=embedding_size,
    input_length=len_max,
)(input_layer)

# Bi-LSTM layer; return_sequences=True keeps one output per position
recurrent_cell = LSTM(
    units=lstm_units,
    return_sequences=True,
    dropout=dropout,
    recurrent_dropout=recurrent_dropout,
    kernel_initializer=keras.initializers.he_normal(),
)
encoded = Bidirectional(recurrent_cell)(embedded)

# Dense projection to number_of_tags units, applied at every position
tag_features = TimeDistributed(Dense(number_of_tags, activation="relu"))(encoded)

# CRF layer on top of the per-position features
crf = CRF(number_of_tags)

# Output and final model
output_layer = crf(tag_features)
model = Model(input_layer, output_layer)

# Optimiser
adam = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# The CRF layer supplies its own loss and accuracy functions.
model.compile(
    optimizer=adam,
    loss=crf.loss_function,
    metrics=[crf.accuracy, 'accuracy'],
)
model.summary()



Model: "model_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        (None, 81)                0         
_________________________________________________________________
embedding_28 (Embedding)     (None, 81, 20)            603460    
_________________________________________________________________
bidirectional_26 (Bidirectio (None, 81, 80)            19520     
_________________________________________________________________
time_distributed_24 (TimeDis (None, 81, 17)            1377      
_________________________________________________________________
crf_22 (CRF)                 (None, 81, 17)            629       
Total params: 624,986
Trainable params: 624,986
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Train for 5 epochs; 10% of the training data is set aside for validation.
# y_train is a list of per-sentence arrays, so it is stacked into one array.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=32,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25326 samples, validate on 2815 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Run the model on the held-out sentences.
y_pred = model.predict(
    X_test,
    verbose=1,
)

In [81]:
# Decode model output and one-hot ground truth into tag strings.
pred_labels, real_labels = map(convert_predictions, (y_pred, y_test))

In [85]:
# F1 score from seqeval, shown as a percentage.
f1 = f1_score(real_labels, pred_labels)
print("F1-score: {:.2%}".format(f1))

F1-score: 77.54%


In [83]:
# seqeval classification report for the decoded labels.
entity_report = classification_report(real_labels, pred_labels)
print(entity_report)

           precision    recall  f1-score   support

      gpe       0.94      0.90      0.92      2399
      per       0.74      0.66      0.70      2440
      geo       0.79      0.85      0.82      5457
      org       0.65      0.60      0.63      2966
      nat       0.00      0.00      0.00        26
      tim       0.85      0.75      0.80      3008
      art       0.00      0.00      0.00        72
      eve       0.00      0.00      0.00        54

micro avg       0.79      0.76      0.78     16422
macro avg       0.78      0.76      0.77     16422



In [74]:
from sklearn_crfsuite.metrics import flat_classification_report

# Per-tag report restricted to the entity tags (the 'O' tag is left out).
report = flat_classification_report(
    y_true=real_labels,
    y_pred=pred_labels,
    labels=tags_without_O,
)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-geo       0.81      0.86      0.84      5457
       B-tim       0.89      0.79      0.84      3008
       I-gpe       0.00      0.00      0.00        27
       I-art       0.00      0.00      0.00        49
       B-per       0.84      0.75      0.79      2440
       I-eve       0.00      0.00      0.00        49
       B-gpe       0.95      0.91      0.93      2399
       I-geo       0.78      0.62      0.69      1071
       B-eve       0.00      0.00      0.00        54
       I-nat       0.00      0.00      0.00        11
       B-nat       0.00      0.00      0.00        26
       I-org       0.74      0.71      0.73      2399
       I-tim       0.90      0.45      0.60       903
       I-per       0.85      0.80      0.82      2490
       B-org       0.71      0.63      0.67      2966
       B-art       0.00      0.00      0.00        72

   micro avg       0.82      0.76      0.79     23421
   macro avg       0.47   

In [86]:
import seqeval

In [None]:
# Bare expression: evaluates the `seqeval` module object so the notebook shows
# its repr as the cell output. It has no effect on the analysis.
seqeval