Reference: https://gist.github.com/dirko/1d596ca757a541da96ac3caa6f291229

In [2]:
import pickle 
import numpy as np 

# from sklearn.cross_validation import train_test_split
# from lambdawithmask import Lambda as MaskLambda
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

# from keras.layers.recurrent import LSTM
# from keras.layers.core import Activation, Dense, Input
# from keras.layers.embeddings import Embedding
# from keras.layers.wrappers import TimeDistributed, Bidirectional

from keras.layers import Input, Dense, TimeDistributed
from keras.layers import Embedding, Activation
from keras.layers import GRU, LSTM, Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping


from keras.backend import tf

Using TensorFlow backend.


## Load the data

In [5]:
with open('conll.pkl', 'rb') as f:
    data = pickle.load(f)

In [6]:
X = data['X']
y = data['y']
word2ind = data['word2ind']
ind2word = data['ind2word']
label2ind = data['label2ind']
ind2label = data['ind2label']

In [7]:
print(len(X))
print(len(X[0]))
print(X[0])

print(len(y))
print(len(y[0]))
print(y[0])

print(label2ind)
print(ind2label)

3640
15
['010', 'is', 'the', 'tenth', 'album', 'from', 'Japanese', 'Punk', 'Techno', 'band', 'The', 'Mad', 'Capsule', 'Markets', '.']
3640
15
['I-MISC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']
{'I-PER': 1, 'I-LOC': 2, 'I-ORG': 4, 'I-MISC': 5, 'O': 3}
{1: 'I-PER', 2: 'I-LOC', 3: 'O', 4: 'I-ORG', 5: 'I-MISC'}


In [8]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result

In [9]:
maxlen = max([len(x) for x in X])
print('Maximum sequence length:', maxlen)

X_enc = [[word2ind[c] for c in x] for x in X]
# X_enc_reverse = [[c for c in reversed(x)] for x in X_enc]
X_enc = pad_sequences(X_enc, maxlen=maxlen)
# X_enc_b = pad_sequences(X_enc_reverse, maxlen=maxlen)

Maximum sequence length: 63


In [10]:
print(type(X_enc))
print(X_enc.shape)

<class 'numpy.ndarray'>
(3640, 63)


In [11]:
max_label = max(label2ind.values()) + 1
print(max_label)

y_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y]
print(y_enc[0])
y_enc = [[encode(c, max_label) for c in ey] for ey in y_enc]
print(len(y_enc[0]))
y_enc = pad_sequences(y_enc, maxlen=maxlen)
print(type(y_enc))
print(y_enc.shape)

# (X_train_f, X_test_f, X_train_b,
#  X_test_b, y_train, y_test) = train_test_split(X_enc_f, X_enc_b, y_enc,
#                                                test_size=11*32, train_size=45*32, random_state=42)

6
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 3, 3, 3, 3, 5, 3, 3, 3, 4, 4, 4, 4, 3]
63
<class 'numpy.ndarray'>
(3640, 63, 6)


In [12]:
validation_split = 0.1
test_split = 0.1 

indices = np.arange(X_enc.shape[0])
np.random.shuffle(indices)
X_enc = X_enc[indices]
y_enc = y_enc[indices]
num_validation_samples = int(validation_split * X_enc.shape[0])
num_test_samples = int(test_split * X_enc.shape[0])

X_train = X_enc[:-num_validation_samples-num_test_samples]
y_train = y_enc[:-num_validation_samples-num_test_samples]
X_val = X_enc[-num_validation_samples-num_test_samples:]
y_val = y_enc[-num_validation_samples-num_test_samples:]
X_test = X_enc[-num_test_samples:]
y_test = y_enc[-num_test_samples:]

In [13]:
print('Training and testing tensor shapes:')
print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

Training and testing tensor shapes:
(2912, 63) (728, 63) (364, 63) (2912, 63, 6) (728, 63, 6) (364, 63, 6)


## Build the model 

In [15]:
max_features = len(word2ind)
embedding_size = 128
hidden_size = 32
out_size = len(label2ind) + 1
batch_size = 32
epochs = 30

In [3]:
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=embedding_size,
                    input_length=maxlen, mask_zero=True))
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True)))
model.add(TimeDistributed(Dense(out_size)))
model.add(Activation('softmax'))

model.summary()

NameError: name 'max_features' is not defined

## Train the model 

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [19]:
filepath = "models/NER-Wikigold-{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
earlystopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
callbacks_list = [checkpoint, earlystopping]

In [20]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_val, y_val), callbacks=callbacks_list)

Train on 2912 samples, validate on 728 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 00017: early stopping


<keras.callbacks.History at 0x11c2d1128>

## Evaluate the model

In [3]:
model = load_model('models/NER-Wikigold-09-0.08.hdf5')

In [16]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('Raw test score:', score)



In [17]:
def score(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

In [18]:
# On the training set ]

pr = model.predict(X_train)
pr = pr.argmax(2)
print(pr.shape)
print(pr[1])
print(pr[0][0])
yh = y_train.argmax(2)
print(yh.shape)
print(yh[1])

(2912, 63)
[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3]
3
(2912, 63)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 4 3 3 3 3 3]


In [19]:
fyh, fpr = score(yh, pr)
print('Testing accuracy:', accuracy_score(fyh, fpr))
print('Testing confusion matrix:')
print(confusion_matrix(fyh, fpr))

Testing accuracy: 0.995099418884
Testing confusion matrix:
[[ 2513     0    23     4     0]
 [    2  2195    49    22     3]
 [    5    10 50263    20    23]
 [    2     3    38  3015     4]
 [    8     4    64    12  2119]]


In [20]:
# On the validatiotn set

pr = model.predict(X_val)
pr = pr.argmax(2)
print(pr.shape)
print(pr[0])
print(pr[0][0])
yh = y_val.argmax(2)
print(yh.shape)
print(yh[0])

(728, 63)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3]
1
(728, 63)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 3 3 3 3 3]


In [21]:
fyh, fpr = score(yh, pr)
print('Testing accuracy:', accuracy_score(fyh, fpr))
print('Testing confusion matrix:')
print(confusion_matrix(fyh, fpr))

Testing accuracy: 0.997699127022
Testing confusion matrix:
[[  517     0     1     0     0]
 [    0   529     3     0     3]
 [    1     0 12513     0     1]
 [    0     3    10   773     2]
 [    0     0     8     2   411]]


In [23]:
# On the test set 
pr = model.predict(X_test)
pr = pr.argmax(2)
print(pr.shape)
print(pr[0])
print(pr[0][0])
yh = y_test.argmax(2)
print(yh.shape)
print(yh[0])

(364, 63)
[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
3
(364, 63)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [24]:
fyh, fpr = score(yh, pr)
print('Testing accuracy:', accuracy_score(fyh, fpr))
print('Testing confusion matrix:')
print(confusion_matrix(fyh, fpr))

Testing accuracy: 0.998226466576
Testing confusion matrix:
[[ 228    0    0    0    0]
 [   0  284    0    0    2]
 [   1    0 6232    0    1]
 [   0    0    3  380    0]
 [   0    0    6    0  193]]
