In [1]:
import numpy as np
import pickle

from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPooling1D

from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


## Load the data

In [2]:
### Load Data
with open('atis.pkl', 'rb') as f:
    train_set, valid_set, test_set, dicts = pickle.load(f)

w2idx, ne2idx, labels2idx = dicts['words2idx'], dicts['tables2idx'], dicts['labels2idx']

# Create index to word/label dicts
idx2w  = {w2idx[k]:k for k in w2idx}
idx2ne = {ne2idx[k]:k for k in ne2idx}
idx2la = {labels2idx[k]:k for k in labels2idx}

In [31]:
print(len(train_set), len(train_set[0]))
print(len(labels2idx))
print(sorted(list(labels2idx.values())))
print(idx2ne[0])

3 19915
127
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126]
<NOTABLE>


In [4]:
train_x, train_ne, train_label = train_set
val_x, val_ne, val_label = valid_set
test_x, test_ne, test_label = test_set

X = train_x + val_x + test_x
ne = train_ne + val_ne + test_ne 
label = train_label + val_label + test_label 

# words_test = [ list(map(lambda x: idx2w[x], w)) for w in test_x]
# groundtruth_test = [ list(map(lambda x: idx2la[x], y)) for y in test_label]
# words_val = [ list(map(lambda x: idx2w[x], w)) for w in val_x]
# groundtruth_val = [ list(map(lambda x: idx2la[x], y)) for y in val_label]
# words_train = [ list(map(lambda x: idx2w[x], w)) for w in train_x]
# groundtruth_train = [ list(map(lambda x: idx2la[x], y)) for y in train_label]

In [17]:
print(type(X), type(label))
print(X[0])
print(label[0])
for i in range(5):
    print(len(X[i]))
    print(X[i])
print()
for i in range(5):
    print(len(label[i]))
    print(label[i])

<class 'list'> <class 'list'>
[554 194 268  64  62  16   8 234 481  20  40  58 234 415 205]
[126 126 126  48 126  36  35 126 126  33 126 126 126  78 123]
15
[554 194 268  64  62  16   8 234 481  20  40  58 234 415 205]
8
[554 241 481  14 200  91  26 239]
12
[232   0 273 502 254 481 165 193 208  77 502  64]
10
[439 301 481 532  22 194 208  64 502  77]
10
[439 301 481  99 410 516 208 128 502  69]

15
[126 126 126  48 126  36  35 126 126  33 126 126 126  78 123]
8
[126 126 126 126 126   2  83  83]
12
[126 126 126 126 126 126  42 126 126  48 126  78]
10
[126 126 126   2  83 126 126  48 126  78]
10
[126 126 126  21  66 117 126  48 126  78]


In [7]:
maxlen = max([len(x) for x in X])
print('Maximum sequence length:', maxlen)

Maximum sequence length: 46


In [8]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result

In [11]:
X_enc = pad_sequences(X, maxlen=maxlen)

In [16]:
print(type(X_enc))
print(X_enc.shape)
print(X_enc[:2])

<class 'numpy.ndarray'>
(29355, 46)
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0 554 194 268  64  62
   16   8 234 481  20  40  58 234 415 205]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0 554 241 481  14 200  91  26 239]]


In [33]:
max_label = max(labels2idx.values()) + 1
print(max_label)

y_enc = [[0] * (maxlen - len(ey)) + [ey] for ey in label]
y_enc = [[encode(c, max_label) for c in ey] for ey in y_enc]
y_enc = pad_sequences(y_enc, maxlen=maxlen)

127


In [None]:
print(y_enc[0])
print(len(y_enc[0]))
print(type(y_enc))
print(y_enc.shape)

In [19]:
validation_split = 0.1
test_split = 0.1 

indices = np.arange(X_enc.shape[0])
np.random.shuffle(indices)
X_enc = X_enc[indices]
y_enc = y_enc[indices]
num_validation_samples = int(validation_split * X_enc.shape[0])
num_test_samples = int(test_split * X_enc.shape[0])

X_train = X_enc[:-num_validation_samples-num_test_samples]
y_train = y_enc[:-num_validation_samples-num_test_samples]
X_val = X_enc[-num_validation_samples-num_test_samples:]
y_val = y_enc[-num_validation_samples-num_test_samples:]
X_test = X_enc[-num_test_samples:]
y_test = y_enc[-num_test_samples:]

In [20]:
print('Training and testing tensor shapes:')
print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

Training and testing tensor shapes:
(23485, 46) (5870, 46) (2935, 46) (23485, 46, 127) (5870, 46, 127) (2935, 46, 127)


## Build the model

In [34]:
# n_classes = len(idx2la)
# n_vocab = len(idx2w)

max_features = len(w2idx)
embedding_size = 100
hidden_size = 32
out_size = len(labels2idx)
batch_size = 32
epochs = 30

In [35]:
# Define model
model = Sequential()
# model.add(Embedding(n_vocab,100))
model.add(Embedding(input_dim=max_features, output_dim=embedding_size,
                    input_length=maxlen, mask_zero=False))
model.add(Convolution1D(64,5,padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(GRU(100,return_sequences=True))
model.add(TimeDistributed(Dense(out_size, activation='softmax')))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 46, 100)           57200     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 46, 64)            32064     
_________________________________________________________________
dropout_3 (Dropout)          (None, 46, 64)            0         
_________________________________________________________________
gru_3 (GRU)                  (None, 46, 100)           49500     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 46, 127)           12827     
Total params: 151,591.0
Trainable params: 151,591.0
Non-trainable params: 0.0
_________________________________________________________________


## Train the model 

In [36]:
model.compile('rmsprop', 'categorical_crossentropy')

In [37]:
filepath = "models/NER-ATIS-{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
earlystopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
callbacks_list = [checkpoint, earlystopping]

In [None]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_val, y_val), callbacks=callbacks_list)

Train on 23485 samples, validate on 5870 samples
Epoch 1/30

## Evaluate the model

In [None]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('Raw test score:', score)

In [None]:
def score(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

In [None]:
# On the test set 
pr = model.predict(X_test)
pr = pr.argmax(2)
print(pr.shape)
print(pr[0])
print(pr[0][0])
yh = y_test.argmax(2)
print(yh.shape)
print(yh[0])

In [None]:
fyh, fpr = score(yh, pr)
print('Testing accuracy:', accuracy_score(fyh, fpr))
print('Testing confusion matrix:')
print(confusion_matrix(fyh, fpr))