Part-of-speech tagging on the CoNLL-2000 chunking dataset. Dataset reference: http://www.cnts.ua.ac.be/conll2000/chunking/

In [36]:
import glob
import os
import pickle

import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPooling1D

from keras.callbacks import ModelCheckpoint, EarlyStopping

## Load the data 

In [12]:
# Read the pre-processed corpus from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only open pickle
# files that come from a trusted source.
with open('data/pos_conll.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [13]:
# Pull the three top-level sections out of the loaded dictionary once.
train_split, test_split, stats = data['train'], data['test'], data['stats']

# Each split carries the tokenised sentences plus two parallel label sequences.
X_train, tags_train, chunks_train = (
    train_split[key] for key in ('X', 'tags', 'chunks')
)
X_test, tags_test, chunks_test = (
    test_split[key] for key in ('X', 'tags', 'chunks')
)

# Corpus statistics and the lookup tables used for encoding / decoding.
maxlen = stats['maxlen']
word2ind = stats['word2ind']
ind2word = stats['ind2word']
label2ind = stats['label2ind']
ind2label = stats['ind2label']

In [47]:
# Quick look at the vocabulary size.
# Sequences are padded with 0 when they are encoded, so anything sized from
# this dictionary uses len(word2ind) + 1 entries.
# NOTE(review): presumably real word indices start at 1 - verify against the pickle.
vocab_size = len(word2ind)
print(vocab_size)

21589


## Map to indices

In [28]:
def encode_one_hot(idx, dim):
    """Return a list of `dim` zeros with a single 1 stored at position `idx`.

    Standard list indexing applies: a negative `idx` counts from the end and
    an out-of-range `idx` raises IndexError.
    """
    one_hot = [0 for _ in range(dim)]
    one_hot[idx] = 1
    return one_hot

def encode_corpus(X, maxlen):
    """Turn tokenised sentences into a padded matrix of word indices.

    Every word is looked up in the module-level `word2ind` table (a word that
    is missing from the table raises KeyError), then the index sequences are
    handed to `pad_sequences` with `maxlen` and 0 as the fill value.
    """
    index_rows = []
    for sentence in X:
        index_rows.append([word2ind[token] for token in sentence])
    return pad_sequences(index_rows, maxlen=maxlen, value=0)

def encode_labels(Y, maxlen, dim):
    """Encode tag sequences as a padded one-hot integer array.

    Parameters
    ----------
    Y : iterable of tag sequences; every tag must be a key of the module-level
        `label2ind` table (an unknown tag raises KeyError).
    maxlen : target sequence length passed to `pad_sequences`.
    dim : size of the one-hot axis; must exceed the largest label index.

    Returns
    -------
    numpy.ndarray with one row per sequence, `maxlen` positions per row and a
    one-hot vector of length `dim` per position. Padding positions use label
    index 0, i.e. they are one-hot encoded as class 0.
    """
    label_ids = [[label2ind[tag] for tag in y] for y in Y]
    label_ids = pad_sequences(label_ids, maxlen=maxlen, value=0)
    # Row i of the identity matrix is the one-hot vector for class i, so
    # indexing it with the padded id matrix builds the whole tensor in one
    # vectorised step instead of materialising a Python list per position.
    return np.eye(dim, dtype=int)[label_ids]

In [29]:
# One output class per known label, plus one extra slot for the padding index 0.
dim = 1 + len(ind2label)
print(dim)

# Encode the training sentences and their POS tags with the helpers above.
X_train_enc = encode_corpus(X_train, maxlen=maxlen)
y_train_enc = encode_labels(tags_train, maxlen=maxlen, dim=dim)

45


In [32]:
# Sanity-check the encoded training arrays: shape of the inputs, container
# types, and shape of the one-hot labels.
print(X_train_enc.shape)
print(type(X_train_enc), type(X_train_enc[0]))
print(y_train_enc.shape)

(8936, 78)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(8936, 78, 45)


In [40]:
validation_split = 0.1
split_seed = 42  # fixed so the train/validation partition is reproducible

# NOTE: this cell overwrites X_train_enc / y_train_enc with the reduced
# training portion, so re-running it on its own splits the already-reduced
# set again. Re-run the encoding cell first.
X_enc = X_train_enc
y_enc = y_train_enc

# Shuffle inputs and labels with the same permutation. A dedicated RandomState
# keeps the split deterministic without touching NumPy's global RNG.
rng = np.random.RandomState(split_seed)
indices = rng.permutation(X_enc.shape[0])
X_enc = X_enc[indices]
y_enc = y_enc[indices]
num_validation_samples = int(validation_split * X_enc.shape[0])

# Slice at an explicit boundary. Negative slicing ([:-n] / [-n:]) silently
# yields an empty training set and an all-data validation set when n == 0.
num_train_samples = X_enc.shape[0] - num_validation_samples

X_train_enc = X_enc[:num_train_samples]
y_train_enc = y_enc[:num_train_samples]
X_val_enc = X_enc[num_train_samples:]
y_val_enc = y_enc[num_train_samples:]

In [41]:
# Report the shapes of the training and validation arrays, inputs then labels.
for split_array in (X_train_enc, y_train_enc, X_val_enc, y_val_enc):
    print(split_array.shape)

(8043, 78)
(8043, 78, 45)
(893, 78)
(893, 78, 45)


## Model 

In [34]:
# Sizes derived from the lookup tables (+1 leaves room for the padding index 0).
max_features = 1 + len(word2ind)
out_size = 1 + len(label2ind)

# Layer sizes.
embedding_size = 100
# NOTE(review): hidden_size is not referenced by the model cell, which
# hard-codes the GRU width - confirm whether it should be wired in.
hidden_size = 32

# Training schedule.
batch_size = 32
epochs = 10

In [37]:
# Tagger architecture, listed in the order the data flows through it:
# embedding -> 1D convolution -> dropout -> GRU -> per-timestep softmax.
# mask_zero is off, so padded positions (index 0) are processed like any other
# token.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=embedding_size,
              input_length=maxlen, mask_zero=False),
    Convolution1D(64, 5, padding='same', activation='relu'),
    Dropout(0.25),
    GRU(100, return_sequences=True),
    TimeDistributed(Dense(out_size, activation='softmax')),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 78, 100)           2159000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 78, 64)            32064     
_________________________________________________________________
dropout_1 (Dropout)          (None, 78, 64)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 78, 100)           49500     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 78, 45)            4545      
Total params: 2,245,109.0
Trainable params: 2,245,109.0
Non-trainable params: 0.0
_________________________________________________________________


### Training 

In [38]:
# RMSprop optimiser with categorical cross-entropy loss; no extra metrics.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [39]:
# Checkpoint filename template: epoch number and validation loss are filled in
# at save time.
filepath = "models/POS-conll-{epoch:02d}-{val_loss:.2f}.hdf5"

# Make sure the checkpoint directory exists before training starts, so the
# first save cannot fail on a missing folder.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Save a checkpoint only when the validation loss improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
# Stop once the validation loss has not improved for `patience` epochs.
earlystopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')
callbacks_list = [checkpoint, earlystopping]

In [43]:
# Train on the shuffled training portion and monitor the held-out validation
# portion; the callbacks handle checkpointing and early stopping.
model.fit(
    x=X_train_enc,
    y=y_train_enc,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_enc, y_val_enc),
    callbacks=callbacks_list,
)

Train on 8043 samples, validate on 893 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00009: early stopping


<keras.callbacks.History at 0x11cb4ce80>

### Evaluation

In [44]:
# Checkpoint filenames embed the epoch and validation loss of one particular
# training run, so a hard-coded name stops working after retraining. The
# checkpoint callback only writes a file when val_loss improves
# (save_best_only=True), so the most recently written checkpoint is the best
# model of the latest run.
checkpoint_paths = glob.glob('models/POS-conll-*.hdf5')
if not checkpoint_paths:
    raise FileNotFoundError(
        "No checkpoint matching 'models/POS-conll-*.hdf5' - train the model first."
    )
# Ties on modification time are broken by filename, which sorts by the
# zero-padded epoch number.
best_checkpoint = max(checkpoint_paths, key=lambda path: (os.path.getmtime(path), path))

model = load_model(best_checkpoint)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 78, 100)           2159000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 78, 64)            32064     
_________________________________________________________________
dropout_1 (Dropout)          (None, 78, 64)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 78, 100)           49500     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 78, 45)            4545      
Total params: 2,245,109.0
Trainable params: 2,245,109.0
Non-trainable params: 0.0
_________________________________________________________________


In [45]:
# Build the test tensors with the same helpers and dimensions as the training set.
X_test_enc = encode_corpus(X_test, maxlen=maxlen)
y_test_enc = encode_labels(tags_test, maxlen=maxlen, dim=dim)

for test_array in (X_test_enc, y_test_enc):
    print(test_array.shape)

(2012, 78)
(2012, 78, 45)


In [46]:
# Show the first two encoded test sentences (zeros on the left are padding).
print(X_test_enc[:2])

# The model was compiled without extra metrics, so evaluate() returns the loss.
# It is stored as `test_loss` rather than `score` so it cannot clobber the
# `score()` helper function; that padding-inclusive loss is what the printed
# "raw" score refers to.
test_loss = model.evaluate(X_test_enc, y_test_enc, batch_size=batch_size, verbose=0)
print('Raw test score:', test_loss)

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0 17692  1454 19800 21309  9035 13876  1934 15656 13811 10779
  15966   997  9516 16317 16582 16184  6588  9370 10420 17953 11716  2722
  14023  6588 21309  7632   101 21174]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0 17692  1934  5048   997 19679 14023 15656 10420 14791 19443 10113
  16438  4453 14023  5048  2202 21174]]
Raw test score: 0.0671464220934


In [51]:
def score(yh, pr):
    """Strip left padding and flatten gold / predicted label ids.

    For every sequence, the first position whose gold label is greater than 0
    marks the start of the real tokens; both the gold row and the matching
    prediction row are cut from that position onward.

    Parameters
    ----------
    yh : 2-D array-like of gold label indices, left-padded with 0.
    pr : 2-D array-like of predicted label indices, aligned with `yh`.

    Returns
    -------
    (fyh, fpr) : two flat lists of equal length holding the gold and predicted
        labels of all non-padding positions, in sequence order.

    Rows whose gold labels are all 0 hold no real tokens and are skipped;
    previously such a row raised IndexError.
    """
    fyh, fpr = [], []
    for gold_row, pred_row in zip(yh, pr):
        real_positions = np.flatnonzero(np.asarray(gold_row) > 0)
        if real_positions.size == 0:
            # Pure padding: nothing to score for this sequence.
            continue
        start = real_positions[0]
        fyh.extend(gold_row[start:])
        fpr.extend(pred_row[start:])
    return fyh, fpr

In [49]:
# Collapse the per-class axis to label ids, for the predictions and for the
# one-hot gold labels alike.
pr = model.predict(X_test_enc).argmax(axis=2)
yh = y_test_enc.argmax(axis=2)

# Predictions: overall shape, first sequence, and its first position.
print(pr.shape)
print(pr[0])
print(pr[0, 0])

# Gold labels: overall shape and first sequence, for a side-by-side comparison.
print(yh.shape)
print(yh[0])

(2012, 78)
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 27 27 27  3 27 26  1 14  1  6 42 26 15 39 26  4 27 27 17 19 42 20  4 27  3
 27 26 13]
0
(2012, 78)
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 27 27 27  3 27 26  1 14  1  6 42 26 15 39 26  4 27 27 17 19 42 20  4 27  3
 28 20 13]


In [52]:
# Score only the real tokens: padding is stripped before computing the metrics.
fyh, fpr = score(yh, pr)
test_accuracy = accuracy_score(fyh, fpr)
test_confusion = confusion_matrix(fyh, fpr)

print('Testing accuracy:', test_accuracy)
print('Testing confusion matrix:')
print(test_confusion)

Testing accuracy: 0.945923127256
Testing confusion matrix:
[[1594    0    0 ...,    0    0    0]
 [   0    4    0 ...,    0    0    0]
 [   0    0  434 ...,    0    0    0]
 ..., 
 [   1    0    0 ..., 2639    0    0]
 [   0    0    0 ...,    1   40    0]
 [   0    0    0 ...,    0    0 2390]]
