In [1]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, SimpleRNN
from keras.utils import np_utils
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Locations of the training split and the evaluation split.
train_file = './data764/eng.train'
test_file = './data764/eng.testa'

# Iterating an open text file yields its lines one by one (line endings
# kept), so list(handle) builds the same list that readlines() returns.
with open(train_file) as train_handle:
    train_data = list(train_handle)

with open(test_file) as test_handle:
    test_data = list(test_handle)

In [3]:
# Peek at the whitespace-separated fields of the first raw training line.
train_data[0].split()

['-DOCSTART-', '-X-', 'O', 'O']

In [4]:
# Split every 4-field training line into four parallel per-token lists
# (word, POS tag, chunk tag, entity label). Lines with any other number of
# whitespace-separated fields are skipped.
# NOTE(review): 4-field marker lines such as '-DOCSTART- -X- O O' pass the
# length check and are stored as ordinary tokens - confirm this is intended.
train_words, train_pos, train_chunk, train_entity = [], [], [], []
for line in train_data:
    fields = line.split()
    if len(fields) != 4:
        continue
    token, pos_tag, chunk_tag, entity = fields
    # Drop the two-character 'I-'/'B-' prefix from the known entity tags;
    # any other tag is stored unchanged.
    if entity in ('I-LOC', 'B-LOC', 'I-PER', 'B-PER',
                  'I-ORG', 'B-ORG', 'I-MISC', 'B-MISC'):
        entity = entity[2:]
    train_words.append(token)
    train_pos.append(pos_tag)
    train_chunk.append(chunk_tag)
    train_entity.append(entity)

In [5]:
# Split every 4-field test line into four parallel per-token lists
# (word, POS tag, chunk tag, entity label). Lines with any other number of
# whitespace-separated fields are skipped.
test_words, test_pos, test_chunk, test_entity = [], [], [], []
for line in test_data:
    fields = line.split()
    if len(fields) != 4:
        continue
    token, pos_tag, chunk_tag, entity = fields
    # Drop the two-character 'I-'/'B-' prefix from the known entity tags;
    # any other tag is stored unchanged.
    if entity in ('I-LOC', 'B-LOC', 'I-PER', 'B-PER',
                  'I-ORG', 'B-ORG', 'I-MISC', 'B-MISC'):
        entity = entity[2:]
    test_words.append(token)
    test_pos.append(pos_tag)
    test_chunk.append(chunk_tag)
    test_entity.append(entity)


In [6]:
# Despite the name, this holds the set of distinct entity labels seen in
# either split, not a count.
number_of_labels = set(train_entity) | set(test_entity)

In [7]:
# Every distinct character that occurs in any training or test token.
unique_chars = {symbol for token in train_words + test_words for symbol in token}

In [8]:
# Map each character to a positive integer id. Ids start at 1 so that 0 is
# never assigned to a real character (the Embedding layer below is built
# with mask_zero=True). The characters are sorted first because iteration
# order of a set of strings is not stable across interpreter runs; sorting
# makes the character -> id mapping reproducible after a kernel restart.
char_dict = {
    symbol: index
    for index, symbol in enumerate(sorted(unique_chars), start=1)
}

In [9]:
def convert_word_2_vec(input_words, char_to_index=None):
    """Encode each word as the list of integer ids of its characters.

    Args:
        input_words: Iterable of strings to encode.
        char_to_index: Optional mapping from a single character to its
            integer id. When omitted, the notebook-level ``char_dict`` is
            used, so existing one-argument calls behave as before.

    Returns:
        A list containing one list of ids per input word, in input order.
        An empty word yields an empty list.

    Raises:
        KeyError: If a character is not present in the mapping.
    """
    if char_to_index is None:
        # Resolved at call time so the function follows later rebuilds of
        # the global mapping.
        char_to_index = char_dict
    return [[char_to_index[symbol] for symbol in token] for token in input_words]

In [10]:
# Encode the tokens of both splits as lists of character ids.
train_word_vectors, test_word_vectors = map(
    convert_word_2_vec, (train_words, test_words)
)

In [11]:
# Length of the longest token across both splits (0 if there are no tokens).
max_word_length = max(
    (len(token) for token in train_words + test_words),
    default=0,
)

In [12]:
# Fit ONE encoder on the training labels and reuse it for the test labels.
# Fitting a separate encoder per split (as was done before) only yields the
# same label -> id assignment when both splits happen to contain exactly the
# same label set; otherwise the test targets would silently be encoded with
# different ids than the ones the model is trained on. With a shared encoder
# a test label that never occurs in training raises instead.
label_encoder = preprocessing.LabelEncoder().fit(train_entity)

# Pin the one-hot width to the number of fitted classes so both matrices
# have the same number of columns even if a split lacks the highest class id.
num_label_classes = len(label_encoder.classes_)

train_one_hot = np_utils.to_categorical(
    label_encoder.transform(train_entity), num_classes=num_label_classes
)
test_one_hot = np_utils.to_categorical(
    label_encoder.transform(test_entity), num_classes=num_label_classes
)

In [13]:
# Bring every encoded word in both splits to the common length
# max_word_length (pad_sequences is called with its default padding settings).
train_words_padded, test_words_padded = (
    sequence.pad_sequences(word_vectors, maxlen=max_word_length)
    for word_vectors in (train_word_vectors, test_word_vectors)
)

# Target matrices handed to Keras, as NumPy arrays.
train_entities, test_entities = np.array(train_one_hot), np.array(test_one_hot)

In [14]:
batch_size = 32
epochs = 15

# Character ids run from 1 to len(char_dict) and 0 is the masked value
# (mask_zero=True), so the embedding table needs len(char_dict) + 1 rows.
# A hard-coded size breaks as soon as the data contains more characters
# than the constant allows.
vocab_size = len(char_dict) + 1

# One output unit per one-hot label column, instead of a hard-coded count.
num_classes = train_entities.shape[1]

model = Sequential()
model.add(Embedding(vocab_size, 128, mask_zero=True, input_length=max_word_length))
model.add(SimpleRNN(64))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the evaluation split doubles as validation data here, so the
# per-epoch validation numbers are not an independent estimate - confirm.
# validation_data is passed as an (inputs, targets) tuple.
model.fit(train_words_padded, train_entities,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(test_words_padded, test_entities))

Train on 204567 samples, validate on 51578 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1a3f9e7c18>

In [15]:


test_prediction = model.predict(test_words_padded)

# Column i of the model output corresponds to the i-th training label in
# sorted order, which is how LabelEncoder numbers the classes it is fitted
# on. Deriving the names from the data replaces the hand-written
# index -> label chain, which silently skipped any index it did not list and
# so could leave test_pred shorter than (and misaligned with) test_entity.
# An out-of-range index now raises IndexError instead.
label_names = np.array(sorted(set(train_entity)))

# One predicted label string per test token, in input order.
test_pred = label_names[np.argmax(test_prediction, axis=1)].tolist()



In [16]:
# precision_recall_fscore_support is already imported in the notebook's first
# cell, so the redundant mid-notebook re-import was dropped.
# Returns four arrays (precision, recall, F-score, support) with one entry
# per class.
# NOTE(review): the class order within each array is presumably the sorted
# label order - confirm against the scikit-learn documentation.
precision_recall_fscore_support(test_entity, test_pred)

(array([0.739238  , 0.80346232, 0.98238414, 0.56059701, 0.69348442]),
 array([0.71346705, 0.62223975, 0.99141361, 0.44885277, 0.77738965]),
 array([0.72612394, 0.70133333, 0.98687822, 0.49853995, 0.73304387]),
 array([ 2094,  1268, 42975,  2092,  3149]))