In [1]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling2D, Activation, Flatten
from keras.utils import np_utils
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Paths to the training and evaluation data files.
train_file = './data764/eng.train'
test_file = './data764/eng.testa'

# Read each file fully into memory as a list of raw lines (line endings kept).
with open(train_file) as train_handle:
    train_data = list(train_handle)

with open(test_file) as test_handle:
    test_data = list(test_handle)

In [3]:
# Display the whitespace-separated fields of the first training line.
train_data[0].split()

['-DOCSTART-', '-X-', 'O', 'O']

In [4]:
# Column-wise containers for the training split: one entry per kept line.
train_words, train_pos, train_chunk, train_entity = [], [], [], []

# Entity tags whose "I-"/"B-" prefix is dropped; any other tag is kept unchanged.
collapsed_tags = {
    'I-LOC': 'LOC', 'B-LOC': 'LOC',
    'I-PER': 'PER', 'B-PER': 'PER',
    'I-ORG': 'ORG', 'B-ORG': 'ORG',
    'I-MISC': 'MISC', 'B-MISC': 'MISC',
}

for line in train_data:
    fields = line.split()
    # Keep only lines made of exactly four whitespace-separated fields.
    if len(fields) != 4:
        continue
    token, pos_tag, chunk_tag, raw_tag = fields
    train_words.append(token)
    train_pos.append(pos_tag)
    train_chunk.append(chunk_tag)
    train_entity.append(collapsed_tags.get(raw_tag, raw_tag))

In [5]:
# Column-wise containers for the evaluation split: one entry per kept line.
test_words, test_pos, test_chunk, test_entity = [], [], [], []

# Entity tags whose "I-"/"B-" prefix is dropped; any other tag is kept unchanged.
collapsed_tags = {
    'I-LOC': 'LOC', 'B-LOC': 'LOC',
    'I-PER': 'PER', 'B-PER': 'PER',
    'I-ORG': 'ORG', 'B-ORG': 'ORG',
    'I-MISC': 'MISC', 'B-MISC': 'MISC',
}

for line in test_data:
    fields = line.split()
    # Keep only lines made of exactly four whitespace-separated fields.
    if len(fields) != 4:
        continue
    token, pos_tag, chunk_tag, raw_tag = fields
    test_words.append(token)
    test_pos.append(pos_tag)
    test_chunk.append(chunk_tag)
    test_entity.append(collapsed_tags.get(raw_tag, raw_tag))


In [6]:
# Distinct entity labels seen in either split.
# Note: despite its name this holds the set of label strings, not a count.
number_of_labels = set(train_entity) | set(test_entity)

In [7]:
# Gather every distinct character that occurs in any word of either split.
unique_chars = set()
for token in train_words + test_words:
    unique_chars.update(token)

In [8]:
# Map each character to a positive integer id (ids start at 1, so 0 is never
# assigned to a real character).
# The characters are sorted first: iterating a set of strings directly yields
# an order that can change from one kernel session to the next (string hashes
# are randomized per process), which would make the encoding - and therefore
# the trained model's inputs - irreproducible across "Restart & Run All".
char_dict = {
    char: char_id
    for char_id, char in enumerate(sorted(unique_chars), start=1)
}

In [9]:
def convert_word_2_vec(input_words, char_index=None):
    """Encode each word as the list of integer ids of its characters.

    Args:
        input_words: Iterable of strings to encode.
        char_index: Optional mapping from a single character to its integer
            id. When omitted, the notebook-level ``char_dict`` is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        A list containing one list of ids per input word, in input order.
        An empty word yields an empty list.

    Raises:
        KeyError: If a word contains a character missing from the mapping.
    """
    if char_index is None:
        # Looked up at call time so the cell defining char_dict may be re-run.
        char_index = char_dict
    return [[char_index[char] for char in word] for word in input_words]

In [10]:
# Encode the words of both splits as lists of character ids.
train_word_vectors, test_word_vectors = (
    convert_word_2_vec(split_words) for split_words in (train_words, test_words)
)

In [11]:
# Character length of the longest word in either split (0 if there are no words).
max_word_length = max(map(len, train_words + test_words), default=0)
max_word_length

61

In [12]:

# Fit ONE encoder on the training labels and reuse it for the evaluation
# labels. Fitting a separate encoder per split only gives matching class
# indices when both splits happen to contain exactly the same label set;
# otherwise the same index would mean different labels (or the one-hot width
# would differ) between training and evaluation.
# transform() raises ValueError if the evaluation split contains a label that
# never occurs in training, making such a mismatch explicit.
label_encoder = preprocessing.LabelEncoder().fit(train_entity)
num_classes = len(label_encoder.classes_)

# Pin the one-hot width so both arrays always have num_classes columns.
train_one_hot = np_utils.to_categorical(
    label_encoder.transform(train_entity), num_classes=num_classes)
test_one_hot = np_utils.to_categorical(
    label_encoder.transform(test_entity), num_classes=num_classes)

In [13]:
# Bring every character-id sequence to a common length of max_word_length.
train_words_padded = sequence.pad_sequences(train_word_vectors, maxlen=max_word_length)
test_words_padded = sequence.pad_sequences(test_word_vectors, maxlen=max_word_length)

# Append a trailing axis of size 1 so each sample has the two-dimensional
# shape that the Conv1D input layer is configured with.
train_feature = train_words_padded[:, :, np.newaxis]
test_feature = test_words_padded[:, :, np.newaxis]

# Targets: array copies of the one-hot encoded labels.
train_entities = np.array(train_one_hot)
test_entities = np.array(test_one_hot)

train_entities.shape

(204567, 5)

In [14]:
# Display the shape of the evaluation feature array as a sanity check.
test_feature.shape

(51578, 61, 1)

In [15]:


# CNN over per-word character-id sequences.
#
# Hyperparameters are defined once here and actually passed to fit(); the
# previous version defined `epochs = 10` and `batch_size` but then trained
# with hard-coded literals (20 epochs), so the variables were misleading.
# `epochs` is set to 20 to match what was really run.
batch_size = 128
epochs = 20

# Derive layer sizes from the prepared arrays instead of repeating literals,
# so the model stays consistent if the data (word length / label set) changes.
input_shape = train_feature.shape[1:]    # per-sample shape, e.g. (61, 1)
num_outputs = train_entities.shape[1]    # one unit per one-hot label column

model = Sequential()
model.add(Conv1D(32, kernel_size=3, strides=1, padding='same',
                 activation='relu', input_shape=input_shape))
model.add(Conv1D(64, kernel_size=3, strides=1, padding='same',
                 activation='relu'))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(units=num_outputs, activation='softmax'))

model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

# The evaluation split is used as validation data for per-epoch monitoring.
model.fit(train_feature, train_entities,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(test_feature, test_entities)
          )

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 204567 samples, validate on 51578 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a1e8c4128>

In [16]:


test_prediction = model.predict(test_feature)

# Output column i corresponds to the i-th training label in sorted order,
# which is how LabelEncoder numbers classes (LOC, MISC, O, ORG, PER for this
# data). Deriving the names from the data replaces a hard-coded if/elif table
# that had no fallback branch: an unexpected index would silently drop a
# prediction and misalign test_pred with test_entity.
label_names = np.array(sorted(set(train_entity)))
if test_prediction.shape[1] != len(label_names):
    raise ValueError(
        'model outputs %d classes but %d labels are known'
        % (test_prediction.shape[1], len(label_names)))

# Most probable class per sample, decoded back to plain Python label strings.
test_pred = label_names[np.argmax(test_prediction, axis=1)].tolist()



In [17]:
# Per-class precision, recall, F-score and support on the evaluation split,
# returned as four arrays with one entry per label.
# precision_recall_fscore_support is already imported in the notebook's first
# cell, so it is not re-imported here.
precision_recall_fscore_support(test_entity, test_pred)

(array([0.75769613, 0.74906716, 0.94922474, 0.6037037 , 0.7128    ]),
 array([0.72874881, 0.63328076, 0.98008144, 0.46749522, 0.56589393]),
 array([0.7429406 , 0.68632479, 0.96440633, 0.52693966, 0.63090813]),
 array([ 2094,  1268, 42975,  2092,  3149]))