In [1]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, SimpleRNN
from keras.utils import np_utils
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import math

train_gs_file = './data764/NEEL2016-training_neel.gs'
train_tsv_file = './data764/NEEL2016-training.tsv'
test_gs_file = './data764/NEEL2016-test_neel.gs'
test_tsv_file = './data764/NEEL2016-test.tsv'

# Gold-standard (.gs) rows are whitespace separated:
# col_names=['tweet_id','start','end','uri', 'confidence', 'type']


def _read_lines(path):
    """Return all lines of ``path`` (line endings kept).

    The encoding is given explicitly because the start/end offsets in the
    gold-standard files are used to slice the decoded tweet text; a
    platform-dependent default encoding could shift those offsets.
    NOTE(review): assumes the data files are UTF-8 -- confirm.
    """
    with open(path, encoding='utf-8') as f:
        return f.readlines()


train_gs_data = _read_lines(train_gs_file)
train_tsv_data = _read_lines(train_tsv_file)
test_gs_data = _read_lines(test_gs_file)
test_tsv_data = _read_lines(test_tsv_file)

# Tweet id -> tweet text. A usable TSV row has the form "|<id>|,|<text>|";
# rows that do not split into exactly two parts on '|,|' are skipped.
train_tweet_dict = {}
for line in train_tsv_data:
    data = line.split('|,|')
    if len(data) != 2:
        continue
    tweet_id = data[0][1:]             # drop the leading '|'
    tweet_text = data[1].strip()[:-1]  # drop the newline, then the trailing '|'
    train_tweet_dict[tweet_id] = tweet_text

# Raw type column -> tag; every type not listed here becomes 'O'.
# NOTE(review): 'Organization373937812812615000' looks like a type fused with
# stray digits in the gold file -- confirm against the data.
ENTITY_TAG_BY_TYPE = {
    'Organization373937812812615000': 'ORG',
    'Location': 'LOC',
    'Person': 'PER',
    'Organization': 'ORG',
}

# Parallel lists: train_word[i] is the mention text, train_entity[i] its tag.
train_word = []
train_entity = []
for line in train_gs_data:
    data = line.split()
    if len(data) != 6:
        continue  # only rows with all six columns are used
    tweet_id = data[0]
    start_index = int(data[1])
    end_index = int(data[2])
    train_entity.append(ENTITY_TAG_BY_TYPE.get(data[5], 'O'))

    # Raises KeyError if the gold file mentions a tweet missing from the TSV.
    word = train_tweet_dict[tweet_id][start_index:end_index]
    train_word.append(word)

In [3]:
# Tweet id -> tweet text for the test split. A usable TSV row has the form
# "|<id>|,|<text>|"; rows that do not split into exactly two parts are skipped.
test_tweet_dict = {}
for row in test_tsv_data:
    parts = row.split('|,|')
    if len(parts) != 2:
        continue
    raw_id, raw_text = parts
    # Drop the leading '|' of the id and the trailing '|' of the text.
    test_tweet_dict[raw_id[1:]] = raw_text.strip()[:-1]

# Raw type column -> tag; anything not listed here is tagged 'O'.
tag_by_type = {
    'Organization373937812812615000': 'ORG',
    'Location': 'LOC',
    'Person': 'PER',
    'Organization': 'ORG',
}

# Parallel lists: test_word[i] is the mention text, test_entity[i] its tag.
test_word = []
test_entity = []
for row in test_gs_data:
    columns = row.split()
    if len(columns) != 6:
        continue  # only rows with all six columns are used
    tweet_id = columns[0]
    start_index = int(columns[1])
    end_index = int(columns[2])
    test_entity.append(tag_by_type.get(columns[5], 'O'))
    # Slice the mention out of the tweet text using the gold offsets.
    test_word.append(test_tweet_dict[tweet_id][start_index:end_index])

In [4]:
# Distinct tags seen in either split. Despite its name this holds the set of
# labels itself, not a count.
number_of_labels = set(train_entity).union(test_entity)

In [5]:
# Every distinct character that occurs in any train or test mention.
unique_chars = {
    symbol
    for mention in train_word + test_word
    for symbol in mention
}

In [6]:
# Character -> integer id, starting at 1 so that 0 stays free for the padding
# value that pad_sequences inserts (the Embedding layer uses mask_zero=True).
# Iterating the set directly would hand out ids in an order that changes
# between kernel restarts (string hashing is randomised per process), so the
# characters are sorted first to make the mapping reproducible.
char_dict = {
    char: index
    for index, char in enumerate(sorted(unique_chars), start=1)
}

In [7]:
def convert_word_2_vec(input_words):
    """Encode each word as a list of integer character ids.

    Ids are looked up in the module-level ``char_dict``.

    Args:
        input_words: iterable of strings.

    Returns:
        A list with one inner list per word, holding the id of every
        character of that word in order.

    Raises:
        KeyError: if a character is not a key of ``char_dict``.
    """
    return [[char_dict[symbol] for symbol in word] for word in input_words]

In [8]:
# Encode the mentions of both splits as lists of character ids.
train_word_vectors, test_word_vectors = (
    convert_word_2_vec(mentions) for mentions in (train_word, test_word)
)

In [9]:
# Length of the longest mention across both splits (0 if there are none);
# used below as the common length when padding the sequences.
max_word_length = max(
    (len(mention) for mention in train_word + test_word),
    default=0,
)

In [10]:
# Fit a single encoder on the training tags and reuse it for the test tags so
# that a tag always maps to the same integer and the same one-hot column in
# both splits. Fitting a second encoder on the test tags would renumber the
# classes whenever the test split lacks (or adds) a tag.
label_encoder = preprocessing.LabelEncoder().fit(train_entity)
label_class_count = len(label_encoder.classes_)

# num_classes pins the one-hot width to the training classes even if the
# highest-numbered class does not occur in a split. transform() raises
# ValueError for a test tag that was never seen during fitting.
train_one_hot = np_utils.to_categorical(
    label_encoder.transform(train_entity), num_classes=label_class_count)
test_one_hot = np_utils.to_categorical(
    label_encoder.transform(test_entity), num_classes=label_class_count)

In [11]:
# Pad every id sequence to the same length so the splits form rectangular
# arrays for the network.
train_words_padded, test_words_padded = (
    sequence.pad_sequences(vectors, maxlen=max_word_length)
    for vectors in (train_word_vectors, test_word_vectors)
)

# One-hot targets as NumPy arrays.
train_entities, test_entities = (
    np.array(one_hot) for one_hot in (train_one_hot, test_one_hot)
)

In [12]:
batch_size = 32
epochs = 5

# Size the network from the data instead of hard-coding it:
# - character ids run from 1 to len(char_dict) and 0 is the padding id, so the
#   embedding table needs len(char_dict) + 1 rows; a fixed size would be
#   indexed out of range once the corpus has that many distinct characters;
# - the output layer needs one unit per one-hot column of the targets.
vocabulary_size = len(char_dict) + 1
output_classes = train_entities.shape[1]

# Character-level classifier: embedding -> simple RNN -> dropout -> softmax.
model = Sequential()
model.add(Embedding(vocabulary_size, 128, mask_zero=True, input_length=max_word_length))
model.add(SimpleRNN(64))
model.add(Dropout(0.5))
model.add(Dense(output_classes, activation='softmax'))

model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

# validation_data is passed as the (inputs, targets) tuple Keras documents.
model.fit(train_words_padded, train_entities,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(test_words_padded, test_entities))

Train on 8665 samples, validate on 1022 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a24569cf8>

In [13]:
# Display the distinct tags present in the training labels.
set(train_entity)



{'LOC', 'O', 'ORG', 'PER'}

In [14]:
test_prediction = model.predict(test_words_padded)

# LabelEncoder numbers classes in sorted order, so the sorted training tags
# give the tag for each output column. Deriving the list from the data keeps
# the decoding in step with the encoding and guarantees exactly one decoded
# tag per prediction (a hand-written index chain silently drops any index it
# does not list, which would misalign test_pred with test_entity).
class_labels = sorted(set(train_entity))

# Most probable class for every row of the prediction matrix.
test_pred = [class_labels[index] for index in np.argmax(test_prediction, axis=1)]
   

In [15]:
from sklearn.metrics import precision_recall_fscore_support

# Returns four arrays -- precision, recall, F-score and support -- each with
# one entry per label, the labels being taken in sorted order.
precision_recall_fscore_support(y_true=test_entity, y_pred=test_pred)

(array([0.21590909, 0.61301989, 0.33088235, 0.62040816]),
 array([0.44186047, 0.70041322, 0.28481013, 0.45103858]),
 array([0.29007634, 0.65380906, 0.30612245, 0.52233677]),
 array([ 43, 484, 158, 337]))