In [1]:
import nltk

nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_pereptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading averaged_pereptron_tagger: Package
[nltk_data]     'averaged_pereptron_tagger' not found in index
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [55]:
from nltk import word_tokenize, pos_tag, ne_chunk

sentene = 'james is working at Disney in London'
sentene = pos_tag(word_tokenize(sentene))
print(sentene)

[('james', 'NNS'), ('is', 'VBZ'), ('working', 'VBG'), ('at', 'IN'), ('Disney', 'NNP'), ('in', 'IN'), ('London', 'NNP')]


In [56]:
sentene = ne_chunk(sentene)

print(sentene)

(S
  james/NNS
  is/VBZ
  working/VBG
  at/IN
  (ORGANIZATION Disney/NNP)
  in/IN
  (GPE London/NNP))


In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import urllib.request

In [58]:
tagged_sentences = []
sentence = []

with urllib.request.urlopen('https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt') as f:
    for line in f:
        line = line.decode('utf-8')
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == '\n':
            if len(sentence) > 0:
                tagged_sentences.append(sentence)
                sentence = []
            continue
        splits = line.strip().split(' ')
        word = splits[0].lower()
        sentence.append([word, splits[-1]])

print(len(tagged_sentences))
print(tagged_sentences[0])

                

14041
[['eu', 'B-ORG'], ['rejects', 'O'], ['german', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['british', 'B-MISC'], ['lamb', 'O'], ['.', 'O']]


In [59]:
sentences, ner_tags = [], []

for tagged_sentence in tagged_sentences:
    sentence, tag_info = zip(*tagged_sentence)
    sentences.append(list(sentence))
    ner_tags.append(list(tag_info))

In [60]:
max_words = 4000
src_tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
src_tokenizer.fit_on_texts(sentences)

tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)

In [61]:
vocab_size = max_words
tag_size = len(tar_tokenizer.word_index) + 1

print(vocab_size)
print(tag_size)

4000
10


In [76]:
X_train = src_tokenizer.texts_to_sequences(sentences)
Y_train = tar_tokenizer.texts_to_sequences(ner_tags)

In [79]:
max_len = 70
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
Y_train = pad_sequences(Y_train, padding='post', maxlen=max_len)

In [80]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=.2, random_state=111)

Y_train = to_categorical(Y_train, num_classes=tag_size)
Y_test = to_categorical(Y_test, num_classes=tag_size)

In [81]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(7188, 70)
(7188, 70, 10)
(1797, 70)
(1797, 70, 10)


In [82]:
from keras.models import Sequential 
from keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam

In [83]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 70, 128)           512000    
                                                                 
 bidirectional_8 (Bidirectio  (None, 70, 512)          788480    
 nal)                                                            
                                                                 
 time_distributed_6 (TimeDis  (None, 70, 10)           5130      
 tributed)                                                       
                                                                 
Total params: 1,305,610
Trainable params: 1,305,610
Non-trainable params: 0
_________________________________________________________________


In [84]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.fit(X_train, Y_train, batch_size=128, epochs=3, validation_data=(X_test, Y_test))


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17bbdee1dd8>

In [97]:
model.evaluate(X_test, Y_test)



[0.07480484992265701, 0.8884502649307251]

In [98]:
idx2word = src_tokenizer.index_word
idx2ner = tar_tokenizer.index_word
idx2ner[0] = 'PAD'

In [103]:
i = 70
Y_predicted = model.predict(np.array([X_test[i]]))
Y_predicted = np.argmax(Y_predicted, axis=-1)
true = np.argmax(Y_test[i], -1)

print('{:15}|{:5}|{}'.format('단어','실제값', '예측값'))
print('-' * 34)

for w, t, pred in zip(X_test[i], true, Y_predicted[0]):
    if w != 0:
        print('{:17}: {:7} {}'.format(idx2word[w], idx2ner[t].upper(), idx2ner[pred].upper()))


단어             |실제값  |예측값
----------------------------------
ballanger        : B-PER   B-ORG
beat             : O       O
germany          : B-LOC   B-LOC
's               : O       O
OOV              : B-PER   O
OOV              : I-PER   O
2-0              : O       O
in               : O       O
the              : O       O
OOV              : O       O
matches          : O       O
final            : O       O
to               : O       O
add              : O       O
the              : O       O
world            : O       O
title            : O       O
to               : O       O
the              : O       O
olympic          : B-MISC  B-MISC
gold             : O       O
medal            : O       O
she              : O       O
won              : O       O
in               : O       O
july             : O       O
.                : O       O
