# Named Entity Recognition using Bi-LSTM or 1D CNN
- https://wikidocs.net/95838

In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import tensorflow as tf
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import string
import os
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

tqdm.pandas()

In [2]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")

data = data.fillna(method="ffill")
data["Word"] = data["Word"].str.lower()

In [3]:
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,london,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [4]:
idx_order = data["Sentence #"].drop_duplicates().tolist()

corpus = data.groupby(["Sentence #"])["Word"].progress_apply(list).loc[idx_order].tolist()
tags = data.groupby(["Sentence #"])["Tag"].progress_apply(list).loc[idx_order].tolist()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=47959.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=47959.0), HTML(value='')))




In [6]:
tkn = tf.keras.preprocessing.text.Tokenizer(oov_token="UNK", lower=True)
tkn.fit_on_texts(corpus)
word2idx = tkn.word_index
idx2word = tkn.index_word
word2cnts = dict(sorted(tkn.word_counts.items(), key=lambda x:x[1], reverse=True))

cnts = list(word2cnts.values())
ratio = 0.99
for vocab_size, value in enumerate(np.cumsum(cnts)/np.sum(cnts)):
    if value >= ratio:
        break

print(f"{vocab_size:,}개의 단어로 전체 data의 {ratio:.0%}를 표현할 수 있습니다.")
print(f"{len(word2idx):,}개의 단어 중 {vocab_size/len(word2idx):.1%}에 해당합니다.")

21,331개의 단어로 전체 data의 99%를 표현할 수 있습니다.
31,818개의 단어 중 67.0%에 해당합니다.


In [247]:
tkn_word = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="UNK", lower=True)
tkn_word.fit_on_texts(corpus)

X = tkn_word.texts_to_sequences(corpus)

tkn_tag = tf.keras.preprocessing.text.Tokenizer(lower=False)
tkn_tag.fit_on_texts(tags)
tag2idx = tkn_tag.word_index
idx2tag = tkn_tag.index_word

y = tkn_tag.texts_to_sequences(tags)

In [248]:
lens = sorted([len(doc) for doc in X])
for idx, max_len in enumerate(lens):
    if idx/len(lens) >= ratio:
        break
print(f"가장 긴 문장의 길이는 {np.max(lens)}입니다.")
print(f"길이가 {max_len} 이하인 문장이 전체의 {ratio:.0%}를 차지합니다.")

가장 긴 문장의 길이는 104입니다.
길이가 43 이하인 문장이 전체의 99%를 차지합니다.


In [249]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=max_len, padding="post")
y = tf.keras.preprocessing.sequence.pad_sequences(y, maxlen=max_len, padding="post")

In [48]:
X_train, X_test, y_train, y_test, X_char_train, X_char_test = train_test_split(X, y, X_char, test_size=0.2, random_state=777)

## 1. Using Bi-LSTM

In [63]:
model_path = "ner_dataset_BiLSTM.h5"
if os.path.exists(model_path):
    model = tf.keras.models.load_model(model_path)
else:
    model = tf.keras.Sequential()
    emb_dim = 128
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size+2, output_dim=emb_dim, mask_zero=True))
    h_size = 256
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=h_size, return_sequences=True)))
    model.add(tf.keras.layers.Dense(units=len(tag2idx)+1, activation="softmax"))

    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["sparse_categorical_accuracy"])

    es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="auto", verbose=1, patience=3)
    mc = tf.keras.callbacks.ModelCheckpoint(filepath=model_path, monitor="val_sparse_categorical_accuracy", mode="auto", verbose=1, save_best_only=True)
    hist = model.fit(X_train, y_train, batch_size=128, epochs=16,  validation_split=0.1, callbacks=[es, mc])

In [104]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 128)         2730624   
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 512)         788480    
_________________________________________________________________
dense_4 (Dense)              (None, None, 18)          9234      
Total params: 3,528,338
Trainable params: 3,528,338
Non-trainable params: 0
_________________________________________________________________


In [203]:
i = 4
y_pred = model.predict(X_test[i])
y_pred = np.argmax(y_pred, axis=2).reshape((43))

print(f"        단어        | 실제값 | 예측값")
for word, true, pred in zip(X_test[i], y_test[i], y_pred):
    if word != 0: # PAD값은 제외함.
        print(f"{idx2word[word]:20s}| {idx2tag[true]:7s}| {idx2tag[pred]}")

        단어        | 실제값 | 예측값


KeyError: 0

In [141]:
# epochs = range(1, len(hist.history["val_loss"]) + 1)
# plt.plot(epochs, hist.history["loss"])
# plt.plot(epochs, hist.history["val_loss"])
# plt.title("model loss")
# plt.ylabel("loss")
# plt.xlabel("epoch")
# plt.legend(["train", "val"], loc="upper left")

In [148]:
idx2tag

{1: 'O',
 2: 'B-geo',
 3: 'B-tim',
 4: 'B-org',
 5: 'I-per',
 6: 'B-per',
 7: 'I-org',
 8: 'B-gpe',
 9: 'I-geo',
 10: 'I-tim',
 11: 'B-art',
 12: 'B-eve',
 13: 'I-art',
 14: 'I-eve',
 15: 'B-nat',
 16: 'I-gpe',
 17: 'I-nat'}

In [198]:
def seqs_to_tag(seqs): # 예측값을 idx2tag를 사용하여 태깅 정보로 변경하는 함수.
    result = []
    for seq in seqs:
        temp = []
        for idx in seq:
            if idx != 0:
                temp.append(idx2tag[idx])
#             else:
#                 temp.append(0)
        result.append(temp)
    return result

In [207]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=2)

y_true = y_test

In [212]:
i = 1000
print(y_pred[i])
print(y_true[i])

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1]


In [197]:
y_true[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 2, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [201]:
print(seqs_to_tag(y_true))

[['O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O'], ['B-org', 'I-org', 'I-org', 'B-per', 'I-per', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O', 'O', 'B-org', 'I-org', 'O', 'B-geo', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '

In [202]:
print(seqs_to_tag(y_pred))

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [194]:
tkn.sequences_to_texts([y_pred[1]])

['O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-per B-geo O O O O O O O O O O O O O']

In [195]:
tkn.sequences_to_texts([y_true[1]])

['O O O O O O B-geo I-geo O O O O O O O O O O O O O']

In [183]:
seqs_to_tag(y_true[0])

TypeError: 'numpy.int32' object is not iterable

In [184]:
y_true[0][0]

0

In [151]:
y_pred = model.predict([X_test])
pred_tags = sequences_to_tag(y_pred)
test_tags = sequences_to_tag(y_test)

KeyError: 0

In [166]:
seqeval.metrics

AttributeError: module 'seqeval' has no attribute 'metrics'

In [167]:
print(seqeval.metrics.classification_report(y_pred, y_test))

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U21'), dtype('<U21')) -> dtype('<U21')

In [None]:
print("F1-score: {:.1%}".format(f1_score(test_tags, pred_tags)))

F1-score: 75.5%


## 2. using BiLSTM-CNN

### char 정보를 사용하기 위한 추가 전처리
- 개체명 인식기의 성능을 올리기 위한 여러 시도들이 있었습니다. 그 중 하나는 워드 임베딩 외에 글자 단위의 임베딩을 사용하여 모델의 성능을 높이는 방법입니다. 이번 챕터에서는 워드 임베딩과 함께 입력으로 사용될 수 있는 CNN 기반의 글자 임베딩과 LSTM 기반의 글자 임베딩에 대해서 이해해봅시다.
- 글자 단위 임베딩을 위해서 여기서 최종적으로 하고자 하는 전처리는 글자 단위 정수 인코딩입니다. 글자 단위 정수 인코딩이란, 가령 단어 'book'이 있고, b가 21번 o가 7번, k가 11번이라고 한다면 단어 'book'을 [21 7 7 11]로 인코딩하는 것입니다. 만약 단어 1개가 아니라 단어구나 문장이라면 어떨까요? 'good book'이란 문장이 있고, g가 12번, d가 17번이라고 한다면 이 문장을 글자 단위 정수 인코딩한다면 다음과 같은 결과를 얻을 수 있습니다.

'good book의 정수 인코딩 결과'
[[12 7 7 17]
[21 7 7 11]]

In [27]:
data[data["Word"].str.contains("\xa0")]

Unnamed: 0,Sentence #,Word,POS,Tag
1040691,Sentence: 47592,5 storm,NN,O


In [28]:
chars = set()
for punct in string.punctuation:
    chars.update(punct)
for char in string.ascii_lowercase:
    chars.update(char)
for num in range(9):
    chars.update(str(num))

In [29]:
print(chars)

{'}', '[', '>', '~', ')', 'q', 'g', '.', '&', '"', '/', 'h', 't', 'u', '_', 'e', '3', '-', '8', 'x', '(', '`', 'a', 'c', 'b', ',', '=', '@', '!', 'f', 'r', '5', 'z', '?', '<', '+', 'l', '%', '2', ';', 'o', 'm', '0', 'd', 'n', "'", 's', 'p', '^', '4', '#', '$', ':', '{', '|', '*', 'i', 'v', '1', 'j', 'w', 'y', '7', ']', '\\', 'k', '6'}


In [30]:
char2idx = {}
char2idx["UNK"] = 1
char2idx.update({char:idx+2 for idx, char in enumerate(chars)})
# char_to_index["PAD"] = 0

idx2char = {}
for key, value in char2idx.items():
    idx2char[value] = key

In [32]:
print(idx2char)

{1: 'UNK', 2: '}', 3: '[', 4: '>', 5: '~', 6: ')', 7: 'q', 8: 'g', 9: '.', 10: '&', 11: '"', 12: '/', 13: 'h', 14: 't', 15: 'u', 16: '_', 17: 'e', 18: '3', 19: '-', 20: '8', 21: 'x', 22: '(', 23: '`', 24: 'a', 25: 'c', 26: 'b', 27: ',', 28: '=', 29: '@', 30: '!', 31: 'f', 32: 'r', 33: '5', 34: 'z', 35: '?', 36: '<', 37: '+', 38: 'l', 39: '%', 40: '2', 41: ';', 42: 'o', 43: 'm', 44: '0', 45: 'd', 46: 'n', 47: "'", 48: 's', 49: 'p', 50: '^', 51: '4', 52: '#', 53: '$', 54: ':', 55: '{', 56: '|', 57: '*', 58: 'i', 59: 'v', 60: '1', 61: 'j', 62: 'w', 63: 'y', 64: '7', 65: ']', 66: '\\', 67: 'k', 68: '6'}


In [259]:
max_len_char = 15
X_char = [tf.keras.preprocessing.sequence.pad_sequences([[char2idx[char] if char in chars else 1 for char in word] for word in sent], maxlen=max_len_char, padding="post") for sent in corpus]

In [260]:
X[0]

array([ 254,    6,  967,   16, 1795,  238,  468,    7,  523,    2,  129,
          5,   61,    9,  571,    2,  833,    6,  186,   90,   22,   15,
         56,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [261]:
X_char[0]

array([[14, 13, 42, 15, 48, 24, 46, 45, 48,  0,  0,  0,  0,  0,  0],
       [42, 31,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [45, 17, 43, 42, 46, 48, 14, 32, 24, 14, 42, 32, 48,  0,  0],
       [13, 24, 59, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [43, 24, 32, 25, 13, 17, 45,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 13, 32, 42, 15,  8, 13,  0,  0,  0,  0,  0,  0,  0,  0],
       [38, 42, 46, 45, 42, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 42,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [49, 32, 42, 14, 17, 48, 14,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 13, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [62, 24, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [58, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [58, 32, 24,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [24, 46, 45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [45, 17, 43, 24, 46, 45,  0

- 위 출력 결과에서 각 행은 각 단어를 의미합니다. 가령, thousands는 첫번째 행 [53 41 48 54 52 34 47 37 52 0 0 0 0 0 0]에 해당됩니다. 단어의 최대 길이를 15(max_len_char)로 제한하였으므로, 길이가 15보다 짧은 단어는 뒤에 0으로 패딩됩니다. 53은 t, 41은 h, 48은 o, 54는 u에 각각 해당됩니다. X_data는 뒤에 0으로 패딩되어 길이가 70인 것에 비해, X_char_data는 0번 단어는 무시되어 길이가 70이 아닙니다. 즉, 위 출력 결과에서 행의 개수가 70이 아닌 상태입니다. 이를 위해 문장 길이 방향으로도 패딩을 해줍니다.

In [264]:
X_char = tf.keras.preprocessing.sequence.pad_sequences(X_char, maxlen=max_len, padding="post")

In [265]:
X_char[0]

array([[14, 13, 42, 15, 48, 24, 46, 45, 48,  0,  0,  0,  0,  0,  0],
       [42, 31,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [45, 17, 43, 42, 46, 48, 14, 32, 24, 14, 42, 32, 48,  0,  0],
       [13, 24, 59, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [43, 24, 32, 25, 13, 17, 45,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 13, 32, 42, 15,  8, 13,  0,  0,  0,  0,  0,  0,  0,  0],
       [38, 42, 46, 45, 42, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 42,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [49, 32, 42, 14, 17, 48, 14,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 13, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [62, 24, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [58, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [58, 32, 24,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [24, 46, 45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [45, 17, 43, 24, 46, 45,  0

In [266]:
X_train, X_test, y_train, y_test, X_char_train, X_char_test = train_test_split(X, y, X_char, test_size=0.2, random_state=777)

In [None]:
model_path = "ner_dataset_BiLSTM.h5"
if os.path.exists(model_path):
    model = tf.keras.models.load_model(model_path)
else:
    model = tf.keras.Sequential()
    emb_dim = 128
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size+2, output_dim=emb_dim, mask_zero=True))
    h_size = 256
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=h_size, return_sequences=True)))
    model.add(tf.keras.layers.Dense(units=len(tag2idx)+1, activation="softmax"))

In [283]:
inputs_word = tf.keras.Input(shape=(max_len,), dtype="int32", name="Input_word")
logits_word = tf.keras.layers.Embedding(input_dim=vocab_size+2, output_dim=64)(inputs_word)

inputs_char = tf.keras.Input(shape=(max_len, max_len_char), name="Input_char")
embs_char = tf.keras.layers.Embedding(input_dim=len(tag2idx)+1, output_dim=32, embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-0.5, maxval=0.5), name="Embedding_char")(inputs_char)
z = tf.keras.layers.Dropout(rate=0.5)(embs_char)
z = tf.keras.layers.Conv1D(filters=30, kernel_size=3, padding="same", activation="tanh", strides=1)(z)
z = tf.keras.layers.MaxPool1D(pool_size=max_len_char)(z)
z = tf.keras.layers.Flatten()(z)
logits_char = tf.keras.layers.Dropout(rate=0.5)(z)

z = tf.concat([logits_word, logits_char], axis=3)
z = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=50, return_sequences=True, dropout=0.5, recurrent_dropout=0.25))(z)
outputs = tf.keras.layers.Dense(units=len(tag2idx), activation="softmax")(z)

model = tf.keras.Model(inputs=[inputs_word, inputs_char], outputs=[outputs])

model.summary()

ValueError: Input 0 of layer max_pooling1d_13 is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: [None, 43, 15, 30]

In [282]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["sparse_categorical_accuracy"])

ValueError: Input 0 of layer max_pooling1d_12 is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: [None, 43, 15, 30]

In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('bilstm_cnn.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
history = model.fit([X_train, X_char_train], y_train, batch_size = 128, epochs = 30, validation_split = 0.1, verbose = 1, callbacks=[es, mc])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 34530 samples, validate on 3837 samples
Epoch 1/30
 1024/34530 [..............................] - ETA: 2:10 - loss: 1.9118 - acc: 0.5808

KeyboardInterrupt: ignored