In [None]:
! pip install tensorflow_hub

In [None]:
import re
import os
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Korean embedding model provided by Google through TensorFlow Hub.
# According to the original note it was trained on Korean Google News (15B corpus);
# the model name indicates 50-dimensional vectors.
embed = hub.load("https://tfhub.dev/google/tf2-preview/nnlm-ko-dim50/1")

In [None]:
# Train data list (hidden files, i.e. names starting with '.', are skipped).
# os.listdir() returns entries in arbitrary order, so the names are sorted:
# later cells index sentences by position and use a seeded train/test split,
# both of which depend on the order in which the files are read.
path = 'data/train'
filelist = sorted(x for x in os.listdir(path) if not x.startswith('.'))
filelist

In [None]:
# Parse every training file into sentences, each a list of [token, tag] pairs.
# File format as read here: one "token<TAB>tag" pair per line, with a short
# line (at most 2 characters, e.g. a bare newline) closing the current sentence.
tagged_sentences = []

for file in filelist:
    print(file)
    # Start every file with a fresh sentence so tokens never leak across files.
    sentence = []

    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used — confirm it matches the encoding of the data files.
    with open(os.path.join(path, file), 'r') as f:
        for line in f:
            if len(line) > 2:
                splits = line.split('\t')
                sentence.append([splits[0], splits[1].replace('\n', '')])
            elif sentence:
                # Only close non-empty sentences: consecutive separator lines
                # must not produce empty entries.
                tagged_sentences.append(sentence)
                sentence = []

    # Flush the last sentence when the file does not end with a separator line.
    if sentence:
        tagged_sentences.append(sentence)

In [None]:
# Number of parsed sentences. np.shape() would first convert the nested list to
# an array; the sentences have different lengths, and NumPy >= 1.24 raises a
# ValueError for such ragged input, so use len() instead.
len(tagged_sentences)

In [None]:
# Inspect one parsed sentence (a list of [token, tag] pairs).
tagged_sentences[10]

In [None]:
# Show the first parsed sentence.
print(tagged_sentences[0])

In [None]:
# Split every tagged sentence into two parallel lists: its tokens and its tags.
sentences, ner_tags = [], []
for tagged_sentence in tagged_sentences:
    if not tagged_sentence:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise
        # ValueError for an empty sentence; skip it instead.
        continue
    sentence, tag_info = zip(*tagged_sentence)
    sentences.append(list(sentence))
    ner_tags.append(list(tag_info))

In [None]:
# Tokens and tags of the first sentence, printed one list per line for comparison.
print(sentences[0])
print(ner_tags[0])

In [None]:
# Tokens and tags of another sentence (index 12), printed one list per line.
print(sentences[12])
print(ner_tags[12])

In [None]:
# Sentence-length statistics (max, mean) followed by a histogram of the lengths.
sample_lengths = [len(s) for s in sentences]
print('샘플의 최대 길이 : %d' % max(sample_lengths))
print('샘플의 평균 길이 : %f' % (sum(sample_lengths)/len(sample_lengths)))
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Tokenizer for the input tokens, built with default settings and fitted on
# the token lists to create the word -> integer index vocabulary.
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(sentences)

In [None]:
# Tokenizer for the NER tags. lower=False is passed so tag strings keep their
# case; later cells compare decoded tags against the literal 'O' and inspect
# the first character ('B' / 'I').
tar_tokenizer = Tokenizer(lower=False)
tar_tokenizer.fit_on_texts(ner_tags)

In [None]:
# Show the tag -> integer index mapping learned from the training tags.
tar_tokenizer.word_index

In [None]:
# Vocabulary sizes for words and tags. One extra slot is added to each because
# index 0 is used for padding further down (padded positions are checked with
# `word != 0`) and is therefore not part of either word_index.
vocab_size = len(src_tokenizer.word_index) + 1
tag_size = len(tar_tokenizer.word_index) + 1
print('단어 집합의 크기 : {}'.format(vocab_size))
print('개체명 태깅 정보 집합의 크기 : {}'.format(tag_size))

In [None]:
# Integer-encode the token lists and the tag lists with their tokenizers.
X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)

In [None]:
# Encoded form of the first sentence and of its tags.
print(X_train[0])
print(y_train[0])

In [None]:
# Reverse lookups (integer index -> string) used to decode inputs and predictions.
index_to_word = src_tokenizer.index_word
index_to_ner = tar_tokenizer.index_word

In [None]:
# Round-trip check: decode the first encoded sentence back into words and
# print it next to the original token list.
decoded = [index_to_word[index] for index in X_train[0]]

print('기존 문장 : {}'.format(sentences[0]))
print('복원 문장 : {}'.format(decoded))

In [None]:
# Bring every sequence to a fixed length of max_len, padding at the end ('post').
# NOTE(review): 27 is hard-coded — presumably chosen from the length histogram
# above; confirm it still fits if the training data changes, since longer
# sequences get truncated by pad_sequences.
max_len = 27
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)

In [None]:
# Hold out 30% of the samples as a test set, with a fixed seed for repeatability.
# Note: X_train / y_train are overwritten with their own subset, so re-running
# this cell alone splits the already reduced data again.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.3, random_state=777)

In [None]:
# Label array shapes before one-hot encoding.
y_train.shape, y_test.shape

In [None]:
# One-hot encode the integer tag labels over tag_size classes.
# Note: the arrays are overwritten in place, so this cell must run exactly once
# after the split; running it again would encode the one-hot arrays themselves.
y_train = to_categorical(y_train, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)

In [None]:
# Test label shape after one-hot encoding.
y_test.shape

In [None]:
# Report the shapes of the train/test inputs and labels, one line per array.
shape_report = (
    ('훈련 샘플 문장의 크기 : {}', X_train),
    ('훈련 샘플 레이블의 크기 : {}', y_train),
    ('테스트 샘플 문장의 크기 : {}', X_test),
    ('테스트 샘플 레이블의 크기 : {}', y_test),
)
for template, array in shape_report:
    print(template.format(array.shape))

In [None]:
# Zero-initialised embedding table with one row per vocabulary index and 50
# columns (the same width that is passed as output_dim to the Embedding layer).
embedding_matrix = np.zeros((vocab_size, 50))
np.shape(embedding_matrix)

In [None]:
def get_vector(word):
    """Look up the embedding of a single word with the loaded hub model.

    The word is wrapped in a one-element list, passed to the module-level
    `embed` callable, and that call's result is returned unchanged.
    """
    single_word_batch = [word]
    return embed(single_word_batch)

In [None]:
# Fill the embedding table with the pretrained vector of every vocabulary word.
# The words are embedded in batches rather than with one model call per word,
# which avoids thousands of tiny invocations; the former `is not None` check
# was dropped because the model call always returns a value.
embed_batch_size = 512
vocab_items = list(src_tokenizer.index_word.items())

for start in range(0, len(vocab_items), embed_batch_size):
    chunk = vocab_items[start:start + embed_batch_size]
    row_ids = [idx for idx, _ in chunk]
    words = [w for _, w in chunk]
    # One row per word in the batch, written to the rows of the matching indices.
    embedding_matrix[row_ids] = np.asarray(embed(words))
    print(start + len(chunk))

In [None]:
# Look up which word was assigned index 16.
index_to_word[16]

In [None]:
# Spot-check the embedding returned for one sample token.
get_vector('2만km')

In [None]:
# Shape of the filled embedding table.
embedding_matrix.shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam

In [None]:
# Sequence tagger: pretrained-initialised embeddings -> bidirectional LSTM ->
# per-timestep softmax over the tag classes. mask_zero=True makes the
# embedding layer emit a mask for index 0 (the padding index).
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        weights=[embedding_matrix],
        output_dim=50,
        input_length=max_len,
        mask_zero=True,
    ),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(tag_size, activation='softmax')),
])

In [None]:
# Categorical cross-entropy matches the one-hot labels; Adam with learning rate 0.001.
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])

In [None]:
%%time
# Train for 50 epochs. The held-out test split doubles as validation data here,
# so the validation metrics and the later evaluate() call use the same samples.
model.fit(X_train, y_train, batch_size=128, epochs=50,  validation_data=(X_test, y_test))

In [None]:
# Loss and accuracy on the held-out split.
model.evaluate(X_test, y_test)

In [None]:
# Compare true and predicted tags, token by token, for one test sample.
i = 33  # index of the test sample to inspect
y_predicted = model.predict(np.array([X_test[i]]))  # prediction for that single sample
y_predicted = np.argmax(y_predicted, axis=-1)  # class scores -> integer class per token
true = np.argmax(y_test[i], -1)  # one-hot labels -> integer labels

print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for word, real, pred in zip(X_test[i], true, y_predicted[0]):
    if word != 0:  # skip PAD positions
        # Class 0 is a possible output of the softmax but has no entry in
        # index_to_ner (tag indices start at 1), so fall back to a 'PAD' label
        # instead of raising KeyError when the model predicts it.
        print("{:17}: {:7} {}".format(index_to_word[word],
                                      index_to_ner.get(real, 'PAD').upper(),
                                      index_to_ner.get(pred, 'PAD').upper()))

In [None]:
# Collect the [word, tag] pairs of the inspected sample that were predicted as
# part of an entity (any tag other than 'O').
results = []
for word, pred in zip(X_test[i], y_predicted[0]):
    # Class 0 has no entry in index_to_ner; treat it as "no entity" rather
    # than raising KeyError.
    tag = index_to_ner.get(pred, 'O')
    if word != 0 and tag != 'O':
        results.append([index_to_word[word], tag])
results

In [None]:
# Group the predicted entities of one test sentence by entity type.
# Tags are read as "<B|I>-<TYPE>": a B- tag starts a new entity of that type,
# an I- tag continues the current one. Entities of the same type are joined
# with ', ' and the words inside one entity with a single space.
result = {'PART':"",
          'OPT':"",
          'COND':"",
          'JOB':""}

idx = 31

pred = model.predict(np.array([X_test[idx]]))
pred = np.argmax(pred, axis=-1)
test = [index_to_word[x] for x in X_test[idx] if x != 0]

for x, y in zip(X_test[idx], pred[0]):
    # Class 0 has no entry in index_to_ner; treat it like 'O' (no entity).
    tag = index_to_ner.get(y, 'O')
    if x == 0 or tag == 'O':
        continue

    word, kind = index_to_word[x], tag[2:]
    # .get() keeps an entity type outside the four preset keys from raising KeyError.
    collected = result.get(kind, "")

    if tag[0] == 'B':
        # Separate from whatever was collected before for this type, no matter
        # which tag preceded it (previously only an I- tag of the same type
        # produced a separator, so consecutive entities were glued together).
        separator = ', ' if collected else ''
    elif tag[0] == 'I':
        # No leading space when an I- tag is the first token of its type.
        separator = ' ' if collected else ''
    else:
        continue

    result[kind] = collected + separator + word

print("원문\t: {}".format(' '.join(test)))
print("--"*50)
print("소모품\t: {}\n스펙\t: {}\n점검주기\t: {}\n작업구분\t: {}".format(result['PART'], result['OPT'], result['COND'], result['JOB']))