In [1]:
import dill
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

# Load Vectors
from gensim.models import KeyedVectors

# Utility
import numpy as np
import time

# Model Utility
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd

# Keras Model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.utils import to_categorical
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
# from keras_contrib.layers import CRF
from keras.callbacks import ModelCheckpoint

In [2]:
from pythainlp import word_vector

In [3]:
# Load the annotated corpus from disk.
# NOTE(review): dill, like pickle, can execute arbitrary code while
# deserialising -- only load 'datatrain.data' from a trusted source.
with open('datatrain.data', 'rb') as file:
    datatofile = dill.load(file)

# Reduce every token record to a (field 0, field 2) pair; the printed sample
# below shows these are the token text and its NER tag.
tagged_sents = [
    [(record[0], record[2]) for record in sentence]
    for sentence in datatofile
]

# Fixed random_state keeps the 80/20 split identical across runs.
train_sents, test_sents = train_test_split(
    tagged_sents,
    test_size=0.2,
    random_state=112,
)
print(len(train_sents))
print(len(test_sents))
print(train_sents[1])

4918
1230
[('ขณะเดียวกัน', 'O'), (' ', 'O'), ('แหล่งข่าว', 'O'), ('จาก', 'O'), ('องค์การอนามัยโลก', 'B-ORGANIZATION'), ('เปิดเผย', 'O'), ('ว่า', 'O'), (' ', 'O'), ('เมื่อ', 'O'), ('สัปดาห์', 'O'), ('ก่อน', 'O'), (' ', 'O'), ('องค์การอนามัยโลก', 'B-ORGANIZATION'), ('ขอ', 'O'), ('ประวัติการ', 'O'), ('ป่วย', 'O'), ('ของ', 'O'), ('เด็กหญิง', 'O'), ('ชาวฮ่องกง', 'O'), ('อายุ', 'O'), (' ', 'O'), ('8', 'O'), (' ', 'O'), ('ขวบ', 'O'), ('ที่', 'O'), ('เสียชีวิต', 'O'), ('ที่', 'O'), ('เมือง', 'B-LOCATION'), ('ผิง', 'I-LOCATION'), ('ถัง', 'I-LOCATION'), (' ', 'O'), ('มณฑล', 'B-LOCATION'), ('ฟู', 'I-LOCATION'), ('เจี้ยน', 'I-LOCATION'), (' ', 'O'), ('เมื่อ', 'O'), ('เดือน', 'O'), ('ก.พ.', 'B-DATE'), ('ปี', 'I-DATE'), (' ', 'I-DATE'), ('2546', 'I-DATE'), (' ', 'O'), ('โดย', 'O'), ('เด็กหญิง', 'O'), ('เสียชีวิต', 'O'), ('ขณะ', 'O'), ('มา', 'O'), ('เยี่ยม', 'O'), ('ญาติ', 'O'), (' ', 'O'), ('และ', 'O'), ('อีก', 'O'), (' ', 'O'), ('12', 'B-TIME'), (' ', 'I-TIME'), ('วัน', 'I-TIME'), ('ต่อมา', 'O'), (

In [4]:
# Word-vector model obtained from pythainlp's `word_vector` module. Later
# cells iterate its `.index2word` list and index it by word to fetch vectors.
thai2fit_model = word_vector.get_model()

In [5]:
# Flatten the training sentences into one list of tokens and one of NER tags.
word_list = [pair[0] for sentence in train_sents for pair in sentence]
ner_list = [pair[1] for sentence in train_sents for pair in sentence]

# word -> embedding vector for every entry of the thai2fit vocabulary,
# inserted in the model's own `index2word` order.
thai2dict = {
    vocab_word: thai2fit_model[vocab_word]
    for vocab_word in thai2fit_model.index2word
}

# Special tokens: padding for both vocabularies, plus an unknown-word ("UNK")
# token for the word vocabulary.
word_list += ["pad", "unknown"]
ner_list.append("pad")

all_words = sorted(set(word_list))
all_ner = sorted(set(ner_list))
all_thai2dict = sorted(set(thai2dict))

# Forward lookups (symbol -> index).
# NOTE: thai2dict_to_ix enumerates `thai2dict` itself (insertion order, i.e.
# the model's index2word order), not the sorted `all_thai2dict` list.
word_to_ix = {symbol: idx for idx, symbol in enumerate(all_words)}
ner_to_ix = {symbol: idx for idx, symbol in enumerate(all_ner)}
thai2dict_to_ix = {symbol: idx for idx, symbol in enumerate(thai2dict)}

# Reverse lookups (index -> symbol).
ix_to_word = {idx: symbol for symbol, idx in word_to_ix.items()}
ix_to_ner = {idx: symbol for symbol, idx in ner_to_ix.items()}
ix_to_thai2dict = {idx: symbol for symbol, idx in thai2dict_to_ix.items()}

n_word = len(word_to_ix)
n_tag = len(ner_to_ix)
n_thai2dict = len(thai2dict_to_ix)

# One value per line: vocabulary sizes, then the tag mapping.
print(n_word, n_tag, n_thai2dict, ner_to_ix, sep="\n")

13084
27
51358
{'B-DATE': 0, 'B-EMAIL': 1, 'B-LAW': 2, 'B-LEN': 3, 'B-LOCATION': 4, 'B-MONEY': 5, 'B-ORGANIZATION': 6, 'B-PERCENT': 7, 'B-PERSON': 8, 'B-PHONE': 9, 'B-TIME': 10, 'B-URL': 11, 'B-ZIP': 12, 'I-DATE': 13, 'I-EMAIL': 14, 'I-LAW': 15, 'I-LEN': 16, 'I-LOCATION': 17, 'I-MONEY': 18, 'I-ORGANIZATION': 19, 'I-PERCENT': 20, 'I-PERSON': 21, 'I-PHONE': 22, 'I-TIME': 23, 'I-URL': 24, 'O': 25, 'pad': 26}


In [6]:
# Show the NER tag -> index mapping (tags sorted alphabetically, with the
# "pad" tag included).
print(ner_to_ix)

{'B-DATE': 0, 'B-EMAIL': 1, 'B-LAW': 2, 'B-LEN': 3, 'B-LOCATION': 4, 'B-MONEY': 5, 'B-ORGANIZATION': 6, 'B-PERCENT': 7, 'B-PERSON': 8, 'B-PHONE': 9, 'B-TIME': 10, 'B-URL': 11, 'B-ZIP': 12, 'I-DATE': 13, 'I-EMAIL': 14, 'I-LAW': 15, 'I-LEN': 16, 'I-LOCATION': 17, 'I-MONEY': 18, 'I-ORGANIZATION': 19, 'I-PERCENT': 20, 'I-PERSON': 21, 'I-PHONE': 22, 'I-TIME': 23, 'I-URL': 24, 'O': 25, 'pad': 26}


In [7]:
# Character vocabulary: every distinct character occurring in any thai2fit
# vocabulary entry.
chars = set([w_i for w in thai2dict for w_i in w])

# Reserved tokens. The position in this list IS the index, so the fixed
# assignments are: "pad" = 0, "unknown" = 1, " " = 2, punctuation = 3..25,
# brackets = 26..33.
special_char_tokens = [
    "pad", "unknown", " ",
    "$", "#", "!", "%", "&", "*", "+", ",", "-", ".", "/", ":", ";", "?", "@",
    "^", "_", "`", "=", "|", "~", "'", '"',
    "(", ")", "{", "}", "<", ">", "[", "]",
]
char2idx = {token: idx for idx, token in enumerate(special_char_tokens)}

# Every remaining character gets the next free index, starting right after
# the reserved range. Two problems with offsetting a set enumeration by a
# constant are avoided here:
#   * the dynamic indices can never overlap the reserved ones, so no two
#     characters share an index and the largest index is always
#     n_chars - 1 (a valid row for an embedding with n_chars rows);
#   * iterating in sorted order makes the mapping identical on every run,
#     whereas the iteration order of a set of strings changes between
#     interpreter sessions (string hash randomisation).
remaining_chars = sorted(chars - set(special_char_tokens))
char2idx.update(
    {ch: idx for idx, ch in enumerate(remaining_chars, start=len(char2idx))}
)

n_chars = len(char2idx)
assert sorted(char2idx.values()) == list(range(n_chars)), \
    "character indices must be unique and contiguous"
print(n_chars)

404


In [8]:
# Sequence sizes.
# max_len: every sentence is padded/truncated to this many tokens
#          (passed as `maxlen` to pad_sequences below).
# max_len_char: number of character slots kept per token when building the
#          character-level input.
max_len = 250
max_len_char = 30

# Model hyper-parameters.
# NOTE(review): not referenced in the cells visible here -- presumably
# consumed by the model-building cells further down; confirm there.
character_LSTM_unit = 32
char_embedding_dim = 32
main_lstm_unit = 256 ## Bidirectional 256 + 256 = 512
lstm_recurrent_dropout = 0.5

# Training-loop settings (same caveat: usage is outside the visible cells).
train_batch_size = 32
train_epochs = 50

In [9]:
def prepare_sequence_word(input_text):
    """Convert a sequence of tokens into thai2fit vocabulary indices.

    Args:
        input_text: iterable of token strings (one sentence).

    Returns:
        list of int: ``thai2dict_to_ix[token]`` for every token present in
        ``thai2dict``; tokens that are not present are mapped to the index
        of the special ``"unknown"`` entry instead.
    """
    return [
        thai2dict_to_ix[token] if token in thai2dict else thai2dict_to_ix["unknown"]
        for token in input_text
    ]

def prepare_sequence_target(input_label):
    """Convert a sequence of NER tag strings into their integer indices.

    Args:
        input_label: iterable of tag strings (one sentence).

    Returns:
        list of int: ``ner_to_ix[tag]`` for every tag, in order.

    Raises:
        KeyError: if a tag is not a key of ``ner_to_ix``.
    """
    tag_indices = []
    for tag in input_label:
        tag_indices.append(ner_to_ix[tag])
    return tag_indices

In [10]:
# Split every (token, tag) pair of the training sentences into two parallel
# nested lists: tokens only and NER tags only.
input_sent = [[token for token, _ in sent] for sent in train_sents]
train_targets = [[tag for _, tag in sent] for sent in train_sents]

# Same separation for the held-out test sentences.
input_test_sent = [[token for token, _ in sent] for sent in test_sents]
test_targets = [[tag for _, tag in sent] for sent in test_sents]

In [11]:
# Sanity-check the label sequences. Only the first few sentences are shown:
# printing the complete nested list (one entry per training sentence) floods
# the notebook output without adding information.
print(train_targets[:3])

[['B-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'O'

In [12]:
# Tokens of the first training sentence (tags stripped).
print(input_sent[0])

['คีร์กิซ', 'สถาน', 'ใช้', 'มาตรการ', 'จับตาย', 'เพื่อ', 'หยุด', 'จลาจล']


In [13]:
# The same first training sentence as (token, tag) pairs, for comparison.
print(train_sents[0])

[('คีร์กิซ', 'B-LOCATION'), ('สถาน', 'I-LOCATION'), ('ใช้', 'O'), ('มาตรการ', 'O'), ('จับตาย', 'O'), ('เพื่อ', 'O'), ('หยุด', 'O'), ('จลาจล', 'O')]


In [15]:
## Word-level Training Input
# Map every token to its thai2fit index, then pad/truncate each sentence at
# the end so that all rows hold exactly `max_len` indices.
X_word_tr = [prepare_sequence_word(s) for s in input_sent]
X_word_tr = pad_sequences(
    maxlen=max_len,
    sequences=X_word_tr,
    value=thai2dict_to_ix["pad"],
    padding='post',
    truncating='post',
)

## Character Training
# For every sentence build a (max_len, max_len_char) grid of character
# indices: characters missing from char2idx become "unknown"; slots past the
# end of a token, and whole rows past the end of the sentence, become "pad".
# Bounds are handled explicitly rather than by catching exceptions, so that
# genuine errors (and KeyboardInterrupt) are not silently turned into padding.
pad_char_ix = char2idx["pad"]
unknown_char_ix = char2idx["unknown"]

X_char_tr = []
for sentence in train_sents:
    sent_seq = []
    for i in range(max_len):
        # Token text at position i, or "" when the sentence is shorter.
        token_text = sentence[i][0] if i < len(sentence) else ""
        word_seq = [
            char2idx.get(ch, unknown_char_ix)
            for ch in token_text[:max_len_char]
        ]
        # Right-pad the token up to the fixed character width.
        word_seq.extend([pad_char_ix] * (max_len_char - len(word_seq)))
        sent_seq.append(word_seq)
    X_char_tr.append(np.array(sent_seq))

## Sequence Label Training
y_tr = [prepare_sequence_target(s) for s in train_targets]
# The label lists are ragged (sentences differ in length). Recent NumPy
# releases raise ValueError when np.array() is given ragged nested sequences
# without dtype=object, and pad_sequences accepts the plain list anyway, so
# no intermediate array is built -- only the sentence count is reported.
print((len(y_tr),))
y_tr = pad_sequences(
    maxlen=max_len,
    sequences=y_tr,
    value=ner_to_ix["pad"],
    padding='post',
    truncating='post',
)
print(y_tr.shape)
# One-hot encode every tag index: one (max_len, n_tag) matrix per sentence.
y_tr = [to_categorical(i, num_classes=n_tag) for i in y_tr]

(4918,)
(4918, 250)


In [16]:
# Stack the per-sentence one-hot matrices into a single array.
y_tr = np.array(y_tr)
# Length of one one-hot label vector (to_categorical was called with
# num_classes=n_tag).
print(len(y_tr[0][0]))
# Overall shape: (sentences, tokens per sentence, tag classes).
print(y_tr.shape)

27
(4918, 250, 27)


In [None]:
# Print the one-hot label vector of every token slot in the first training
# sentence, one row per slot (padding slots included).
for i in y_tr[0]:
    print(i)

In [None]:
# Shape of the padded word-index matrix produced by pad_sequences.
print(X_word_tr.shape)

In [None]:
# Stack the per-sentence character grids into a single array and report its
# shape.
X_char_tr = np.array(X_char_tr)
print(X_char_tr.shape)
# print(X_char_tr[0])
# Print the character-index row of every token slot in the first sentence.
for i in X_char_tr[0]:
    print(i)

In [None]:
# Character input reshaped explicitly to
# (number of sentences, max_len, max_len_char).
x = np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))

In [None]:
# Confirm the shape of the reshaped character input.
print(x.shape)

In [None]:
# One-hot label vector of the first token of the first training sentence,
# followed by its length.
print(y_tr[0][0])
print(len(y_tr[0][0]))