In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from keras.layers import Dense,LSTM,Activation,Dropout,Input
from keras.models import Model

# Every sentence is padded to this many tokens; also the model's time dimension.
MAX_LEN = 63
# Length of one word vector; also the model's per-token feature size.
SIZE_OF_W2V = 200
# Pre-trained word2vec vectors in binary format, used for token lookup.
W2V_PATH = '../data/embeddings/entity_vector.model.bin'
W2V = KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)


Using TensorFlow backend.


In [40]:
def prepare_y(y):
    """One-hot encode a tag string over (B-MONEY, I-MONEY, everything else).

    Returns a fresh 3-element list: ``[1,0,0]`` for "B-MONEY", ``[0,1,0]``
    for "I-MONEY" and ``[0,0,1]`` for any other value.
    """
    money_tags = ("B-MONEY", "I-MONEY")
    # Any tag outside the two MONEY tags falls into the last slot.
    hot_position = money_tags.index(y) if y in money_tags else len(money_tags)
    return [int(position == hot_position) for position in range(3)]

def translate_y(pred):
    """Turn a list of class scores back into its tag string.

    The position of the highest score selects the tag: 0 -> "B-MONEY",
    1 -> "I-MONEY", any other position -> "O". On ties the first maximum
    wins. ``pred`` must support ``.index`` (e.g. a plain list).
    """
    money_tags = ("B-MONEY", "I-MONEY")
    best = pred.index(max(pred))
    return money_tags[best] if best < len(money_tags) else "O"
    
def get_embeddings(token):
    """Look up the word vector for ``token``.

    Returns ``W2V[token]`` when the token is known. A token missing from the
    vocabulary yields a zero vector of length ``SIZE_OF_W2V`` (a plain list).
    """
    try:
        return W2V[token]
    except KeyError:
        # Only the out-of-vocabulary case is mapped to zeros; any other
        # failure (interrupt, misconfigured W2V, ...) should surface rather
        # than silently become an all-zero embedding.
        return [0] * SIZE_OF_W2V

# Finished sentences: embeddings, one-hot tags and raw token text.
x_data, y_data, x_text = [], [], []

# Buffers for the sentence currently being assembled by prepare().
x_tmp, x_text_tmp, y_tmp = [], [], []

def prepare(x, y):
    """Consume one (token, tag) row and update the sentence buffers.

    A non-empty ``x`` is a token of the current sentence: its embedding, its
    text and its one-hot tag are appended to the ``*_tmp`` buffers.

    An empty ``x`` closes the sentence: the buffers are right-padded up to
    ``MAX_LEN`` entries (zero vector, the text "0" and the "O" tag), moved
    into ``x_data`` / ``x_text`` / ``y_data`` and then reset. ``y`` is
    ignored for such a row.

    Note: a sentence holding more than ``MAX_LEN`` tokens is stored as-is,
    neither padded nor truncated.
    """
    global x_tmp
    global y_tmp
    global x_text_tmp
    # Compare by value, not identity: `is` against a string literal depends
    # on interning and raises a SyntaxWarning on Python 3.8+.
    if x != "":
        x_tmp.append(get_embeddings(x))
        x_text_tmp.append(x)
        y_tmp.append(prepare_y(y))
    else:
        # range() of a non-positive count is empty, so nothing is added once
        # the buffer already holds MAX_LEN (or more) tokens.
        for _ in range(MAX_LEN - len(x_tmp)):
            x_tmp.append([0] * SIZE_OF_W2V)
            x_text_tmp.append("0")
            y_tmp.append(prepare_y("O"))

        x_data.append(x_tmp)
        x_text.append(x_text_tmp)
        y_data.append(y_tmp)

        # Start fresh lists; the ones just stored must not be mutated again.
        x_tmp = []
        x_text_tmp = []
        y_tmp = []

# newline='' lets the csv module do its own line-ending handling.
# NOTE(review): no encoding is passed, so the platform default is used -
# confirm it matches the file, which holds Japanese text.
with open('../data/training/ner.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # Sentences are closed by a row whose "x" field is empty. DictReader
    # skips wholly blank lines, so such a row must still contain the comma.
    for row in reader:
        prepare(row["x"], row["y"])

# If the file does not end with a separator row, the last sentence is still
# sitting in the buffer; close it so it is not silently dropped.
if x_tmp:
    prepare("", "O")
        
x_data = np.array(x_data)
y_data = np.array(y_data)
x_text = np.array(x_text)

# Split embeddings, token text and labels in ONE call so the three stay
# aligned sentence-for-sentence by construction, rather than relying on two
# independent calls happening to cut at the same place.
# shuffle=False keeps file order: the first 70% trains, the last 30% tests.
(x_train, x_test,
 x_t_train, x_t_test,
 y_train, y_test) = train_test_split(
    x_data, x_text, y_data, test_size=0.3, shuffle=False)

print(x_train.shape)
print(y_train.shape)
print(x_t_train.shape)

(10, 63, 200)
(10, 63, 3)
(10, 63)


In [44]:
def create_model():
    """Build the (uncompiled) per-token tagging network.

    Input is a ``(MAX_LEN, SIZE_OF_W2V)`` sequence of word vectors. The stack
    is LSTM(128, one output per time step) -> Dense(64) -> ReLU ->
    Dropout(0.15) -> Dense(3) -> softmax, giving three class scores per
    token.

    Returns:
        The Keras ``Model`` wiring the input to the softmax output.
    """
    sequence_in = Input(name='in_layer', shape=(MAX_LEN, SIZE_OF_W2V))

    # return_sequences=True keeps one output per token instead of only the
    # final state, which is what a per-token tagger needs.
    recurrent = LSTM(
        units=128,
        input_shape=(MAX_LEN, SIZE_OF_W2V),
        return_sequences=True,
        recurrent_dropout=0.15,
        activation='sigmoid',
    )
    hidden = recurrent(sequence_in)

    hidden = Dense(64)(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.15)(hidden)

    tag_scores = Dense(3)(hidden)
    tag_probs = Activation('softmax')(tag_scores)

    return Model(inputs=sequence_in, outputs=tag_probs)

# Build the network, print its layer table, then compile and train it.
model = create_model()
model.summary()
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# batch_size=1: one sentence per gradient step.
model.fit(x=x_train, y=y_train, epochs=25, batch_size=1, verbose=1)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
in_layer (InputLayer)        (None, 63, 200)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 63, 128)           168448    
_________________________________________________________________
dense_7 (Dense)              (None, 63, 64)            8256      
_________________________________________________________________
activation_7 (Activation)    (None, 63, 64)            0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 63, 64)            0         
_________________________________________________________________
dense_8 (Dense)              (None, 63, 3)             195       
_________________________________________________________________
activation_8 (Activation)    (None, 63, 3)             0   

<keras.callbacks.callbacks.History at 0x17eb41610>

In [45]:
# evaluate() returns [loss, accuracy] for the compile() settings used here.
# Padded positions carry the "O" label and are not masked out, so they are
# scored like any other token.
accr = model.evaluate(x=x_test, y=y_test)
print(f"Loss: {accr[0]}, Accuracy: {accr[1]}")

Loss: 0.00658702477812767, Accuracy: 0.9968253970146179


In [55]:
# Print one line per token of every test sentence (padding included):
# token text, predicted tag, gold tag.
prediction = model.predict(x_test)
for sentence_tokens, sentence_scores, sentence_gold in zip(x_t_test, prediction, y_test):
    for w_x, w_pred, w_y in zip(sentence_tokens, sentence_scores, sentence_gold):
        # translate_y relies on list.index, so array rows are converted first.
        print(f"{w_x}\t\t{translate_y(w_pred.tolist())}\t\t{translate_y(w_y.tolist())}")
        

平野		O		O
は		O		O
2017		O		O
年		O		O
オフ		O		O
に		O		O
2		O		O
年		O		O
600		B-MONEY		B-MONEY
万		I-MONEY		I-MONEY
ドル		I-MONEY		I-MONEY
（		O		O
約		O		O
6		B-MONEY		B-MONEY
億		I-MONEY		I-MONEY
4000		I-MONEY		I-MONEY
万		I-MONEY		I-MONEY
円		I-MONEY		I-MONEY
）		O		O
で		O		O
オリックス		O		O
から		O		O
ダイヤモンド		O		O
バックス		O		O
へ		O		O
移籍		O		O
。		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
日本ハム		O		O
の		O		O
上原		O		O
健太		O		O
投手		O		O
が		O		O
29		O		O
日		O		O
、		O		O
札幌		O		O
の		O		O
球団		O		O
事務所		O		O
で		O		O
契約		O		O
更改		O		O
し		O		O
、		O		O
現状		O		O
維持		O		O
の		O		O
1700		B-MONEY		B-MONEY
万		I-MONEY		I-MONEY
円		I-MONEY		I-MONEY
で		O		O
サイン		O		O
し		O		O
た		O		O
（		O		O
金額		O		O
は		O		O
推定		O		O
）		O		O
。		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O		O
0		O	