In [5]:
import csv

import MeCab
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense,LSTM,Activation,Dropout,Input
from keras.models import Sequential,Model
from sklearn.model_selection import train_test_split

test_max_len = 0
X_MAX_LEN = 25
Y_MAX_LEN = 8
MECAB = MeCab.Tagger("-Owakati")
SIZE_OF_W2V = 200
SIZE_OF_TARGET = 11
W2V = KeyedVectors.load_word2vec_format('../data/embeddings/entity_vector.model.bin', binary=True)

LOOK_UP_TABLE = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, '-':10}
def prepare_y(y):
    y_input = []
    for ele in y:
        tmp = [0] * SIZE_OF_TARGET
        tmp[LOOK_UP_TABLE[ele]] = 1
        y_input.append(tmp)
        
    y_target = y_input[1:]
    y_target.append([0] * SIZE_OF_TARGET)
    return y_input, y_target

def get_embeddings(token):
    try:
        return W2V[token]
    except:
        return [0] * SIZE_OF_W2V

def prepare_x(x):
    tmp = []
    tokens = MECAB.parse(x).split(" ")
    
    global test_max_len
    test_max_len = max(test_max_len, len(tokens))
    
    for token in tokens:
        if token is not "\n":
            tmp.append(get_embeddings(token))
    
    # zero padding    
    for i in range(0, X_MAX_LEN - len(tokens) + 1):
        tmp.append([0] * SIZE_OF_W2V)
    
    return tmp
    
x_data = []
y_data = []
y_data_tar = []

with open('../data/training/addresses.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader):
        if i > 50000:
            break
            
        x_data.append(prepare_x(row["x"]))
        y_input, y_target = prepare_y(row["y"])
        y_data.append(y_input)
        y_data_tar.append(y_target)
        
x_data = np.array(x_data)
y_data = np.array(y_data)
y_data_tar = np.array(y_data_tar)

# x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)
# _, _, y_train_tar, y_test_tar = train_test_split(x_data, y_data_tar, test_size=0.2)

print(x_data.shape)
print(y_data.shape)
print(y_data_tar.shape)
print(test_max_len)

(50001, 25, 200)
(50001, 8, 11)
(50001, 8, 11)
23


In [6]:
LATENT_DIM = 256

encoder_inputs = Input(shape=(None, SIZE_OF_W2V))
encoder = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

print(state_h)
print(state_c)

Tensor("lstm_3/while/Exit_2:0", shape=(?, 256), dtype=float32)
Tensor("lstm_3/while/Exit_3:0", shape=(?, 256), dtype=float32)


In [7]:
decoder_inputs = Input(shape=(None, SIZE_OF_TARGET))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(SIZE_OF_TARGET, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

print(decoder_outputs)

Tensor("dense_2/truediv:0", shape=(?, ?, 11), dtype=float32)


In [8]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([x_data, y_data], 
          y_data_tar,
          batch_size=1,
          epochs=15,
          validation_split=0.2)

model.save('a2z.h5')

Train on 40000 samples, validate on 10001 samples
Epoch 1/15
 2767/40000 [=>............................] - ETA: 54:32 - loss: 1.4205

KeyboardInterrupt: 