In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"

import tensorflow as tf
from keras import layers
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras import backend as K
from keras.models import Model
import keras

Using TensorFlow backend.


In [2]:
def int_sentence(left_texts, right_texts, vocab_index):
    """Map paired token sequences onto their vocabulary indices.

    Args:
        left_texts: sequence of token sequences (left side of every pair).
        right_texts: sequence of token sequences (right side of every pair),
            read in parallel with ``left_texts``.
        vocab_index: mapping from token to integer index.

    Returns:
        A tuple ``(left_int, right_int)``; each is a list holding one list
        of indices per entry of ``left_texts``.

    Raises:
        KeyError: when a token has no entry in ``vocab_index``.
        IndexError: when ``right_texts`` is shorter than ``left_texts``.
    """
    def encode(tokens):
        # Look every token up in the vocabulary, keeping the original order.
        return [vocab_index[token] for token in tokens]

    left_int, right_int = [], []
    # The number of pairs is driven by the left side only.
    for pair_idx in range(len(left_texts)):
        left_ids = encode(left_texts[pair_idx])
        right_ids = encode(right_texts[pair_idx])
        left_int.append(left_ids)
        right_int.append(right_ids)

    return left_int, right_int

In [3]:
def manhattan_distance(left, right):
    """Return ``exp(-d)`` where ``d`` is the L1 distance between the inputs.

    The element-wise absolute difference of ``left`` and ``right`` is summed
    over axis 1 (the reduced axis is kept with size 1) and the negated sum is
    passed through ``exp``. Identical inputs therefore give 1, and the value
    shrinks towards 0 as the inputs move apart.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

In [4]:
def _read_lines(path):
    """Return all lines (newline included) of the UTF-8 text file at ``path``.

    The file is opened with ``utf-8-sig`` so a leading BOM is dropped, and it
    is closed before returning.
    """
    with open(path, 'r', encoding='utf-8-sig') as handle:
        return handle.readlines()


# Each name holds a list of raw lines instead of an open file handle: no
# handle is leaked, and cells that iterate these names can be re-run without
# silently getting an exhausted (empty) iterator.
All = _read_lines('file_name_1')   # file that contains train, cv and test together
train = _read_lines('file_name_2')
cv = _read_lines('file_name_3')
test = _read_lines('file_name_4')

In [5]:
# Tokenise both sentences (the first two tab-separated fields) of every line
# in the combined corpus.
all_sentence = []
for row in All:
    fields = row.split('\t')
    all_sentence.append(fields[0].split())
    all_sentence.append(fields[1].split())

# Length, in tokens, of the longest sentence seen.
max_len = max(map(len, all_sentence))

# Sorted list of the distinct tokens.
vocab = sorted({token for sentence in all_sentence for token in sentence})

# One more than the number of distinct tokens.
vocab_size = len(vocab) + 1

# Indices start at 1, so index 0 is never assigned to a token.
vocab_index = {token: position for position, token in enumerate(vocab, start=1)}

In [6]:
embed_model_dir = 'embedding_pretrain_file'

# Maps each word of the pretrained embedding file to its float32 vector.
embedding_index = dict()

with open(embed_model_dir, 'r', encoding='utf-8-sig') as f:
    for line in f:
        values = line.split()
        # Skip the word2vec header line (two tokens) as well as blank or
        # vector-less lines, which would otherwise raise IndexError on
        # values[0] or store an empty vector.
        if len(values) <= 2:
            continue
        word = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vectors
    # No explicit close: the `with` block closes the file on exit.
    print('--- Finished make embedding index ---')
print('Found %s word vectors.' % len(embedding_index))

EMBEDDING_DIM = 300

# One row per vocabulary index plus one extra row for index 0. Rows of words
# that have no pretrained vector stay all-zero.
embedded_matrix = np.zeros((len(vocab_index)+1, EMBEDDING_DIM))
embed_cnt = 0

for word, i in vocab_index.items():
    embedded_vector = embedding_index.get(word)
    if embedded_vector is not None:
        embedded_matrix[i] = embedded_vector
        embed_cnt += 1

print('Created Embedded Matrix: %s word vectors.' % embed_cnt)

--- Finished make embedding index ---
Found 12178 word vectors.
Created Embedded Matrix: 12178 word vectors.


In [7]:
# Frozen embedding layer initialised from the pretrained matrix.
embedded_layer = layers.Embedding(
    len(vocab_index)+1,
    EMBEDDING_DIM,
    weights=[embedded_matrix],
    input_length=max_len,
    trainable=False,
)


def _read_pairs(lines):
    """Parse tab-separated ``left<TAB>right<TAB>label`` lines.

    Args:
        lines: iterable of raw text lines.

    Returns:
        Three parallel lists: whitespace-tokenised left sentences,
        whitespace-tokenised right sentences, and labels as floats.

    Raises:
        IndexError: when a non-blank line has fewer than three fields.
        ValueError: when the third field is not a valid float.
    """
    left_sen, right_sen, labels = [], [], []
    for line in lines:
        # A blank line (e.g. at the end of the file) carries no pair.
        if not line.strip():
            continue
        fields = line.split('\t')
        left_sen.append(fields[0].strip().split())
        right_sen.append(fields[1].strip().split())
        labels.append(float(fields[2].strip()))
    return left_sen, right_sen, labels


train_left_sen, train_right_sen, train_label = _read_pairs(train)
cv_left_sen, cv_right_sen, cv_label = _read_pairs(cv)
test_left_sen, test_right_sen, test_label = _read_pairs(test)

# Tokens -> vocabulary indices.
train_left_int, train_right_int = int_sentence(train_left_sen, train_right_sen, vocab_index)
cv_left_int, cv_right_int = int_sentence(cv_left_sen, cv_right_sen, vocab_index)
test_left_int, test_right_int = int_sentence(test_left_sen, test_right_sen, vocab_index)

# Bring every index sequence to length max_len, padding at the end.
train_left = pad_sequences(train_left_int, padding='post', maxlen=max_len)
train_right = pad_sequences(train_right_int, padding='post', maxlen=max_len)

cv_left = pad_sequences(cv_left_int, padding='post', maxlen=max_len)
cv_right = pad_sequences(cv_right_int, padding='post', maxlen=max_len)

test_left = pad_sequences(test_left_int, padding='post', maxlen=max_len)
test_right = pad_sequences(test_right_int, padding='post', maxlen=max_len)

In [8]:
# Build the two-branch model: one input of length max_len per sentence of a pair.
left_input = layers.Input(shape=(max_len,))
right_input = layers.Input(shape=(max_len,))

# Both inputs are passed through the same embedding layer object.
left_emd = embedded_layer(left_input)
right_emd = embedded_layer(right_input)

# A single LSTM instance (50 units) is applied to both branches, so the two
# branches share its weights.
lstm = layers.LSTM(50)

left_lstm = lstm(left_emd)
right_lstm = lstm(right_emd)

# Combine the two LSTM outputs with manhattan_distance; output_shape declares
# the result as (batch, 1).
distance = layers.Lambda(function=lambda x: manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_lstm, right_lstm])

# Adadelta with gradient-norm clipping at 1.25.
# NOTE(review): presumably the setting used in the reference paper - confirm.
optimizer = keras.optimizers.Adadelta(clipnorm=1.25)  # paper optimizers

model = Model(inputs=[left_input, right_input], outputs=[distance])

# Print the layer overview of the assembled model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 69)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 69)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 69, 300)      3849300     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 50)           70200       embedding_1[0][0]                
          

In [None]:
# Wrap the model for 3 GPUs and compile the wrapped model.
# NOTE: `model` is rebound to the wrapper, so re-running this cell wraps the
# already wrapped model again; rebuild the model first when re-running.
model = keras.utils.multi_gpu_model(model, gpus=3)
model.compile(
    loss="mean_squared_error",
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=3,
    restore_best_weights=True,
)

# Train on the train split and validate on the cv split after each epoch.
model_fit = model.fit(
    [train_left, train_right],
    [train_label],
    batch_size=64,
    epochs=100,
    validation_data=([cv_left, cv_right], [cv_label]),
    callbacks=[es],
)

In [None]:
# Score the trained model on the held-out test split.
evaluation = model.evaluate([test_left, test_right], [test_label])

# Index 1 is reported as the accuracy metric and index 0 as the loss.
print('Accuracy:', evaluation[1])
print('Loss:', evaluation[0])