In [1]:
import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

In [2]:

## 미리 Global 변수를 지정하자. 파일 명, 파일 위치, 디렉토리 등이 있다.

DATA_IN_PATH = './data/'
DATA_OUT_PATH = './output/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

## 학습에 필요한 파라메터들에 대해서 지정하는 부분이다.

BATCH_SIZE = 16
EPOCH = 2
HIDDEN = 64
BUFFER_SIZE = 10000

NUM_LAYERS = 3
DROPOUT_RATIO = 0.3

TEST_SPLIT = 0.1
RNG_SEED = 13371447
EMBEDDING_DIM = 128
MAX_SEQ_LEN = 31

In [3]:
q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)

In [4]:
VOCAB_SIZE = prepro_configs['vocab_size']

In [5]:
q1_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q1_data])
q2_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q2_data])

In [6]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

train_Q1 = train_X[:,0]
train_Q2 = train_X[:,1]
test_Q1 = test_X[:,0]
test_Q2 = test_X[:,1]

In [7]:
def rearrange(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=len(train_Q1))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

# 모델

In [8]:
def Malstm(features, labels, mode):
        
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
            
    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    
    base_embedded_matrix = embedding(features['base'])
    hypothesis_embedded_matrix = embedding(features['hypothesis'])
    
    # Question 1에 대한 LSTM모델
    q_lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    q_lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    _, q_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw = q_lstm_fw_cell,
                                                       cell_bw = q_lstm_bw_cell,
                                                       inputs = base_embedded_matrix,
                                                       dtype = tf.float32,
                                                      scope='query')
    # 마지막 state 값을 뽑아 추출한다
    q_final_state = tf.concat([q_output_states[0].h, q_output_states[1].h], axis=1)

    # Question 2 (유사한 쿼리)에 대한 LSTM모델
    s_lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    s_lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    _, s_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw = s_lstm_fw_cell,
                                                       cell_bw = s_lstm_bw_cell,
                                                       inputs = hypothesis_embedded_matrix,
                                                        dtype = tf.float32,
                                                      scope='sim_query')
    #LSTM의 마지막 state 값을 추출한다.
    sim_final_state = tf.concat([s_output_states[0].h, s_output_states[1].h], axis=1)
        
#     merged_matrix = tf.concat([base_sementic_matrix, hypothesis_sementic_matrix], -1)
#     logit_layer = tf.keras.layers.dot([base_sementic_matrix, hypothesis_sementic_matrix], axes=1, normalize=True)    

    with tf.variable_scope('output_layer'):
#     logit_layer = K.exp(-K.sum(K.abs(base_sementic_matrix - hypothesis_sementic_matrix), axis=1, keepdims=True))
        logit_layer = tf.exp(-tf.reduce_sum(tf.abs(q_final_state - sim_final_state), axis=1, keepdims=True))
        logit_layer = tf.squeeze(logit_layer, axis=-1)
                
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate':logit_layer
                  })
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.to_float(labels)
    
#     loss = tf.reduce_mean(tf.keras.metrics.binary_crossentropy(y_true=labels, y_pred=logit_layer))
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)
#     loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(labels, logit_layer))
    
    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

# 학습

In [9]:
# os.environ["CUDA_VISIBLE_DEVICES"]="0" #For TEST  

model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH + "/checkpoint/rnn/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig(save_checkpoints_steps=500,
                                save_checkpoints_secs=None,
                                  keep_checkpoint_max=2,
                                  log_step_count_steps=200)

lstm_est = tf.estimator.Estimator(Malstm, model_dir=model_dir, config=config_tf)

INFO:tensorflow:Using config: {'_model_dir': 'C:\\Python\\Python36\\venv\\kmu\\Scripts\\nlp\\Text_Similarlity\\./output//checkpoint/rnn/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 200, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002090B5232E8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [10]:
lstm_est.train(train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:loss = 0.5030184, step = 1
INFO:tensorflow:global_step/sec: 5.89941
INFO:tensorflow:loss = 0.2681255, step = 201 (33.899 sec)
INFO:tensorflow:global_step/sec: 8.6311
INFO:tensorflow:loss = 0.23085478, step = 401 (23.149 sec)
INFO:tensorflow:Saving checkpoints for 500 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:ten

INFO:tensorflow:global_step/sec: 7.38771
INFO:tensorflow:loss = 0.18339112, step = 9601 (27.063 sec)
INFO:tensorflow:global_step/sec: 8.72672
INFO:tensorflow:loss = 0.12953924, step = 9801 (22.919 sec)
INFO:tensorflow:Saving checkpoints for 10000 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:global_step/sec: 7.32517
INFO:tensorflow:loss = 0.19235086, step = 10001 (27.303 sec)
INFO:tensorflow:global_step/sec: 8.71088
INFO:tensorflow:loss = 0.121314116, step = 10201 (22.960 sec)
INFO:tensorflow:global_step/sec: 8.70047
INFO:tensorflow:loss = 0.22056133, step = 10401 (22.990 sec)
INFO:tensorflow:Saving checkpoints for 10500 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:global_step/sec: 7.60396
INFO:tensorflow:loss = 0.18044555, step = 10601 (26.299 sec)
INFO:tensorflow:global_step/sec: 8.71929
INFO:tensorflow:loss = 0.1911127, step = 10801 (22.937 sec)


INFO:tensorflow:global_step/sec: 7.48997
INFO:tensorflow:loss = 0.16629721, step = 20001 (26.702 sec)
INFO:tensorflow:global_step/sec: 8.61627
INFO:tensorflow:loss = 0.09363664, step = 20201 (23.212 sec)
INFO:tensorflow:global_step/sec: 8.61095
INFO:tensorflow:loss = 0.17112634, step = 20401 (23.229 sec)
INFO:tensorflow:Saving checkpoints for 20500 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:global_step/sec: 7.58603
INFO:tensorflow:loss = 0.11421372, step = 20601 (26.363 sec)
INFO:tensorflow:global_step/sec: 8.55933
INFO:tensorflow:loss = 0.11906939, step = 20801 (23.366 sec)
INFO:tensorflow:Saving checkpoints for 21000 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:global_step/sec: 7.43163
INFO:tensorflow:loss = 0.23069921, step = 21001 (26.912 sec)
INFO:tensorflow:global_step/sec: 8.61233
INFO:tensorflow:loss = 0.01756022, step = 21201 (23.222 sec

INFO:tensorflow:loss = 0.081227034, step = 30401 (23.278 sec)
INFO:tensorflow:Saving checkpoints for 30500 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:global_step/sec: 7.55624
INFO:tensorflow:loss = 0.13224488, step = 30601 (26.468 sec)
INFO:tensorflow:global_step/sec: 8.57573
INFO:tensorflow:loss = 0.08951283, step = 30801 (23.322 sec)
INFO:tensorflow:Saving checkpoints for 31000 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt.
INFO:tensorflow:global_step/sec: 7.36232
INFO:tensorflow:loss = 0.21951394, step = 31001 (27.164 sec)
INFO:tensorflow:global_step/sec: 8.59007
INFO:tensorflow:loss = 0.12414428, step = 31201 (23.284 sec)
INFO:tensorflow:global_step/sec: 8.59992
INFO:tensorflow:loss = 0.13856888, step = 31401 (23.259 sec)
INFO:tensorflow:Saving checkpoints for 31500 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ck

<tensorflow.python.estimator.estimator.Estimator at 0x2097e900eb8>

In [11]:
lstm_est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-29-20:22:17
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt-33586
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-29-20:22:34
INFO:tensorflow:Saving dict for global step 33586: acc = 0.787827, global_step = 33586, loss = 0.1491001
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 33586: C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt-33586


{'acc': 0.787827, 'loss': 0.1491001, 'global_step': 33586}

# 테스트

In [12]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'))

In [13]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base":test_q1_data, 
                                                         "hypothesis":test_q2_data}, 
                                                      shuffle=False)
predictions = np.array([p['is_duplicate'] for p in lstm_est.predict(input_fn=predict_input_fn)])

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./output//checkpoint/rnn/model.ckpt-33586
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [14]:
print(len(predictions)) #2345796

output = pd.DataFrame( data={"test_id":test_id_data, "is_duplicate": list(predictions)} )
output.to_csv( "./data/rnn_predict.csv", index=False, quoting=3 )

2345796


### 캐글에서 0.45025으로 2265등 정도 나온다