In [13]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

import json

In [14]:
# Directories for the preprocessed inputs and for generated artifacts.
DATA_IN_PATH = './data/'
DATA_OUT_PATH = './output/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy' # encoded base (reference) questions
TRAIN_Q2_DATA_FILE = 'train_q2.npy' # encoded questions to compare against the base question
TRAIN_LABEL_DATA_FILE = 'train_label.npy' # target label for each (q1, q2) pair
DATA_CONFIGS = 'data_configs.json' # preprocessing metadata; provides the vocabulary size

TEST_SPLIT = 0.1 # fraction of the training pairs held out for evaluation
RNG_SEED = 13371447 # random_state passed to train_test_split

In [15]:
# --- Training schedule ---
EPOCH = 1
BATCH_SIZE = 1024

# presumably the padded token length of every question — confirm against preprocessing
MAX_SEQUENCE_LENGTH = 31

# --- Layer sizes ---
WORD_EMBEDDING_DIM = 100
CONV_FEATURE_DIM = 300
CONV_OUTPUT_DIM = 128
CONV_WINDOW_SIZE = 3
SIMILARITY_DENSE_FEATURE_DIM = 200

# The preprocessing step stored its metadata as JSON; the vocabulary size is
# needed to size the embedding table.
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    prepro_configs = json.load(config_file)

VOCAB_SIZE = prepro_configs['vocab_size']

In [16]:
# Load the preprocessed training arrays. Passing the path (instead of an
# already-opened file object) lets NumPy open and close the file itself, so no
# file handle is left dangling.
q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

In [17]:
# Peek at the encoded base questions (NumPy elides the middle rows/columns).
q1_data

array([[  70, 1047,   47, ...,    0,    0,    0],
       [  57,    4,   76, ...,    0,    0,    0],
       [  16,   60,   18, ...,    0,    0,    0],
       ...,
       [   4,   21,    7, ...,    0,    0,    0],
       [   2,   21, 7735, ...,    0,    0,    0],
       [   9,   15,  302, ...,    0,    0,    0]])

In [18]:
# Peek at the encoded comparison questions (NumPy elides the middle rows/columns).
q2_data

array([[  16,    3, 3394, ...,    0,    0,    0],
       [   4,    9,   15, ...,    0,    0,    0],
       [   2,   36,    1, ...,    0,    0,    0],
       ...,
       [   4,   11,  135, ...,    0,    0,    0],
       [   2,   21, 7735, ...,    0,    0,    0],
       [   3,   19,  240, ...,    0,    0,    0]])

In [19]:
# Pair up the two questions along a new axis 1 so that a single split keeps
# each (q1, q2, label) triple together.
X = np.stack((q1_data, q2_data), axis=1)
y = labels

train_X, eval_X, train_y, eval_y = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

# Un-pair the questions again: index 0 on axis 1 is q1, index 1 is q2.
train_Q1, train_Q2 = train_X[:, 0], train_X[:, 1]
eval_Q1, eval_Q2 = eval_X[:, 0], eval_X[:, 1]

# 데이터 입력 함수

In [20]:
# Mapping function applied to every dataset element.
def rearrange(base, hypothesis, label):
    """Bundle the two questions into one feature dict and pass the label through.

    Args:
        base: The base (reference) question.
        hypothesis: The question to compare against the base question.
        label: The target value for the pair.

    Returns:
        A ``(features, label)`` tuple where ``features`` is
        ``{"x1": base, "x2": hypothesis}``.
    """
    return dict(x1=base, x2=hypothesis), label

# Training input function
def train_input_fn():
    """Build the training input pipeline.

    Slices the training arrays into examples, shuffles them with a
    10000-element buffer, groups them into batches of ``BATCH_SIZE``, reshapes
    each batch into ``({"x1", "x2"}, label)`` via ``rearrange`` and repeats the
    whole sequence ``EPOCH`` times.

    Returns:
        The next-element tensors of a one-shot iterator over that pipeline.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
        .shuffle(buffer_size=10000)
        .batch(BATCH_SIZE)
        .map(rearrange)
        .repeat(EPOCH)
    )

    return batches.make_one_shot_iterator().get_next()

# Evaluation input function
def eval_input_fn():
    """Build the evaluation input pipeline.

    Same steps as the training pipeline (shuffle with a 10000-element buffer,
    batch by ``BATCH_SIZE``, ``rearrange``) but without the ``repeat`` step, so
    the held-out data is consumed exactly once.

    Returns:
        The next-element tensors of a one-shot iterator over that pipeline.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((eval_Q1, eval_Q2, eval_y))
        .shuffle(buffer_size=10000)
        .batch(BATCH_SIZE)
        .map(rearrange)
    )

    return batches.make_one_shot_iterator().get_next()


## 두 입력 함수의 차이점은 검증 입력 함수가 데이터를 EPOCH만큼 반복하지 않는다는 것이다.

# 모델

In [21]:
# CNN block: convolution, max-pooling over time and a dense projection
# bundled into a single reusable function.

def basic_conv_sementic_network(inputs, name):
    """Encode a sequence of embeddings into one fixed-size vector.

    Args:
        inputs: Rank-3 tensor fed to ``Conv1D``
            (assumed ``(batch, sequence, embedding_dim)`` — confirm with caller).
        name: Prefix used for the variable-carrying layers
            (``<name>conv_1d`` and ``<name>dense``).

    Returns:
        Rank-2 tensor whose last dimension is ``CONV_OUTPUT_DIM``.
    """
    conv_layer = tf.keras.layers.Conv1D(CONV_FEATURE_DIM,
                                        CONV_WINDOW_SIZE,
                                        activation=tf.nn.relu,
                                        name=name + 'conv_1d',
                                        padding='same')(inputs)

    # Take the maximum of every feature over the whole sequence axis. Unlike a
    # fixed-width pooling window this does not depend on the sequence length,
    # and it already drops the pooled axis, so no squeeze is needed afterwards.
    max_pool_layer = tf.keras.layers.GlobalMaxPool1D()(conv_layer)

    output_layer = tf.keras.layers.Dense(CONV_OUTPUT_DIM,
                                         activation=tf.nn.relu,
                                         name=name + 'dense')(max_pool_layer)

    return output_layer

In [22]:
def model_fn(features, labels, mode):
    """Estimator model function for the CNN sentence-pair similarity model.

    Both questions share one embedding table, are encoded by separate
    convolutional blocks, concatenated, and reduced to a single logit.

    Args:
        features: Dict with keys ``'x1'`` (base question) and ``'x2'``
            (question to compare); values are token-id tensors
            (assumed ``(batch, sequence)`` — confirm with the input functions).
        labels: Target tensor used for the loss and the accuracy metric.
            Not used in PREDICT mode.
        mode: One of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode:
        predictions (``'is_duplicate'``) for PREDICT, loss plus ``'acc'`` for
        EVAL, and loss plus train op for TRAIN.
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding = tf.keras.layers.Embedding(VOCAB_SIZE,
                                          WORD_EMBEDDING_DIM)

    base_embedded_matrix = embedding(features['x1'])
    hypothesis_embedded_matrix = embedding(features['x2'])

    # Dropout must be told explicitly whether we are training: inside an
    # Estimator graph nothing sets the Keras learning phase, so without the
    # `training` argument these layers would silently behave as identity.
    base_embedded_matrix = tf.keras.layers.Dropout(0.2)(
        base_embedded_matrix, training=TRAIN)
    hypothesis_embedded_matrix = tf.keras.layers.Dropout(0.2)(
        hypothesis_embedded_matrix, training=TRAIN)

    base_sementic_matrix = basic_conv_sementic_network(base_embedded_matrix, 'base')
    hypothesis_sementic_matrix = basic_conv_sementic_network(hypothesis_embedded_matrix, 'hypothesis')

    merged_matrix = tf.concat([base_sementic_matrix, hypothesis_sementic_matrix], -1)

    # Similarity is learned by a dense layer here; a cosine-similarity or
    # Euclidean-distance score between the two vectors could be used instead.
    similarity_dense_layer = tf.keras.layers.Dense(SIMILARITY_DENSE_FEATURE_DIM,
                                                   activation=tf.nn.relu)(merged_matrix)

    similarity_dense_layer = tf.keras.layers.Dropout(0.2)(
        similarity_dense_layer, training=TRAIN)
    logit_layer = tf.keras.layers.Dense(1)(similarity_dense_layer)
    logit_layer = tf.squeeze(logit_layer, 1)
    similarity = tf.nn.sigmoid(logit_layer)

    if PREDICT:
        # Hand the predicted similarity back as a dictionary.
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate':similarity
                  })

    # Loss is computed on the raw logits, not on the sigmoid output.
    loss = tf.losses.sigmoid_cross_entropy(labels, logit_layer)

    if EVAL:
        # Rounding the similarity yields 0 or 1, which is compared with the
        # labels to obtain accuracy; accuracy and loss are both reported.
        accuracy = tf.metrics.accuracy(labels, tf.round(similarity))
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= {'acc': accuracy},
                  loss=loss)

    if TRAIN:
        # In training the weights have to be optimised, so an Adam optimizer
        # is created and applied to the loss.
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

# 학습

In [23]:
# Uncomment to pin the run to a specific GPU while testing:
# os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# Checkpoints are written below the current working directory.
checkpoint_subdir = DATA_IN_PATH + "checkpoint/cnn/"
model_dir = os.path.join(os.getcwd(), checkpoint_subdir)
os.makedirs(model_dir, exist_ok=True)

est = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Python\\Python36\\venv\\kmu\\Scripts\\nlp\\Text_Similarlity\\./data/checkpoint/cnn/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001C18E994C18>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [24]:
# Train on the pipeline built by train_input_fn.
est.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./data/checkpoint/cnn/model.ckpt.
INFO:tensorflow:loss = 0.69342566, step = 1
INFO:tensorflow:global_step/sec: 1.19153
INFO:tensorflow:loss = 0.54236335, step = 101 (83.939 sec)
INFO:tensorflow:global_step/sec: 1.30482
INFO:tensorflow:loss = 0.48890194, step = 201 (76.611 sec)
INFO:tensorflow:Saving checkpoints for 263 into C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./data/checkpoint/cnn/model.ckpt.
INFO:tensorflow:Loss for final step: 0.46422404.


<tensorflow.python.estimator.estimator.Estimator at 0x1c18e994eb8>

In [25]:
# Evaluate on the held-out pairs served by eval_input_fn.
est.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-29-18:55:35
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./data/checkpoint/cnn/model.ckpt-263
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-29-18:55:41
INFO:tensorflow:Saving dict for global step 263: acc = 0.75429606, global_step = 263, loss = 0.49808544
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 263: C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./data/checkpoint/cnn/model.ckpt-263


{'acc': 0.75429606, 'loss': 0.49808544, 'global_step': 263}

# 테스트

In [26]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# Pass the path straight to np.load so NumPy opens and closes each file itself
# instead of leaving an unclosed file object behind.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)
test_id_data = np.load(DATA_IN_PATH + TEST_ID_DATA_FILE)

In [27]:
# Feed the test pairs unshuffled so that predictions line up with test_id_data.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x1":test_q1_data,
                                                         "x2":test_q2_data},
                                                      shuffle=False)

predictions = np.array([p['is_duplicate'] for p in est.predict(input_fn=predict_input_fn)])

output = pd.DataFrame( data={"test_id":test_id_data, "is_duplicate": list(predictions)} )

# Make sure the output directory exists before writing; on a fresh checkout
# to_csv would otherwise fail because the directory is missing.
os.makedirs(DATA_OUT_PATH, exist_ok=True)
# quoting=3 is csv.QUOTE_NONE: fields are written without any quoting.
output.to_csv(DATA_OUT_PATH + "cnn_predict.csv", index=False, quoting=3)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Python\Python36\venv\kmu\Scripts\nlp\Text_Similarlity\./data/checkpoint/cnn/model.ckpt-263
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


### 캐글에서 0.52047으로 2572등(private) 정도 나온다.