In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import os
import json

In [2]:
DATA_IN_PATH = './data/'
DATA_OUT_PATH = './output/'

TRAIN_INPUT_DATA = 'train_input.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

In [3]:
input_data = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
label_data = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)

In [4]:
prepro_configs['vocab_size']

74065

## 학습, 검증용 데이터 분할

In [5]:
TEST_SPLIT = 0.1
RANDOM_SEED = 13371447

train_input, eval_input, train_label, eval_label = train_test_split(input_data, label_data, 
                                                                    test_size=TEST_SPLIT, random_state=RANDOM_SEED)

## 데이터 입력 함수 구현

In [6]:
BATCH_SIZE = 16
NUM_EPOCHS = 3

def mapping_fn(X, Y):
    inputs, labels = {'x': X}, Y # 데이터 X에대해 'x'라는 이름을 mapping하여 딕셔너리 구조로 사용
    return inputs, labels
# 모델에 따라 입력값이 하나가 아니라 두개 이상이 될 수도 있다.
# 라벨을 제외한 나머지 데이터를 하나의 입력값으로 묶기 위해 mapping 과정을 거친다

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_input, train_label)) # 주어진 데이터를 묶어서 조각으로 만들고 함께 사용하게 함.
    dataset = dataset.shuffle(buffer_size=50000) # 데이터가 섞인다.
    dataset = dataset.batch(BATCH_SIZE) # 배치 사이즈를 정한다.
    dataset = dataset.repeat(count=NUM_EPOCHS) # Epoch를 설정
    dataset = dataset.map(mapping_fn) # 데이터를 맵핑
    iterator = dataset.make_one_shot_iterator() # 데이터를 하나씩 사용할 수 있게 해준다.
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((eval_input, eval_label)) 
    dataset = dataset.map(mapping_fn)
    dataset = dataset.batch(BATCH_SIZE)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

## 모델 구현 함수

In [7]:
VOCAB_SIZE = prepro_configs['vocab_size']+1
WORD_EMBEDDING_DIM = 100
HIDDEN_STATE_DIM = 150
DENSE_FEATURE_DIM = 150

learning_rate = 0.001

In [8]:
print(len(prepro_configs['vocab']), VOCAB_SIZE)

74065 74066


In [9]:
def model_fn(features, labels, mode):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    embedding_layer = tf.keras.layers.Embedding(
                    VOCAB_SIZE,
                    WORD_EMBEDDING_DIM)(features['x'])
    # 모델에서 데이터를 받게 된다면 시퀀스 형태로 데이터가 입력
    # 처음으로 embedding을 진행
    
    embedding_layer = tf.keras.layers.Dropout(0.2)(embedding_layer)
    # Dropout층을 설정
    
    rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [HIDDEN_STATE_DIM, HIDDEN_STATE_DIM]]
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
    # LSTM 모델을 구현하기 위한 코드
    # LSTMCell을 생성할 때는 은닉 상태 벡터에 대한 차원만 정의하면 된다.
    # 여러 LSTMCell을 쌓게 되면 이를 하나의 MultiRNN으로 묶어야 한다(Wrapping)

    outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=embedding_layer,
                                       dtype=tf.float32)
    # 위에서 만든 RNNCell 객체는 시퀀스 한 스텝에 대한 연산만 가능, for문을 통해 여러 연산을 할 수 있게 구현해야한다.
    # dynamic_rnnn함수는 for문 없이 순환 신경망을 만들어 주는 역할
    
    outputs = tf.keras.layers.Dropout(0.2)(outputs)
    hidden_layer = tf.keras.layers.Dense(DENSE_FEATURE_DIM, activation=tf.nn.tanh)(outputs[:,-1,:])
    # LSTM 신경망의 마지막 출력값을 덴스층에 추가 하기 위하여 output[:, -1, :]을 사용
    hidden_layer = tf.keras.layers.Dropout(0.2)(hidden_layer)
    logits = tf.keras.layers.Dense(1)(hidden_layer)
    # 긍정 / 부정으로 결과를 뽑기 위하여 마지막 층은 1개의 결과값만 나와야 한다(이진분류)
    logits = tf.squeeze(logits, axis=-1)
    
    sigmoid_logits = tf.nn.sigmoid(logits)
    
    if PREDICT:
        predictions = {'sentiment': sigmoid_logits}
        
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions=predictions)
    
    loss = tf.losses.sigmoid_cross_entropy(labels, logits)
    
    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(sigmoid_logits))
        eval_metric_ops = {'acc': accuracy}

        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    
    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

## 학습 및 검증

In [10]:
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

est = tf.estimator.Estimator(model_fn=model_fn,
                             model_dir=DATA_OUT_PATH + 'checkpoint/rnn') 
# 모델을 작성한 함수를 통해 에스티메이터 객체 생성

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './output/checkpoint/rnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000020FCEB7DC88>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## 훈련 데이터

In [11]:
os.environ["CUDA_VISIBLE_DEVICES"]="4"
est.train(train_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./output/checkpoint/rnn\model.ckpt.
INFO:tensorflow:loss = 0.6906093, step = 1
INFO:tensorflow:global_step/sec: 3.05105
INFO:tensorflow:loss = 0.6660559, step = 101 (32.776 sec)
INFO:tensorflow:global_step/sec: 3.127
INFO:tensorflow:loss = 0.6946002, step = 201 (31.979 sec)
INFO:tensorflow:global_step/sec: 3.12066
INFO:tensorflow:loss = 0.69460976, step = 301 (32.045 sec)
INFO:tensorflow:global_step/sec: 3.15233
INFO:tensorflow:loss = 0.6882217, step = 401 (31.723 sec)
INFO:tensorflow:global_step/sec: 3.10551
INFO:tensorflow:loss = 0.6806458, step = 501 (32.201 sec)
INFO:tensorflow:global_step/sec: 3.13182
INFO:tensorflow:loss = 0.6909116, step = 601 (31.930 sec)
INFO:tensorflow:global_step/sec: 3.20435
INFO

<tensorflow.python.estimator.estimator.Estimator at 0x20fceb7dc18>

## 검증 데이터

In [12]:
est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-29-11:26:27
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./output/checkpoint/rnn\model.ckpt-4221
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-29-11:26:39
INFO:tensorflow:Saving dict for global step 4221: acc = 0.8468, global_step = 4221, loss = 0.34527427
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 4221: ./output/checkpoint/rnn\model.ckpt-4221


{'acc': 0.8468, 'loss': 0.34527427, 'global_step': 4221}

## 테스트 데이터

In [13]:
DATA_OUT_PATH = './output/'
TEST_INPUT_DATA = 'test_input.npy'
TEST_ID_DATA = 'test_id.npy'

test_input_data = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

In [14]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x":test_input_data}, shuffle=False)

In [15]:
predictions = np.array([p['sentiment'] for p in est.predict(input_fn=
predict_input_fn)])

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./output/checkpoint/rnn\model.ckpt-4221
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [16]:
test_id = np.load(open(DATA_IN_PATH + TEST_ID_DATA, 'rb'))

In [17]:
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

output = pd.DataFrame(data={"id": list(test_id), "sentiment":list(predictions)} )
output.to_csv(DATA_OUT_PATH + 'movie_review_result_rnn.csv', index=False, quoting=3 )

### 캐글에서 0.92585점으로 220등 정도의 성적(public)