# 2.1.3 ~ 2.1.4 Estimator & Tensorboard

In [3]:
import os
import tensorflow as tf #텐서플로우 모듈 불러오기

from tensorflow.keras.datasets import imdb #imdb 한글 데이터셋을 불러온다
from tensorflow import keras #전처리를 위한 processing 기능이다.

import numpy as np

from sklearn.model_selection import train_test_split

  return f(*args, **kwds)


# 초기 기본 데이터를 불러옵니다.

  - Estimator의 설명을 위해, 데이터 로드 및 기본 기능 (모델 구조)는 https://www.tensorflow.org/tutorials/keras/basic_text_classification
    를 참고하였습니다.

In [4]:
VOCAB_SIZE = 10000 #문장의 단어 사이즈
SENT_SIZE = 256 #문장 길이
BATCH_SIZE = 128
EMB_SIZE = 128
NUM_EPOCHS = 100

PAD_ID = 0 # 200단어 이하 문장에 대해서 0 값을 채워 넣는다.
START_ID = 1 # 시작 id 값
OOV_ID = 2 # out of vocab
INDEX_OFFSET = 2


# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()} 
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

#IMDB 데이터셋을 로드 합니다. 학습과 테스트 셋으로 나눕니다.
(train_data, train_label), (eval_data, eval_label) = imdb.load_data(num_words=VOCAB_SIZE,
                                                      start_char=START_ID,
                                                      oov_char=OOV_ID,
                                                      index_from=INDEX_OFFSET)

# 각 문장의 길이를 200으로 정하고, 256개가 안되는 문장에 대해서는 패딩(0)값으로 채워줍니다.

train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=SENT_SIZE)
eval_data = keras.preprocessing.sequence.pad_sequences(eval_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=SENT_SIZE)

train_label = train_label.astype('float32').reshape(-1, 1)
eval_label = eval_label.astype('float32').reshape(-1, 1)

train_len = np.array([min(len(x), SENT_SIZE) for x in train_data])
eval_len = np.array([min(len(x), SENT_SIZE) for x in eval_data])

print("train_input shape:", train_data.shape)
print("eval_input shape:", eval_data.shape)

train_input shape: (25000, 256)
eval_input shape: (25000, 256)


# Index to Sentence 변환

In [5]:
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

decode_review(train_data[0])

"<START> i but that or viewer apart shocking impression only 5 embarrassment story merit <UNUSED> same an woman the not too or dance many some mostly <UNK> to by history true the watching <UNUSED> few many makes <UNK> you're mean so <UNUSED> few reads state was highly who it called <UNUSED> can't some that and i'm murdered for i but <UNUSED> judge questionable seemed <UNUSED> but can first in that or viewer who we this it dad <UNUSED> but was favorite was in that whose as <UNK> the my came in of during of characters the <UNUSED> send primitive that history story sloppy all <UNUSED> your in that who exactly the not show there an these has not bother all and but in gets his much what the i human that do <UNK> of <UNUSED> watch acting profession this woman <UNUSED> <UNK> a carpenter the entertainment an can or viewer \x96 you doing everyone it's a <UNUSED> <UNK> spoilers it after great <UNUSED> start this watched make one genuinely very you go and take northam as <UNUSED> there's but movi

In [6]:
def mapping_fn(X, Y=None):
    input, label = {'x': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))
    dataset = dataset.shuffle(buffer_size=len(train_data))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((eval_data, eval_label))
    dataset = dataset.shuffle(buffer_size=len(eval_input))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)

    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [9]:
def model_fn(features, labels, mode):
    
    """
    Model is from official website: https://www.tensorflow.org/tutorials/keras/basic_text_classification
    """

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    input_layer = keras.layers.Embedding(VOCAB_SIZE, EMB_SIZE)(features['x'])
    lstm = keras.layers.LSTM(32)(input_layer)
    dense_layer = keras.layers.Dense(16, activation=tf.nn.relu)(lstm)
    output_layer = keras.layers.Dense(1, activation=tf.nn.sigmoid)(dense_layer)
    
#     print(flatten)
#     print(dense_layer)
#     print(output_layer)
#     print(labels)

    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'pos_neg': output_layer
                  })

    loss = tf.reduce_mean(keras.metrics.binary_crossentropy(y_true=labels, y_pred=output_layer))
    
    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(output_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)
    
    tf.summary.scalar('loss', loss)
    
    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [None]:
model_dir = os.path.join(os.getcwd(), "data_out/checkpoint/tutorial/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()
config_tf._save_checkpoints_secs = 100
config_tf._keep_checkpoint_max =  2
config_tf._log_step_count_steps = 100

model_basic = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf)
model_basic.train(train_input_fn) #학습하기

INFO:tensorflow:Using config: {'_model_dir': '/Users/sinseongjin/github/DeepNLP/7.NLPBOOK/2.NLP_PREP/data_out/checkpoint/tutorial/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 100, '_session_config': None, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11f48ba20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/sinseongjin/github/DeepNLP/7.NLPBOOK/2.NLP_PREP/data_out/checkpoint/tutorial/model.ckpt-0
