# 패션 MNIST
1. 패션 MNIST 데이터셋을 적재하고 학습셋, 검증셋, 테스트셋으로 나누기
2. 학습셋을 섞은 다음 각 데이터셋을 TFRecord 파일로 저장하기. 각 레코드는 두 개의 특성을 가진 Example 프로토콜 버퍼, 즉 직렬화된 이미지(tf.io.serialize_tensor()를 사용)와 레이블임.
3. tf.data API를 사용해서 각 세트를 위한 효율적인 데이터셋을 만들기
4. 이 데이터셋으로 입력 특성을 표준화하는 전처리층을 포함한 케라스 모델을 학습하기
5. 텐서보드로 프로파일 데이터를 시각화하여 가능한 한 입력 파이프라인을 효율적으로 만들기

---
## 1. 학습셋, 검증셋, 테스트셋으로 나누기

In [1]:
import tensorflow as tf
from tensorflow import keras

In [2]:
# Load Fashion MNIST through Keras; the result is unpacked below into
# (train images, train labels) and (test images, test labels).
mnist = keras.datasets.fashion_mnist.load_data()

In [3]:
from sklearn.model_selection import train_test_split

# Unpack the raw arrays, then hold out everything past the first 50000
# training examples as the validation split.
(x_train_full, y_train_full), (x_test, y_test) = mnist
n_train = 50000
x_train, x_val = x_train_full[:n_train], x_train_full[n_train:]
y_train, y_val = y_train_full[:n_train], y_train_full[n_train:]

In [4]:
# Wrap each split as a dataset of (image, label) pairs. Only the training
# split is shuffled, using a buffer as large as the split itself.
train_set = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_set = train_set.shuffle(len(x_train))
valid_set = tf.data.Dataset.from_tensor_slices((x_val, y_val))
test_set = tf.data.Dataset.from_tensor_slices((x_test, y_test))

---
## 2. 학습셋을 섞은 후 각 데이터셋을 TFRecord 파일로 저장하기

In [25]:
import os
from tensorflow.train import Example, Features, Feature
from tensorflow.train import BytesList, Int64List

# Directory that holds the per-example TFRecord files.
target_dir = "./mnist_tfrecords"
# exist_ok=True makes creation idempotent without a separate (race-prone)
# existence check; it still raises if the path exists but is not a directory.
os.makedirs(target_dir, exist_ok=True)


def get_img_example_serial(img, label):
    """Serialize one (image, label) pair as an Example protocol buffer.

    The image is stored under "img" as the bytes produced by
    tf.io.serialize_tensor, and the label under "label" as an int64 value.

    Args:
        img: Image tensor to serialize.
        label: Label tensor; its ``.numpy()`` value is stored.

    Returns:
        The serialized Example as a byte string.
    """
    img_bytes = tf.io.serialize_tensor(img).numpy()
    label_feature = Feature(int64_list=Int64List(value=[label.numpy()]))
    img_feature = Feature(bytes_list=BytesList(value=[img_bytes]))
    record = Example(
        features=Features(feature={"label": label_feature, "img": img_feature})
    )
    return record.SerializeToString()

def get_tfrecord_paths(data_set, prefix="train"):
    """Build the per-example TFRecord file paths for one dataset split.

    One path is produced per element of ``data_set``. Each file is named
    ``<prefix>_<index>.tfrecord`` with a zero-padded five-digit index and is
    placed under the module-level ``target_dir``. No files are created.

    Args:
        data_set: Iterable dataset; only its number of elements is used.
        prefix: File-name prefix identifying the split.

    Returns:
        List of file paths, in element order.
    """
    # Iterate the dataset that was passed in; the elements are ignored and
    # only their count determines how many paths are generated.
    return [
        os.path.join(target_dir, "{}_{:05d}.tfrecord".format(prefix, index))
        for index, _ in enumerate(data_set)
    ]


# Each split uses its own prefix so that validation and test read their own
# files (valid_*.tfrecord / test_*.tfrecord) rather than the training files.
train_paths = get_tfrecord_paths(train_set, prefix="train")
valid_path = get_tfrecord_paths(valid_set, prefix="valid")
test_path = get_tfrecord_paths(test_set, prefix="test")
'''for id, (img, label) in train_set.enumerate():
    img_example = get_img_example_serial(img, label)
    file_name = "train_{:05d}.tfrecord".format(id)
    file_path = os.path.join(target_dir, file_name)
    
    with tf.io.TFRecordWriter(file_path) as f:
        f.write(img_example)'''

'for id, (img, label) in train_set.enumerate():\n    img_example = get_img_example_serial(img, label)\n    file_name = "train_{:05d}.tfrecord".format(id)\n    file_path = os.path.join(target_dir, file_name)\n    \n    with tf.io.TFRecordWriter(file_path) as f:\n        f.write(img_example)'

In [6]:
'''for id, (img, label) in valid_set.enumerate():
    img_example = get_img_example_serial(img, label)
    file_name = "valid_{:05d}.tfrecord".format(id)
    file_path = os.path.join(target_dir, file_name)
    
    with tf.io.TFRecordWriter(file_path) as f:
        f.write(img_example)'''

'for id, (img, label) in valid_set.enumerate():\n    img_example = get_img_example_serial(img, label)\n    file_name = "valid_{:05d}.tfrecord".format(id)\n    file_path = os.path.join(target_dir, file_name)\n    \n    with tf.io.TFRecordWriter(file_path) as f:\n        f.write(img_example)'

In [7]:
'''for id, (img, label) in test_set.enumerate():
    img_example = get_img_example_serial(img, label)
    file_name = "test_{:05d}.tfrecord".format(id)
    file_path = os.path.join(target_dir, file_name)
    
    with tf.io.TFRecordWriter(file_path) as f:
        f.write(img_example)'''

'for id, (img, label) in test_set.enumerate():\n    img_example = get_img_example_serial(img, label)\n    file_name = "test_{:05d}.tfrecord".format(id)\n    file_path = os.path.join(target_dir, file_name)\n    \n    with tf.io.TFRecordWriter(file_path) as f:\n        f.write(img_example)'

---
## 3. tf.data API로 효율적인 데이터셋 만들기

In [29]:
# Parsing schema for one serialized Example; both features are scalars.
feature_description = {
    # Serialized image bytes; decoded with tf.io.parse_tensor in preprocess().
    "img": tf.io.FixedLenFeature([], tf.string, default_value=""),
    # Integer class label.
    "label": tf.io.FixedLenFeature([], tf.int64, default_value=0)
}

def preprocess(tfrecord):
    """Decode one serialized Example into an (image, label) pair.

    Args:
        tfrecord: Serialized Example matching ``feature_description``.

    Returns:
        Tuple of the uint8 image reshaped to 28x28 and the int64 label.
    """
    parsed = tf.io.parse_single_example(tfrecord, feature_description)
    pixels = tf.io.parse_tensor(parsed["img"], out_type=tf.uint8)
    # parse_tensor does not carry a static shape, so set it explicitly.
    return tf.reshape(pixels, shape=[28, 28]), parsed["label"]

In [68]:
def get_dataset_from_paths(filepaths, n_read_threads=5, shuffle_buffer_size=None,
                          n_parse_threads=5, batch_size=256, cache=True):
    """Build a batched tf.data pipeline from TFRecord files.

    Args:
        filepaths: TFRecord file path(s) to read.
        n_read_threads: Number of files read in parallel.
        shuffle_buffer_size: Shuffle buffer size; shuffling is skipped when
            this is None or 0.
        n_parse_threads: Number of parallel calls used for ``preprocess``.
        batch_size: Number of examples per batch.
        cache: Whether to cache the raw records after the first pass.

    Returns:
        A dataset yielding batches of (image, label) pairs.
    """
    dataset = tf.data.TFRecordDataset(filepaths,
                                      num_parallel_reads=n_read_threads)
    # Cache before shuffling so each epoch still sees a fresh order.
    if cache:
        dataset = dataset.cache()
    if shuffle_buffer_size:
        dataset = dataset.shuffle(shuffle_buffer_size)

    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    # Prepare the next batch while the current one is being consumed, so the
    # input pipeline overlaps with training instead of stalling it.
    return dataset.prefetch(1)

In [69]:
# Build one pipeline per split from the TFRecord paths; only the training
# pipeline is given a shuffle buffer.
train_set = get_dataset_from_paths(filepaths=train_paths, shuffle_buffer_size=60000)
valid_set = get_dataset_from_paths(filepaths=valid_path)
test_set = get_dataset_from_paths(filepaths=test_path)

---
## 4. 전처리층을 포함한 케라스 모델 학습하기

In [70]:
import numpy as np

# Fit the normalization layer on the training images: drop the labels,
# stack every batch into one float32 array, then adapt on that array.
std_layer = keras.layers.experimental.preprocessing.Normalization()
image_batches = train_set.map(lambda img, label: img)
stacked_images = np.concatenate(list(image_batches.as_numpy_iterator()), axis=0)
sample_features = stacked_images.astype(np.float32)
print(sample_features.shape)
std_layer.adapt(sample_features)

(50000, 28, 28)


In [71]:
# Classifier: 28x28 input -> adapted normalization -> flatten -> one hidden
# dense layer -> 10-way softmax output.
model = keras.models.Sequential()
model.add(keras.layers.Input(shape=[28, 28]))
model.add(std_layer)
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.Dense(10, activation="softmax"))

In [72]:
# Sparse categorical cross-entropy is used because the labels are integer
# class ids rather than one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [73]:
from datetime import datetime
# Give every run its own timestamped log directory under ./my_logs so that
# separate runs do not overwrite each other's TensorBoard data.
run_id = "run_" + datetime.now().strftime("%Y%m%d_%H%M%S")
logs = os.path.join(os.curdir, "my_logs", run_id)

tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=logs)
early_cb = tf.keras.callbacks.EarlyStopping(patience=10)
training_callbacks = [tensorboard_cb, early_cb]

model.fit(train_set, epochs=100, validation_data=valid_set, callbacks=training_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x22b1171e640>