# 훈련 루프 처음부터 작성하기

In [3]:
!pip install -U tf-hub-nightly




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

### Keras의 기본 학습 및 평가 루프

Keras는 기본 학습 및 평가 루프인 `fit()`, `evaluate()`을 제공

- `fit()`의 편리함을 유지하면서 학습 알고리즘만 사용자 정의하려면, Model 클래스를 서브클래싱하고 `fit()` 중에 반복적으로 호출되는 자체 `train_step()` 메서드를 구현하면 됨
- 그런데, 훈련 및 평가에 대한 매우 낮은 수준의 제어를 원하면 자체 훈련 및 평가 루프를 처음부터 작성해야 함

## GradientTape 사용하기: 첫 번째 엔드 투 엔드 예시

`GradientTape` 범위 내에서 모델 호출 시, 손실 값과 관련하여 레이어의 학습가능한 가중치의 그래디언트를 가져올 수 있음


- 옵티마이저 인스턴스 : 이 그래디언트를 사용하여 변수(`model.trainable_weights`로 가져올 수 있음)를 업데이트
- 예) MNIST 모델

In [5]:
# Fully connected classifier over 784-feature inputs, built with the
# functional API: two 64-unit ReLU layers followed by a 10-unit output layer.
inputs = keras.Input(name="digits", shape=(784,))
x1 = layers.Dense(units=64, activation="relu")(inputs)
x2 = layers.Dense(units=64, activation="relu")(x1)
# No activation on the last layer, so the model emits raw logits.
outputs = layers.Dense(units=10, name="predictions")(x2)
model = keras.Model(inputs, outputs)

사용자 정의 학습 루프가 있는 미니 배치 그래디언트를 사용하여 모델 훈련

- 옵티마이저, 손실 함수, 데이터세트 선언

In [7]:
# Plain SGD optimizer, and a sparse categorical cross-entropy loss that is
# configured to receive raw logits (from_logits=True).
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

batch_size = 64

# Load MNIST and flatten every image into a 784-element vector.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784)
x_test = x_test.reshape(-1, 784)

# Hold out the final 10,000 training samples as the validation split.
x_train, x_val = x_train[:-10000], x_train[-10000:]
y_train, y_val = y_train[:-10000], y_train[-10000:]

# Training pipeline: shuffle through a 1024-element buffer, then batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

# Validation pipeline: batched only, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size)

학습 루프 과정

- Epoch를 반복하는 `for` 루프를 연다
- 각 epoch에 대해 데이터세트를 배치 단위로 반복하는 `for` 루프를 연다
- 각 배치에 대해 `GradientTape()` 범위를 연다
- 이 범위 내에서 모델(순방향 전달)을 호출하고 손실을 계산한다
- 범위 외부에서 손실에 대한 모델 가중치의 그래디언트를 검색한다
- 마지막으로 옵티마이저를 사용하여 그래디언트를 기반으로 모델의 가중치를 업데이트한다

In [8]:
epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    # Walk through the training set one mini-batch at a time.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):

        # Operations run inside the tape context are recorded so the loss
        # can be differentiated afterwards.
        with tf.GradientTape() as tape:
            # Forward pass: logits for this mini-batch.
            logits = model(x_batch_train, training=True)
            # Loss value for this mini-batch.
            loss_value = loss_fn(y_batch_train, logits)

        # Outside the tape: differentiate the loss with respect to the
        # trainable weights, then let the optimizer apply one update.
        weights = model.trainable_weights
        grads = tape.gradient(loss_value, weights)
        optimizer.apply_gradients(zip(grads, weights))

        # Progress is reported on every 200th batch only.
        if step % 200 != 0:
            continue
        seen = (step + 1) * batch_size
        print(
            "Training loss (for one batch) at step %d: %.4f"
            % (step, float(loss_value))
        )
        print("Seen so far: %s samples" % seen)


Start of epoch 0
Training loss (for one batch) at step 0: 181.3130
Seen so far: 64 samples
Training loss (for one batch) at step 200: 1.1341
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 1.4639
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.8485
Seen so far: 38464 samples

Start of epoch 1
Training loss (for one batch) at step 0: 1.2408
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.9118
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.8196
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.5297
Seen so far: 38464 samples


## 메트릭 로우 레벨(low-level) 처리

기본 루프에 메트릭 모니터링 추가 (내장 메트릭은 쉽게 재사용 가능)

- 루프 시작 시 메트릭 인스턴스 화
- 각 배치 후에 `metric.update_state()`를 호출
- 메트릭의 현재 값을 표시해야 하는 경우 `metric.result()`를 호출
- 메트릭의 상태를 삭제해야 할 경우(일반적으로 Epoch 종료 시) `metric.reset_state()`(구버전 이름: `metric.reset_states()`)를 호출

In [9]:
# Build a fresh classifier (same architecture as before, with explicitly
# named hidden layers) so that training starts from new weights.
inputs = keras.Input(name="digits", shape=(784,))
x = layers.Dense(units=64, activation="relu", name="dense_1")(inputs)
x = layers.Dense(units=64, activation="relu", name="dense_2")(x)
outputs = layers.Dense(units=10, name="predictions")(x)
model = keras.Model(inputs, outputs)

# New optimizer and loss instances for the new model.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Independent accuracy trackers for the training and validation passes.
train_acc_metric, val_acc_metric = (
    keras.metrics.SparseCategoricalAccuracy(),
    keras.metrics.SparseCategoricalAccuracy(),
)

학습 및 평가 루프

In [10]:
import time

epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):

        # Record the forward pass on a GradientTape so it can be differentiated.
        with tf.GradientTape() as tape:
            # 1) Forward pass (predictions as logits).
            logits = model(x_batch_train, training=True)
            # 2) Loss for this batch.
            loss_value = loss_fn(y_batch_train, logits)
        # 3) Gradients of the loss with respect to the trainable weights.
        grads = tape.gradient(loss_value, model.trainable_weights)
        # 4) Weight update from those gradients.
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Accumulate this batch into the running training accuracy.
        train_acc_metric.update_state(y_batch_train, logits)

        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))

    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    # Clear the accumulated training accuracy so the next epoch starts from
    # zero. reset_state() is the current API name; the older reset_states()
    # spelling is deprecated and no longer exists in Keras 3.
    train_acc_metric.reset_state()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val, training=False)
        # Update val metrics
        val_acc_metric.update_state(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_state()
    print("Validation acc: %.4f" % (float(val_acc),))
    # The reported time covers both the training and the validation pass.
    print("Time taken: %.2fs" % (time.time() - start_time))


Start of epoch 0
Training loss (for one batch) at step 0: 122.8096
Seen so far: 64 samples
Training loss (for one batch) at step 200: 1.6723
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 1.1608
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.5826
Seen so far: 38464 samples
Training acc over epoch: 0.7046
Validation acc: 0.8200
Time taken: 22.03s

Start of epoch 1
Training loss (for one batch) at step 0: 0.6578
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.5854
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.4829
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.4323
Seen so far: 38464 samples
Training acc over epoch: 0.8344
Validation acc: 0.8666
Time taken: 21.89s


## tf.function으로 학습 단계 가속화

TensorFlow 2의 기본 런타임은 '즉시 실행'

**즉시 실행은 디버깅에 유용하지만, 성능 면에서는 그래프 컴파일이 확실히 유리함**

- 계산을 정적 그래프로 설명하여 프레임워크로 전역 성능 최적화 적용 가능
- 단, 프레임워크가 다음에 무엇이 올지 모르는 상태로 탐욕적으로 하나씩 실행하도록 할 때는 불가능

**사용법: `@tf.function` 데코레이터를 추가하면 끝!**

In [11]:
@tf.function
def train_step(x, y):
    """Run one optimization step on a single batch and return its loss.

    Args:
        x: Batch of model inputs.
        y: Targets for the batch; passed to the loss and to the accuracy metric.

    Returns:
        The loss value computed by ``loss_fn`` for this batch.

    Side effects:
        Updates the weights of the notebook-level ``model`` through
        ``optimizer`` and accumulates the batch into ``train_acc_metric``.
    """
    with tf.GradientTape() as grad_tape:
        batch_logits = model(x, training=True)
        batch_loss = loss_fn(y, batch_logits)
    weights = model.trainable_weights
    optimizer.apply_gradients(zip(grad_tape.gradient(batch_loss, weights), weights))
    train_acc_metric.update_state(y, batch_logits)
    return batch_loss

In [12]:
@tf.function
def test_step(x, y):
    """Evaluate one validation batch and fold it into ``val_acc_metric``.

    Args:
        x: Batch of model inputs.
        y: Targets for the batch.

    The model is called with ``training=False``; nothing is returned.
    """
    val_acc_metric.update_state(y, model(x, training=False))

컴파일된 학습 단계로 학습 루프 다시 실행

In [13]:
import time

epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Iterate over the batches of the dataset, delegating the actual update
    # to the tf.function-compiled train_step.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)

        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))

    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    # Reset training metrics at the end of each epoch. reset_state() is the
    # current API name; the older reset_states() spelling is deprecated and
    # no longer exists in Keras 3.
    train_acc_metric.reset_state()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_state()
    print("Validation acc: %.4f" % (float(val_acc),))
    # The reported time covers both the training and the validation pass.
    print("Time taken: %.2fs" % (time.time() - start_time))


Start of epoch 0
Training loss (for one batch) at step 0: 0.4079
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.6054
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.6789
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.9066
Seen so far: 38464 samples
Training acc over epoch: 0.8679
Validation acc: 0.8804
Time taken: 3.75s

Start of epoch 1
Training loss (for one batch) at step 0: 0.5085
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.3017
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.9116
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.5044
Seen so far: 38464 samples
Training acc over epoch: 0.8849
Validation acc: 0.8913
Time taken: 2.40s


## 모델에서 추적한 손실의 로우 레벨 처리

`self.add_loss(value)` : 호출하는 레이어로 순방향 전달을 수행하는 동안 생성된 손실을 재귀적으로 추적

`model.losses` : Scalar 손실 값의 결과 목록을 알 수 있음

이러한 손실 구성 요소를 사용하려면 이들을 종합한 후, 학습 단계의 기본 손실에 추가해야 함

In [14]:
# Layer that creates an activity regularization loss.
class ActivityRegularizationLayer(layers.Layer):
    """Pass-through layer that registers an activity regularization loss.

    Every call adds ``1e-2 * reduce_sum(inputs)`` through ``add_loss`` and
    returns the inputs unchanged.
    """

    def call(self, inputs):
        penalty = 1e-2 * tf.reduce_sum(inputs)
        self.add_loss(penalty)
        return inputs

In [15]:
# Same classifier layout as before, with the activity-regularization layer
# inserted between the two hidden Dense layers.
inputs = keras.Input(name="digits", shape=(784,))
x = layers.Dense(units=64, activation="relu")(inputs)
x = ActivityRegularizationLayer()(x)
x = layers.Dense(units=64, activation="relu")(x)
outputs = layers.Dense(units=10, name="predictions")(x)

model = keras.Model(inputs, outputs)

In [16]:
@tf.function
def train_step(x, y):
    """Run one optimization step that also includes layer-registered losses.

    Args:
        x: Batch of model inputs.
        y: Targets for the batch; passed to the loss and to the accuracy metric.

    Returns:
        The total loss for the batch: ``loss_fn(y, logits)`` plus the sum of
        ``model.losses`` collected during the forward pass.

    Side effects:
        Updates the weights of the notebook-level ``model`` through
        ``optimizer`` and accumulates the batch into ``train_acc_metric``.
    """
    with tf.GradientTape() as grad_tape:
        batch_logits = model(x, training=True)
        # Main loss plus every extra loss the layers added during this
        # forward pass; computed inside the tape so both are differentiated.
        total_loss = loss_fn(y, batch_logits) + sum(model.losses)
    weights = model.trainable_weights
    optimizer.apply_gradients(zip(grad_tape.gradient(total_loss, weights), weights))
    train_acc_metric.update_state(y, batch_logits)
    return total_loss

## 엔드 투 엔드 예제: GAN 학습 루프 처음부터 수행하기

### GAN

이미지 학습 데이터세트의 잠재 분포(이미지의 "잠재 공간")를 훈련하여 거의 실제처럼 보이는 새로운 이미지를 생성할 수 있음

잠재 공간의 지점을 이미지 공간의 지점으로 매핑하는 "생성기" 모델<br>실제 이미지(학습 데이터 세트)와 가짜 이미지(생성기 네트워크의 출력물)를 구별할 수 있는 분류자인 "판별기" 모델의 두 부분으로 구성

**GAN 학습 루프**

- 판별기 훈련
    - 잠재 공간에 무작위 지점 배치 샘플링
    - 생성기 모델로 지점을 가짜 이미지로 바꿈
    - 실제 이미지 배치를 가져와 생성된 이미지와 결합
    - 생성된 이미지와 실제 이미지 분류를 위한 판별기 모델 훈련


- 생성기 훈련
    - 앞의 1~3단계는 판별기와 동일
    - 생성기 모델을 훈련시켜 판별기를 속이고, 가짜 이미지를 진짜로 분류

*가짜 숫자와 실제 숫자를 구분하기 위한 판별기 생성*

In [17]:
# Classifier over 28x28x1 images, assembled layer by layer. The final
# Dense(1) has no activation, so the model outputs one raw logit per image.
discriminator = keras.Sequential(name="discriminator")
discriminator.add(keras.Input(shape=(28, 28, 1)))
# Two stride-2 convolutions, each followed by a LeakyReLU.
discriminator.add(layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"))
discriminator.add(layers.LeakyReLU(alpha=0.2))
discriminator.add(layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"))
discriminator.add(layers.LeakyReLU(alpha=0.2))
# Collapse the spatial dimensions, then map to a single output unit.
discriminator.add(layers.GlobalMaxPooling2D())
discriminator.add(layers.Dense(1))
discriminator.summary()

Model: "discriminator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 14, 14, 64)        640       
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 14, 14, 64)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 7, 7, 128)         73856     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 7, 7, 128)         0         
                                                                 
 global_max_pooling2d (Glob  (None, 128)               0         
 alMaxPooling2D)                                                 
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                     

*잠재 벡터를 형태 (28, 28, 1)(MNIST 숫자를 나타냄)의 출력으로 바꾸는 생성기 네트워크 생성*

In [18]:
latent_dim = 128

# Maps a latent vector of size `latent_dim` to an image-shaped output,
# assembled layer by layer.
generator = keras.Sequential(name="generator")
generator.add(keras.Input(shape=(latent_dim,)))
# Project to 7 * 7 * 128 values so they can be reshaped into a 7x7x128 map.
generator.add(layers.Dense(7 * 7 * 128))
generator.add(layers.LeakyReLU(alpha=0.2))
generator.add(layers.Reshape((7, 7, 128)))
# Two stride-2 transposed convolutions, each followed by a LeakyReLU.
generator.add(layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"))
generator.add(layers.LeakyReLU(alpha=0.2))
generator.add(layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"))
generator.add(layers.LeakyReLU(alpha=0.2))
# Single-channel output squashed by a sigmoid.
generator.add(layers.Conv2D(1, (7, 7), padding="same", activation="sigmoid"))

*훈련 루프*

In [19]:
# The discriminator and the generator each get their own Adam optimizer,
# with slightly different learning rates.
d_optimizer = keras.optimizers.Adam(learning_rate=3e-4)
g_optimizer = keras.optimizers.Adam(learning_rate=4e-4)

# Binary cross-entropy on raw logits, shared by both training steps.
loss_fn = keras.losses.BinaryCrossentropy(from_logits=True)


@tf.function
def train_step(real_images):
    """Run one GAN update: a discriminator step followed by a generator step.

    Args:
        real_images: Batch of real images; concatenated with the generator's
            output before being passed to the discriminator.

    Returns:
        A tuple ``(d_loss, g_loss, generated_images)``: the discriminator
        loss, the generator loss, and the fake images that were produced for
        the discriminator step (i.e. before the generator was updated).
    """
    # ---- Discriminator step ----
    # Sample random latent vectors and decode them into fake images.
    noise = tf.random.normal(shape=(batch_size, latent_dim))
    generated_images = generator(noise)
    # Fake images first, real images second.
    combined_images = tf.concat([generated_images, real_images], axis=0)

    # Targets follow the same ordering: 1 for generated images, 0 for real ones.
    fake_targets = tf.ones((batch_size, 1))
    real_targets = tf.zeros((real_images.shape[0], 1))
    labels = tf.concat([fake_targets, real_targets], axis=0)
    # Perturb the targets with uniform random noise scaled by 0.05.
    labels = labels + 0.05 * tf.random.uniform(labels.shape)

    with tf.GradientTape() as d_tape:
        d_loss = loss_fn(labels, discriminator(combined_images))
    d_weights = discriminator.trainable_weights
    d_optimizer.apply_gradients(zip(d_tape.gradient(d_loss, d_weights), d_weights))

    # ---- Generator step ----
    # Fresh latent sample for the generator update.
    noise = tf.random.normal(shape=(batch_size, latent_dim))
    # All-zero targets label every generated image as "real".
    misleading_labels = tf.zeros((batch_size, 1))

    # Gradients are taken with respect to the generator's weights only, so
    # the discriminator is not updated in this step.
    with tf.GradientTape() as g_tape:
        g_loss = loss_fn(misleading_labels, discriminator(generator(noise)))
    g_weights = generator.trainable_weights
    g_optimizer.apply_gradients(zip(g_tape.gradient(g_loss, g_weights), g_weights))
    return d_loss, g_loss, generated_images

*이미지 배치에서 train_step을 반복적으로 호출하여 GAN을 학습*

In [21]:
import os

# Prepare the dataset. Both the training and the test MNIST digits are used;
# the labels are discarded.
batch_size = 64
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
all_digits = np.concatenate([x_train, x_test])
# Convert to float32 and divide by 255 before feeding the images to the GAN.
all_digits = all_digits.astype("float32") / 255.0
all_digits = np.reshape(all_digits, (-1, 28, 28, 1))
dataset = tf.data.Dataset.from_tensor_slices(all_digits)
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)

epochs = 20
save_dir = "./"

for epoch in range(epochs):
    print("\nStart epoch", epoch)

    for step, real_images in enumerate(dataset):
        # Train the discriminator & generator on one batch of real images.
        d_loss, g_loss, generated_images = train_step(real_images)

        # Logging.
        if step % 200 == 0:
            # Print metrics
            print("discriminator loss at step %d: %.2f" % (step, d_loss))
            print("adversarial loss at step %d: %.2f" % (step, g_loss))

            # Save one generated image. The epoch is part of the file name so
            # that samples from different epochs do not overwrite each other
            # (a name built from `step` alone is identical in every epoch).
            img = tf.keras.preprocessing.image.array_to_img(
                generated_images[0] * 255.0, scale=False
            )
            img.save(
                os.path.join(
                    save_dir, "generated_img_epoch%d_step%d.png" % (epoch, step)
                )
            )

        # To limit execution time each epoch is cut short: the loop breaks
        # once `step` exceeds 10, i.e. after 12 batches.
        # Remove the lines below to actually train the model!
        if step > 10:
            break


Start epoch 0
discriminator loss at step 0: 0.61
adversarial loss at step 0: 0.77

Start epoch 1
discriminator loss at step 0: 0.54
adversarial loss at step 0: 0.84

Start epoch 2
discriminator loss at step 0: 0.47
adversarial loss at step 0: 0.72

Start epoch 3
discriminator loss at step 0: 0.40
adversarial loss at step 0: 0.74

Start epoch 4
discriminator loss at step 0: 0.37
adversarial loss at step 0: 0.78

Start epoch 5
discriminator loss at step 0: 0.34
adversarial loss at step 0: 0.83

Start epoch 6
discriminator loss at step 0: 0.31
adversarial loss at step 0: 0.93

Start epoch 7
discriminator loss at step 0: 0.26
adversarial loss at step 0: 1.08

Start epoch 8
discriminator loss at step 0: 0.21
adversarial loss at step 0: 1.30

Start epoch 9
discriminator loss at step 0: 0.17
adversarial loss at step 0: 1.61

Start epoch 10
discriminator loss at step 0: 0.19
adversarial loss at step 0: 1.82

Start epoch 11
discriminator loss at step 0: 0.25
adversarial loss at step 0: 1.91

S