# Softmax Classification

# 0. Softmax를 이용한 MNIST 기본 구현

In [None]:
# 0. Softmax를 이용한 MNIST 기본 구현

import tensorflow as tf
import random

# Fix TensorFlow's global random seed so the parameter initialisation is repeatable.
tf.random.set_seed(1234)

# Hyperparameters: gradient-descent step size, number of epochs, examples per mini-batch.
learning_rate = 0.1
training_cnt = 15
batch_size = 100

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train / 255.0).astype('float32').reshape(60000, 784)
x_test = (x_test / 255.0).astype('float32').reshape(10000, 784)
# One-hot encode the labels over 10 classes.
y_train = tf.one_hot(y_train, 10)
y_test = tf.one_hot(y_test, 10)

# Single linear layer (784 inputs -> 10 logits) with randomly drawn initial values.
W = tf.Variable(tf.random.normal([784, 10]))
b = tf.Variable(tf.random.normal([10]))

for epoch in range(training_cnt):
    total_batch = len(x_train) // batch_size
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for start in range(0, total_batch * batch_size, batch_size):
        end = start + batch_size
        batch_xs = x_train[start:end]
        batch_ys = y_train[start:end]
        with tf.GradientTape() as tape:
            logits = tf.matmul(batch_xs, W) + b
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits))
        # Manual gradient-descent step on both parameters.
        W_grad, b_grad = tape.gradient(cost, [W, b])
        W.assign_sub(learning_rate * W_grad)
        b.assign_sub(learning_rate * b_grad)
        # Predictions use the logits computed before this batch's update.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction, prediction], 0)
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set

# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
logits = tf.matmul(x_test,W) + b
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

#### seed 값 설정
- 재현성을 위해 tf.random.set_seed 함수를 사용

In [None]:
# Set TensorFlow's global random seed so results can be reproduced.
tf.random.set_seed(1234)

#### 파라메터 값 설정
- learning rate: 한 번의 갱신에서 weight를 얼마나 크게 변경할지 정하는 값으로, learning rate 값이 너무 작으면 학습이 매우 느려 train이 되지 않을 수 있고, 너무 크면 overshooting이 발생하여 cost가 발산할 수 있다
- training_cnt: 전체 데이터셋에 대한 학습 반복 횟수(Epoch)
- batch_size: 한번에 학습할 데이터의 수

In [None]:
# Step size multiplied with each gradient in the update.
learning_rate = 0.1
# Number of passes over the whole training set (epochs).
training_cnt = 15
# Number of examples in each mini-batch.
batch_size = 100

#### tf.random.normal
- 784개의 픽셀마다 가중치들을 각각 학습하여 0부터 9까지 숫자를 인식
- weight, bias의 초기값을 난수로 생성

In [None]:
# Weight matrix mapping 784 inputs to 10 outputs, initialised with random normal values.
W = tf.Variable(tf.random.normal([784, 10]))
# One bias per output, also initialised with random normal values.
b = tf.Variable(tf.random.normal([10]))

#### matmul 함수 사용
- 입력 X와 가중치 W를 곱하고 편향 b를 더하여 모델을 정의한다

In [None]:
# Linear model: multiply the input batch by W and add the bias b.
logits = tf.matmul(batch_xs,W) + b

#### cost/loss function 구현
- 교차 엔트로피(cross-entropy) 사용
- 예측값과 실제값 사이의 확률분포 차이 계산
- 학습 방법으로 경사하강법(Gradient Descent) 사용: tf.GradientTape로 기울기를 구한 뒤 assign_sub로 W, b를 직접 갱신한다

In [None]:
# Record the forward pass so the cost can be differentiated with respect to W and b.
with tf.GradientTape() as tape:
    logits = tf.matmul(batch_xs, W) + b
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits)
    cost = tf.reduce_mean(per_example_loss)
# One gradient-descent step: subtract learning_rate * gradient from each parameter.
W_grad, b_grad = tape.gradient(cost, [W, b])
W.assign_sub(learning_rate * W_grad)
b.assign_sub(learning_rate * b_grad)

#### 학습된 예측값 확인, 정확도 계산
- softmax 함수를 적용하여 출력값의 합이 항상 1이 되게 한다
- 예측된 최대값의 index 반환
- One-hot encoding한 Y값도 최대값 1이 있는 index 반환
- 평균을 이용하여 예측값과 실제 데이터의 일치 여부를 계산

In [None]:
# Turn logits into class probabilities and pick the most probable class per example.
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
# Recover the class index from the one-hot training labels.
true_Y = tf.argmax(y_train, 1)
# Accuracy is the mean of the element-wise matches between predictions and labels.
is_correct = tf.equal(correct_prediction, true_Y)
accuracy = tf.reduce_mean(tf.cast(is_correct, dtype=tf.float32))

#### 모델 실행
- Tensorflow에서 제공하는 훈련 데이터(train data)가 60000개이기 때문에 여러 개의 batch로 나누어 학습을 진행하는 것이 효율적이다
- total_batch는 60000/100 = 600이다
- training_cnt만큼 반복하는 for문 안에 total_batch만큼 반복하는 for문이 포함되어 있다
- avg_cost는 각 batch의 cost를 total_batch로 나눈 값을 누적하여 계산된다 (즉, 한 epoch 동안의 batch cost 평균이다)

In [None]:
for epoch in range(training_cnt):
    total_batch = len(x_train) // batch_size
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for start in range(0, total_batch * batch_size, batch_size):
        end = start + batch_size
        batch_xs = x_train[start:end]
        batch_ys = y_train[start:end]
        with tf.GradientTape() as tape:
            logits = tf.matmul(batch_xs, W) + b
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits))
        # Manual gradient-descent step on both parameters.
        W_grad, b_grad = tape.gradient(cost, [W, b])
        W.assign_sub(learning_rate * W_grad)
        b.assign_sub(learning_rate * b_grad)
        # Predictions use the logits computed before this batch's update.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction, prediction], 0)
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

- 전체 데이터 셋을 반복한 단계와 각 단계에 해당하는 cost인 avg_cost를 출력
- Accuracy는 훈련 데이터(Train data)로 학습한 모델을 시험 데이터(Test data)를 대상으로 적용한 정확도를 나타낸다

In [None]:
# Report the 1-based epoch number (zero-padded to 4 digits) and the averaged cost.
print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

# Printed once after all epochs have run.
print('Learning Finished!')

#### 모델 검정(test)
- Accuracy는 학습한 모델로 훈련 데이터(Train data)와 시험 데이터(Test data)를 대상으로 적용한 정확도를 나타낸다
- r은 시험데이터(Test data)에서 랜덤하게 1개를 읽어 온 것이다
- Label은 r에 해당하는 0~9사이의 실제 레이블(Label)
- Prediction은 r에 해당하는 이미지의 예측값(0~9사이의 레이블)

In [None]:
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set

# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
logits = tf.matmul(x_test,W) + b
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 1. Deep Neural Network와 ReLU를 추가하여 모델 변경
- 심층신경망(Deep Neural Network) 구성과 ReLU함수 사용
- 이전 softmax를 이용한 구현과 다른 점은 여러 개의 layer를 추가하여 심층신경망을 구성한 것과 활성화 함수로 ReLU함수를 사용한 것이다
- 각 layer의 결과값이 다음 layer의 입력값으로 연결되는 것을 주목한다
- 동일한 결과값을 위해 seed 옵션을 설정한다

In [None]:
# 1. Deep Neural Network와 ReLU를 추가하여 모델 변경

import tensorflow as tf
import random

# Fix TensorFlow's global random seed so results can be reproduced.
tf.random.set_seed(1234)

# Hyperparameters: gradient-descent step size, number of epochs, examples per mini-batch.
learning_rate = 0.1
training_cnt = 15
batch_size = 100

mnist = tf.keras.datasets.mnist

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

x_train, x_test = x_train.astype('float32').reshape(60000,784), x_test.astype('float32').reshape(10000,784)
y_train, y_test = tf.one_hot(y_train,10), tf.one_hot(y_test,10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# Deep network with ReLU: 784 -> 256 -> 256 -> 10.
W1 = tf.Variable(tf.random.normal([784, 256]))
b1 = tf.Variable(tf.random.normal([256]))

W2 = tf.Variable(tf.random.normal([256, 256]))
b2 = tf.Variable(tf.random.normal([256]))

W3 = tf.Variable(tf.random.normal([256, 10]))
b3 = tf.Variable(tf.random.normal([10]))

# All trainable variables; the list never changes, so it is built once.
weight_list = [W1,W2,W3,b1,b2,b3]

for epoch in range(training_cnt):
    total_batch = int(len(x_train) / batch_size)
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for i in range(total_batch):
        start = batch_size * i
        end = batch_size*(1+i)
        batch_xs, batch_ys = x_train[start:end], y_train[start:end]
        with tf.GradientTape() as tape:
            # Each layer's output feeds the next layer.
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            logits = tf.matmul(L2, W3) + b3
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=batch_ys))
        grads = tape.gradient(cost, weight_list)
        # Manual gradient-descent step. Pairing with zip avoids reusing the
        # batch index `i` as the inner loop variable.
        for variable, grad in zip(weight_list, grads):
            variable.assign_sub(learning_rate*grad)
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction,prediction],0)

    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
logits = tf.matmul(L2, W3) + b3
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 2. Learning Rate 조정
- Cost가 기존 대비 큰 값(2.xx)을 가지며 더 이상 0에 가깝게 수렴하지 않는다
- Overshooting의 가능성이 있으므로 learning_rate 값을 줄여본다
- learning_rate 값은 10의 누승으로 다양하게 변경해본다

In [None]:
# 2. Learning Rate 조정

import tensorflow as tf
import random

# Fix TensorFlow's global random seed so results can be reproduced.
tf.random.set_seed(1234)

# Hyperparameters: the step size is reduced to 0.01 for this experiment.
learning_rate = 0.01
training_cnt = 15
batch_size = 100

mnist = tf.keras.datasets.mnist

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

x_train, x_test = x_train.astype('float32').reshape(60000,784), x_test.astype('float32').reshape(10000,784)
y_train, y_test = tf.one_hot(y_train,10), tf.one_hot(y_test,10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# 784 -> 256 -> 256 -> 10 network with randomly drawn initial values.
W1 = tf.Variable(tf.random.normal([784, 256]))
b1 = tf.Variable(tf.random.normal([256]))

W2 = tf.Variable(tf.random.normal([256, 256]))
b2 = tf.Variable(tf.random.normal([256]))

W3 = tf.Variable(tf.random.normal([256, 10]))
b3 = tf.Variable(tf.random.normal([10]))

# All trainable variables; the list never changes, so it is built once.
weight_list = [W1,W2,W3,b1,b2,b3]

for epoch in range(training_cnt):
    total_batch = int(len(x_train) / batch_size)
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for i in range(total_batch):
        start = batch_size * i
        end = batch_size*(1+i)
        batch_xs, batch_ys = x_train[start:end], y_train[start:end]
        with tf.GradientTape() as tape:
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            logits = tf.matmul(L2, W3) + b3
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=batch_ys))
        grads = tape.gradient(cost, weight_list)
        # Manual gradient-descent step. Pairing with zip avoids reusing the
        # batch index `i` as the inner loop variable.
        for variable, grad in zip(weight_list, grads):
            variable.assign_sub(learning_rate*grad)
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction,prediction],0)
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
logits = tf.matmul(L2, W3) + b3
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 3. Adam Optimizer 적용 (향상된 Optimizer 사용)
- 딥러닝 Optimizer 중 성능이 좋은 것으로 평가되는 Adam Optimizer를 사용한다

In [None]:
# 3. Adam Optimizer 적용

import tensorflow as tf
import random


# Seed TensorFlow's global RNG so runs are repeatable.
tf.random.set_seed(1234)

# Hyperparameters: Adam step size, number of epochs, examples per mini-batch.
learning_rate = 0.01
training_cnt = 15
batch_size = 100

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train / 255.0).astype('float32').reshape(60000, 784)
x_test = (x_test / 255.0).astype('float32').reshape(10000, 784)
# One-hot encode the labels over 10 classes.
y_train = tf.one_hot(y_train, 10)
y_test = tf.one_hot(y_test, 10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# 784 -> 256 -> 256 -> 10 network with randomly drawn initial values.
W1 = tf.Variable(tf.random.normal([784, 256]))
b1 = tf.Variable(tf.random.normal([256]))
W2 = tf.Variable(tf.random.normal([256, 256]))
b2 = tf.Variable(tf.random.normal([256]))
W3 = tf.Variable(tf.random.normal([256, 10]))
b3 = tf.Variable(tf.random.normal([10]))

# Adam replaces the hand-written gradient-descent update.
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
weight_list = [W1, W2, W3, b1, b2, b3]

for epoch in range(training_cnt):
    total_batch = len(x_train) // batch_size
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for start in range(0, total_batch * batch_size, batch_size):
        end = start + batch_size
        batch_xs = x_train[start:end]
        batch_ys = y_train[start:end]
        with tf.GradientTape() as tape:
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            logits = tf.matmul(L2, W3) + b3
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits))
        grads = tape.gradient(cost, weight_list)
        opt.apply_gradients(zip(grads, weight_list))
        # Predictions use the logits computed before this batch's update.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction, prediction], 0)

    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
logits = tf.matmul(L2, W3) + b3
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 4. Xavier Initializer 적용 (적절한 Weight Initializer 사용)
- 정확도를 높이기 위해 Xavier Initializer를 사용하여 가중치를 초기화 한다

In [None]:
# 4. Xavier Initializer 적용

import tensorflow as tf
import random

# Seed TensorFlow's global RNG so runs are repeatable.
tf.random.set_seed(1234)

# Hyperparameters: Adam step size, number of epochs, examples per mini-batch.
learning_rate = 0.01
training_cnt = 15
batch_size = 100

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train / 255.0).astype('float32').reshape(60000, 784)
x_test = (x_test / 255.0).astype('float32').reshape(10000, 784)
# One-hot encode the labels over 10 classes.
y_train = tf.one_hot(y_train, 10)
y_test = tf.one_hot(y_test, 10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# Weights come from the Glorot (Xavier) uniform initializer; biases stay random normal.
initializer = tf.initializers.GlorotUniform()
W1 = tf.Variable(initializer(shape=(784, 256)))
b1 = tf.Variable(tf.random.normal([256]))
W2 = tf.Variable(initializer(shape=(256, 256)))
b2 = tf.Variable(tf.random.normal([256]))
W3 = tf.Variable(initializer(shape=(256, 10)))
b3 = tf.Variable(tf.random.normal([10]))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
weight_list = [W1, W2, W3, b1, b2, b3]

for epoch in range(training_cnt):
    total_batch = len(x_train) // batch_size
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for start in range(0, total_batch * batch_size, batch_size):
        end = start + batch_size
        batch_xs = x_train[start:end]
        batch_ys = y_train[start:end]
        with tf.GradientTape() as tape:
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            logits = tf.matmul(L2, W3) + b3
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits))
        grads = tape.gradient(cost, weight_list)
        opt.apply_gradients(zip(grads, weight_list))
        # Predictions use the logits computed before this batch's update.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction, prediction], 0)
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
logits = tf.matmul(L2, W3) + b3
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 5. learning_rate 재조정(0.01 -> 0.001)
- Weight 초기화 변경 이후 기존 대비 cost가 더 작은 값을 가지지만 학습 결과는 오히려 나빠졌다
- learning_rate를 변경해서 더 세밀하게 최적값을 찾도록 시도해본다
- learning_rate 값은 10의 누승으로 다양하게 변경해본다

In [None]:
# 5. learning_rate 재조정(0.01 -> 0.001)

import tensorflow as tf
import random

mnist = tf.keras.datasets.mnist

# Fix TensorFlow's global random seed so results can be reproduced.
tf.random.set_seed(1234)

# Hyperparameters: the Adam step size is lowered from 0.01 to 0.001 for this experiment.
learning_rate = 0.001
training_cnt = 15
batch_size = 100

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

x_train, x_test = x_train.astype('float32').reshape(60000,784), x_test.astype('float32').reshape(10000,784)
y_train, y_test = tf.one_hot(y_train,10), tf.one_hot(y_test,10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# Weights come from the Glorot (Xavier) uniform initializer; biases stay random normal.
initializer = tf.initializers.GlorotUniform() # Xavier Initializer
W1 = tf.Variable(initializer(shape=(784, 256)))

b1 = tf.Variable(tf.random.normal([256]))

W2 = tf.Variable(initializer(shape=(256, 256)))
b2 = tf.Variable(tf.random.normal([256]))


W3 = tf.Variable(initializer(shape=(256, 10)))
b3 = tf.Variable(tf.random.normal([10]))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

for epoch in range(training_cnt):
    total_batch = int(len(x_train) / batch_size)
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for i in range(total_batch):
        start = batch_size * i
        end = batch_size*(1+i)
        batch_xs, batch_ys = x_train[start:end], y_train[start:end]
        with tf.GradientTape() as tape:
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            logits = tf.matmul(L2, W3) + b3
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=batch_ys))
        weight_list = [W1,W2,W3,b1,b2,b3]
        grads = tape.gradient(cost, weight_list)
        opt.apply_gradients(zip(grads, weight_list))
        # Predictions use the logits computed before this batch's update.
        # (The per-batch true_Y that used to be computed here was never read.)
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction,prediction],0)
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
logits = tf.matmul(L2, W3) + b3
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 6. He Initializer 적용
- Xavier Initializer 대신 He Initializer를 사용하여 가중치를 초기화 한다
- 활성화 함수로 ReLU를 사용하는 경우 He Initializer를 사용하는게 효과적이다
- 이 예제처럼 모형이 복잡하지 않은 경우에 사용해야 효과가 좋다


In [None]:
# 6. He Initializer 적용

import tensorflow as tf
import random

mnist = tf.keras.datasets.mnist

# Seed TensorFlow's global RNG so runs are repeatable.
tf.random.set_seed(1234)

# Hyperparameters: Adam step size, number of epochs, examples per mini-batch.
learning_rate = 0.001
training_cnt = 15
batch_size = 100

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train / 255.0).astype('float32').reshape(60000, 784)
x_test = (x_test / 255.0).astype('float32').reshape(10000, 784)
# One-hot encode the labels over 10 classes.
y_train = tf.one_hot(y_train, 10)
y_test = tf.one_hot(y_test, 10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# Weights come from the He normal initializer; biases stay random normal.
initializer = tf.keras.initializers.HeNormal()
W1 = tf.Variable(initializer(shape=(784, 256)))
b1 = tf.Variable(tf.random.normal([256]))
W2 = tf.Variable(initializer(shape=(256, 256)))
b2 = tf.Variable(tf.random.normal([256]))
W3 = tf.Variable(initializer(shape=(256, 10)))
b3 = tf.Variable(tf.random.normal([10]))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
weight_list = [W1, W2, W3, b1, b2, b3]

for epoch in range(training_cnt):
    total_batch = len(x_train) // batch_size
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for start in range(0, total_batch * batch_size, batch_size):
        end = start + batch_size
        batch_xs = x_train[start:end]
        batch_ys = y_train[start:end]
        with tf.GradientTape() as tape:
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            logits = tf.matmul(L2, W3) + b3
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits))
        grads = tape.gradient(cost, weight_list)
        opt.apply_gradients(zip(grads, weight_list))
        # Predictions use the logits computed before this batch's update.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction, prediction], 0)
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')
  
# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
logits = tf.matmul(L2, W3) + b3
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())

# 7. Deep & Wide Neural Network 확장
- 은닉층 노드 수를 증가시키고 레이어를 추가 

In [None]:
# 7. Deep & Wide Neural Network 확장

import tensorflow as tf
import random

# Seed TensorFlow's global RNG so runs are repeatable.
tf.random.set_seed(1234)

# Hyperparameters: Adam step size, number of epochs, examples per mini-batch.
learning_rate = 0.001
training_cnt = 15
batch_size = 100

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train / 255.0).astype('float32').reshape(60000, 784)
x_test = (x_test / 255.0).astype('float32').reshape(10000, 784)
# One-hot encode the labels over 10 classes.
y_train = tf.one_hot(y_train, 10)
y_test = tf.one_hot(y_test, 10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# Wider and deeper network: 784 -> 512 -> 512 -> 512 -> 512 -> 10.
# Weights come from the He normal initializer; biases stay random normal.
initializer = tf.initializers.HeNormal()
W1 = tf.Variable(initializer(shape=(784, 512)))
b1 = tf.Variable(tf.random.normal([512]))
W2 = tf.Variable(initializer(shape=(512, 512)))
b2 = tf.Variable(tf.random.normal([512]))
W3 = tf.Variable(initializer(shape=(512, 512)))
b3 = tf.Variable(tf.random.normal([512]))
W4 = tf.Variable(initializer(shape=(512, 512)))
b4 = tf.Variable(tf.random.normal([512]))
W5 = tf.Variable(initializer(shape=(512, 10)))
b5 = tf.Variable(tf.random.normal([10]))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
weight_list = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5]

for epoch in range(training_cnt):
    total_batch = len(x_train) // batch_size
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for start in range(0, total_batch * batch_size, batch_size):
        end = start + batch_size
        batch_xs = x_train[start:end]
        batch_ys = y_train[start:end]
        with tf.GradientTape() as tape:
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
            L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
            logits = tf.matmul(L4, W5) + b5
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=batch_ys, logits=logits))
        grads = tape.gradient(cost, weight_list)
        opt.apply_gradients(zip(grads, weight_list))
        # Predictions use the logits computed before this batch's update.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction, prediction], 0)

    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made while the weights were still being updated) with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass. Nothing is differentiated here, so the
# computation is not wrapped in a tf.GradientTape.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
logits = tf.matmul(L4, W5) + b5
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())


# 8. Dropout 적용
- Dropout은 Overfitting이 일어나지 않도록 중간 중간 무작위로 뉴런을 비활성화하여 성능을 향상시키는 방법이다
- 학습 시간은 다소 길어지지만 모델의 일반적인 예측 성능을 높여준다

In [None]:
# 8. Dropout 적용

import tensorflow as tf
import random

# Fix TensorFlow's global random seed so results can be reproduced.
tf.random.set_seed(1234)

# Hyperparameters: Adam step size, number of epochs, examples per mini-batch.
learning_rate = 0.001
training_cnt = 15
batch_size = 100

mnist = tf.keras.datasets.mnist

# Load MNIST, divide pixel values by 255 and reshape every image to a 784-long float32 row.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

x_train, x_test = x_train.astype('float32').reshape(60000,784), x_test.astype('float32').reshape(10000,784)
y_train, y_test = tf.one_hot(y_train,10), tf.one_hot(y_test,10)

# Re-seed right before the parameters are drawn.
tf.random.set_seed(1234)

# tf.nn.dropout's `rate` is the probability of DROPPING a unit (not of keeping
# it). 0.3 keeps 70% of the activations; the previous value of 0.7 discarded
# 70% of every hidden layer.
# NOTE(review): assumes the intent was keep-probability 0.7 — confirm.
rate = 0.3

initializer = tf.initializers.HeNormal()

# 784 -> 512 -> 512 -> 512 -> 512 -> 10 network; He-normal weights, random-normal biases.
W1 = tf.Variable(initializer(shape=(784, 512)))
b1 = tf.Variable(tf.random.normal([512]))

W2 = tf.Variable(initializer(shape=(512, 512)))
b2 = tf.Variable(tf.random.normal([512]))


W3 = tf.Variable(initializer(shape=(512, 512)))
b3 = tf.Variable(tf.random.normal([512]))

W4 = tf.Variable(initializer(shape=(512,512)))
b4 = tf.Variable(tf.random.normal([512]))

W5 = tf.Variable(initializer(shape=(512,10)))
b5 = tf.Variable(tf.random.normal([10]))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

for epoch in range(training_cnt):
    total_batch = int(len(x_train) / batch_size)
    avg_cost = 0
    # Starts as an empty list; becomes a tensor of this epoch's predictions after the first concat.
    correct_prediction = []
    for i in range(total_batch):
        start = batch_size * i
        end = batch_size*(1+i)
        batch_xs, batch_ys = x_train[start:end], y_train[start:end]
        with tf.GradientTape() as tape:
            # Dropout follows every hidden ReLU layer during training.
            L1 = tf.nn.relu(tf.matmul(batch_xs, W1) + b1)
            L1 = tf.nn.dropout(L1, rate=rate)
            L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
            L2 = tf.nn.dropout(L2, rate=rate)
            L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
            L3 = tf.nn.dropout(L3, rate=rate)
            L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
            L4 = tf.nn.dropout(L4, rate=rate)
            logits = tf.matmul(L4, W5) + b5
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=batch_ys))
        weight_list = [W1,W2,W3,W4,W5,b1,b2,b3,b4,b5]
        grads = tape.gradient(cost, weight_list)
        opt.apply_gradients(zip(grads, weight_list))
        # These predictions come from the dropout-perturbed forward pass above.
        pred = tf.nn.softmax(logits)
        prediction = tf.argmax(pred, 1)
        avg_cost += cost.numpy() / total_batch
        correct_prediction = tf.concat([correct_prediction,prediction],0)

    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning Finished!')

# Training accuracy: compares the predictions gathered batch-by-batch during the
# final epoch (made with dropout active and the weights still being updated)
# with the labels.
true_Y = tf.argmax(y_train, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(correct_prediction, true_Y), dtype=tf.float32))
print('Accuracy(train):','{:.3f}'.format(accuracy)) # accuracy on the training set


# Test accuracy: a plain forward pass WITHOUT dropout. Dropout is a training-time
# regulariser; applying it at inference randomly zeroes activations and makes the
# predictions noisy. tf.nn.dropout already rescales the kept units during
# training, so no extra scaling is needed here. Nothing is differentiated, so no
# tf.GradientTape is used either.
L1 = tf.nn.relu(tf.matmul(x_test, W1) + b1)
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
logits = tf.matmul(L4, W5) + b5
pred = tf.nn.softmax(logits)
prediction = tf.argmax(pred, 1)
true_Y = tf.argmax(y_test, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, true_Y), dtype=tf.float32))

print('Accuracy(test):','{:.3f}'.format(accuracy)) # accuracy on the test set

# Show the label and the model's prediction for one randomly chosen test example.
r = random.randint(0, len(x_test) - 1)
print("Label: ", tf.argmax(y_test[r:r + 1], 1).numpy())
print("Prediction: ", prediction[r:r+1].numpy())
