# Training DNN

## 1. Xavier and He initialization

In [None]:
%%time
# Datasets => model => loss => train => eval => Session().train+eval+test+log
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
import numpy as np

# Network dimensions and training hyper-parameters.
n_inputs = 28 * 28  # one flattened 28x28 image per row
n_hidden1 = 300
n_hidden2 = 100
n_output = 10
n_epoch = 40
batch_size = 64
lr = 0.01

# Graph inputs. The label placeholder is declared as a 1-D tensor with
# `(None,)`; the expression `(None)` is plain `None`, which would leave the
# shape (including the rank) completely unspecified.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None,), name='y')

with tf.name_scope('dnn'):
    # NOTE(review): the initializer is created with its default arguments;
    # confirm that those defaults give the scaling intended by the name
    # `he_init` (He initialization is usually requested with scale=2.0).
    he_init = tf.variance_scaling_initializer()
    hidden1 = tf.layers.dense(
        inputs=X, units=n_hidden1, activation=tf.nn.selu,
        kernel_initializer=he_init, name='hidden1')
    hidden2 = tf.layers.dense(
        inputs=hidden1, units=n_hidden2, activation=tf.nn.selu,
        kernel_initializer=he_init, name='hidden2')
    # Output layer produces raw logits (no activation).
    logists = tf.layers.dense(inputs=hidden2, units=n_output, name='output')

with tf.name_scope('loss'):
    # Per-example cross-entropy from integer labels, averaged over the batch.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logists)
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    # An example is correct when its label is the top-1 logit.
    correct = tf.nn.in_top_k(predictions=logists, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

saver = tf.train.Saver()
init = tf.global_variables_initializer()

# Load MNIST through Keras as (train, test) pairs of NumPy arrays.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten every image to a float32 row of length n_inputs and divide by 255.
X_train = X_train.astype(np.float32).reshape(-1, n_inputs) / 255.
X_test = X_test.astype(np.float32).reshape(-1, n_inputs) / 255.

# Labels are used as int32 class indices.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# Hold out the first 5000 training examples as the validation set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

def shuffle_batch(X, y, batch_size):
    """Yield randomly shuffled mini-batches of ``(X, y)``.

    The sample indices are permuted once and split into
    ``len(X) // batch_size`` nearly equal chunks with ``np.array_split``,
    so every sample is yielded exactly once per call.

    Args:
        X: Array of features, indexable by an integer index array.
        y: Array of labels aligned with ``X`` along the first axis.
        batch_size: Target number of samples per batch.

    Yields:
        ``(X_batch, y_batch)`` tuples.

    When ``X`` holds fewer than ``batch_size`` samples, a single batch with
    all samples is yielded; when ``X`` is empty, nothing is yielded.
    """
    n_samples = len(X)
    if n_samples == 0:
        return
    rnd_idx = np.random.permutation(n_samples)
    # np.array_split rejects 0 sections, so always produce at least one batch.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]
        
# Per-feature statistics of the training set; the small epsilon keeps the
# division defined for features whose standard deviation is zero.
X_train_means = X_train.mean(axis=0, keepdims=True)
X_train_stds = X_train.std(axis=0, keepdims=True) + 1e-12

# Validation and test sets are standardised with the *training* statistics.
X_val_scaled = (X_valid - X_train_means) / X_train_stds
X_test_scaled = (X_test - X_train_means) / X_train_stds

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epoch):
        # One pass over the shuffled training set, standardising batch by batch.
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            X_batch_scaled = (X_batch - X_train_means) / X_train_stds
            sess.run(training_op, feed_dict={X: X_batch_scaled, y: y_batch})

        # Report only every fifth epoch.
        if epoch % 5 != 0:
            continue
        # Accuracy on the last training batch of the epoch and on the validation set.
        acc_batch = sess.run(accuracy, feed_dict={X: X_batch_scaled, y: y_batch})
        acc_valid = sess.run(accuracy, feed_dict={X: X_val_scaled, y: y_valid})
        print(f'{epoch} Batch acc: {acc_batch}; Valid acc: {acc_valid}')

    # Final evaluation on the test set.
    acc_test = sess.run(accuracy, feed_dict={X: X_test_scaled, y: y_test})
    print(f'Test acc: {acc_test}')

In [None]:
%%skip
"使用TF dataset APIs 并行处理数据和计算，替代传统的feed-dict方式"
# importing Data => create Iterator => consuming Data

import tensorflow as tf
import numpy as np
import time, os, sys
import matplotlib.pylab as plt

n_input = 28 * 28  # one flattened 28x28 image per row

(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten the images to float32 rows; labels become int32 class indices.
X_train = X_train.astype(np.float32).reshape(-1, n_input)
X_test = X_test.astype(np.float32).reshape(-1, n_input)

y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
# Stack the train and test splits into one feature array and one label array.
datasetX, datasety = np.r_[X_train, X_test], np.r_[y_train, y_test]

"从numpy中获取数据"
# Build the dataset directly from the in-memory NumPy arrays.
dataset = tf.data.Dataset.from_tensor_slices((datasetX, datasety))

"四种迭代器"
"1. One shot迭代器"
# Four kinds of iterators. 1. One-shot iterator.
# The iterator must be bound on its own: a trailing string literal after a
# comma would turn the right-hand side into a tuple, and `get_next()` would
# then fail. Call get_next() to obtain the tensors for the next element.
# (`iter` shadows the builtin; the name is kept as in the original notebook.)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()

tf.reset_default_graph()
"2. 可初始化迭代器"
# 2. Initializable iterator.
# The label placeholder is declared 1-D with `(None,)`; `(None)` is plain
# `None` and would leave the shape unspecified.
X = tf.placeholder(dtype=np.float32, shape=(None, n_input), name='X')
y = tf.placeholder(dtype=np.int32, shape=(None,), name='y')

"从占位符中获取数据，使用可初始化迭代器喂数据"
# The dataset reads from the placeholders; the actual arrays are supplied
# when the iterator's initializer is run.
dataset = tf.data.Dataset.from_tensor_slices((X, y))
init_iter = dataset.make_initializable_iterator()
features, labels = init_iter.get_next()
with tf.Session() as sess:
    # Initialise the iterator with the concrete data.
    sess.run(init_iter.initializer, feed_dict={X: datasetX, y: datasety})
    # Advance the iterator once to fetch a single (image, label) element.
    mnist_img, minst_label = sess.run([features, labels])
    plt.imshow(X=mnist_img.reshape(28, 28), cmap='gray')
    print(f'label of following mnist image is {minst_label}')
    
"3. 可重初始化的迭代器"
# 3. Reinitializable iterator, built on a fresh graph.
tf.reset_default_graph()
# The label placeholder is declared 1-D with `(None,)`; `(None)` is plain
# `None` and would leave the shape unspecified.
X = tf.placeholder(dtype=np.float32, shape=(None, n_input), name='X')
y = tf.placeholder(dtype=np.int32, shape=(None,), name='y')

dataset = tf.data.Dataset.from_tensor_slices((X, y))
# Alternative: build a separate dataset from (X_test, y_test) and give it
# its own initializer via iter.make_initializer(...).

"创建通用性迭代器，该迭代器无initializer方法，预初始化迭代器应使用Iterator.make_initializer(dataset)生成可重用迭代器"
# Create a generic iterator from the dataset's structure (types and shapes).
# It is initialised through the op returned by make_initializer(dataset),
# which can be run repeatedly.
iter = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)

features, labels = iter.get_next()  # tensors yielding the next element

"多次初始化 init_op"
# Initialisation op that may be run several times with different feeds.
train_init_op = iter.make_initializer(dataset)

with tf.Session() as sess:
    # Point the iterator at the training data.
    sess.run(train_init_op, feed_dict={X: X_train, y: y_train})
    mnist_img, minst_label = sess.run([features, labels])
    print(f'label of following Train mnist image is {minst_label}')
    assert minst_label == 5

    # Re-run the same initializer with the test arrays to switch datasets.
    sess.run(train_init_op, feed_dict={X: X_test, y: y_test})
    mnist_img, minst_label = sess.run([features, labels])
    print(f'label of following Test mnist image is {minst_label}')
    assert minst_label == 7


In [1]:
%%time
# tf.data: feed the full training set in batches; each batch is standardized manually
import tensorflow as tf
import numpy as np

n_input = 28 * 28  # one flattened 28x28 image per row

# Load MNIST through Keras as (train, test) pairs of NumPy arrays.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten every image to a float32 row of length n_input and divide by 255.
X_train = X_train.astype(np.float32).reshape(-1, n_input) / 255.
X_test = X_test.astype(np.float32).reshape(-1, n_input) / 255.

# Labels are used as int32 class indices.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# Hold out the first 5000 training examples as the validation set.
X_val, y_val = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

# Network dimensions and training hyper-parameters.
n_hidden1 = 300
n_hidden2 = 100
n_output = 10
lr = 0.01
n_epoch = 40
batch_size = 64
# Number of full batches per epoch (any remainder is dropped).
n_batch = X_train.shape[0] // batch_size

# Graph inputs. The label placeholder is declared as a 1-D tensor with
# `(None,)`; the expression `(None)` is plain `None`, which would leave the
# shape (including the rank) completely unspecified.
X = tf.placeholder(tf.float32, shape=(None, n_input), name='X')
y = tf.placeholder(tf.int32, shape=(None,), name='y')

with tf.name_scope('dnn'):
    # NOTE(review): the initializer is created with its default arguments;
    # confirm that those defaults give the scaling intended by the name
    # `he_init` (He initialization is usually requested with scale=2.0).
    he_init = tf.variance_scaling_initializer()
    hidden1 = tf.layers.dense(
        inputs=X, units=n_hidden1, activation=tf.nn.selu,
        kernel_initializer=he_init, name='hidden1')
    hidden2 = tf.layers.dense(
        inputs=hidden1, units=n_hidden2, activation=tf.nn.selu,
        kernel_initializer=he_init, name='hidden2')
    # Output layer produces raw logits (no activation).
    logits = tf.layers.dense(inputs=hidden2, units=n_output, name='logits')

with tf.name_scope('loss'):
    # Per-example cross-entropy from integer labels, averaged over the batch.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    # An example is correct when its label is the top-1 logit.
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
    
# Input pipeline fed from the X / y placeholders.
# Shuffle individual examples *before* batching so that every epoch draws
# differently composed mini-batches; batching first would only reorder a
# fixed set of batches. The dataset is finite (no repeat()), so the iterator
# has to be re-initialised at the start of each epoch.
dataset = (
    tf.data.Dataset.from_tensor_slices((X, y))
    .shuffle(X_train.shape[0])
    .batch(batch_size)
)
# Reinitializable iterator defined only by the dataset's types and shapes.
iter = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
features, labels = iter.get_next()  # tensors yielding the next batch
init_op = iter.make_initializer(dataset)  # op that (re)binds the iterator to `dataset`

# Per-feature statistics of the training set; the small epsilon keeps the
# division defined for features whose standard deviation is zero.
X_train_means = X_train.mean(axis=0, keepdims=True)
X_train_stds = X_train.std(axis=0, keepdims=True) + 1e-12


def data_scaled(batch):
    """Standardise `batch` with the training-set mean and standard deviation."""
    return (batch - X_train_means) / X_train_stds


# Validation and test sets are standardised once, up front.
X_val_scaled = data_scaled(X_val)
X_test_scaled = data_scaled(X_test)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epoch):
        # (Re)initialise the iterator from the training arrays for this epoch.
        sess.run(init_op, feed_dict={X: X_train, y: y_train})
        for _ in range(n_batch):
            # Pull the next raw batch out of the tf.data pipeline ...
            features_, labels_ = sess.run([features, labels])
            # ... standardise it here, per batch, instead of scaling X_train up front ...
            features_scaled = data_scaled(features_)
            # ... and feed it back in for one gradient step.
            sess.run(training_op, feed_dict={X: features_scaled, y: labels_})

        # Report only every fifth epoch.
        if epoch % 5 != 0:
            continue
        # Accuracy on the last training batch of the epoch and on the validation set.
        acc_batch = sess.run(accuracy, feed_dict={X: features_scaled, y: labels_})
        acc_valid = sess.run(accuracy, feed_dict={X: X_val_scaled, y: y_val})
        print(f'{epoch} Batch acc is {acc_batch}; Val acc is {acc_valid}')

    # Final evaluation on the test set.
    acc_test = sess.run(accuracy, feed_dict={X: X_test_scaled, y: y_test})
    print(f'Test acc is {acc_test}')

0 Batch acc is 0.96875; Val acc is 0.9168000221252441
5 Batch acc is 0.953125; Val acc is 0.9513999819755554
10 Batch acc is 0.96875; Val acc is 0.9624000191688538
15 Batch acc is 0.984375; Val acc is 0.9648000001907349
20 Batch acc is 1.0; Val acc is 0.9666000008583069
25 Batch acc is 1.0; Val acc is 0.9675999879837036
30 Batch acc is 1.0; Val acc is 0.9684000015258789
35 Batch acc is 1.0; Val acc is 0.9693999886512756
Test acc is 0.972000002861023
CPU times: user 9min 30s, sys: 35 s, total: 10min 5s
Wall time: 4min 19s


## 2. Nonsaturating Activation Functions

SELU, ELU, ReLU

## 3. Batch Normalization

使用 BN 时通常不必再对训练集做标准化（standardization）

In [1]:
import tensorflow as tf
import numpy as np

# Start from an empty default graph for the batch-normalization example.
tf.reset_default_graph()
n_inputs = 28 * 28  # one flattened 28x28 image per row
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

# Graph inputs. The label placeholder is declared 1-D with `(None,)`;
# `(None)` is plain `None` and would leave the shape unspecified.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None,), name='y')

# Scalar flag passed to the batch-normalization layers; it evaluates to
# False unless a value is fed explicitly.
training = tf.placeholder_with_default(input=False, shape=(), name='training')

In [5]:
# First hidden layer: dense -> batch normalization -> ELU.
# Batch normalization is applied before the activation function.
hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, name='hidden1')
bn1 = tf.layers.batch_normalization(
    inputs=hidden1,
    training=training,
    momentum=0.9,
)
bn1_act = tf.nn.elu(bn1)

# Second hidden layer: same dense -> batch normalization -> ELU pattern.
hidden2 = tf.layers.dense(inputs=bn1_act, units=n_hidden2, name='hidden2')
bn2 = tf.layers.batch_normalization(
    inputs=hidden2,
    training=training,
    momentum=0.9,
)
bn2_act = tf.nn.elu(bn2)

# Output (softmax classification) layer: dense -> batch normalization.
# No activation function is applied on the last layer.
logists_before_bn = tf.layers.dense(inputs=bn2_act, units=n_outputs, name='logists')
logists = tf.layers.batch_normalization(
    inputs=logists_before_bn,
    training=training,
    momentum=0.9,
)

In [None]:
from functools import partial

# Same network as the explicit version, with the repeated batch-normalization
# arguments bound once through functools.partial.
# NOTE(review): the dense layers reuse the names 'hidden1', 'hidden2' and
# 'logists' from the previous cell; presumably this cell is meant to run on
# a fresh graph rather than after it -- confirm.
batch_norm_layer = partial(
    tf.layers.batch_normalization,
    training=training,
    momentum=0.9,
)

# First hidden layer: dense -> batch normalization -> ELU.
hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, name='hidden1')
bn1 = batch_norm_layer(hidden1)
bn1_act = tf.nn.elu(bn1)

# Second hidden layer: dense -> batch normalization -> ELU.
hidden2 = tf.layers.dense(inputs=bn1_act, units=n_hidden2, name='hidden2')
bn2 = batch_norm_layer(hidden2)
bn2_act = tf.nn.elu(bn2)

# Output layer: dense -> batch normalization, with no activation afterwards.
logists_before_bn = tf.layers.dense(inputs=bn2_act, units=n_outputs, name='logists')
logists = batch_norm_layer(logists_before_bn)

