In [None]:
import tensorflow as tf
import numpy as np


# Layer sizes for the MNIST classifier defined further down.
n_inputs = 28 * 28                 # length of one flattened 28x28 image
n_hidden1, n_hidden2 = 300, 100    # units in the first / second hidden Dense layer
n_outputs = 10                     # units in the softmax output layer


In [None]:

# Fetch MNIST through Keras (the archive is downloaded the first time this runs).
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten every 28x28 image into one row of float32 values and divide by 255.
X_train = X_train.reshape(-1, 28 * 28).astype(np.float32) / 255.0
X_test = X_test.reshape(-1, 28 * 28).astype(np.float32) / 255.0

# Labels are stored as int32 class ids.
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

# Hold out the first 5000 training examples as a validation split; the
# remainder stays in X_train / y_train.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
# Two ELU hidden layers, each followed by batch normalization, and a softmax
# output layer. No input shape is declared, so the weights are created the
# first time the model is called on data.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(n_hidden1, activation=tf.nn.elu),
    tf.keras.layers.BatchNormalization(momentum=0.9),
    tf.keras.layers.Dense(n_hidden2, activation=tf.nn.elu),
    tf.keras.layers.BatchNormalization(momentum=0.9),
    tf.keras.layers.Dense(n_outputs, activation=tf.nn.softmax),
])


In [None]:
def shuffle_batch(X, y, batch_size):
    """Yield shuffled mini-batches that together cover ``X`` and ``y`` once.

    The rows are permuted with ``np.random.permutation`` and the permutation
    is cut into ``len(X) // batch_size`` nearly equal parts, so individual
    batches may hold slightly more than ``batch_size`` rows.

    Args:
        X: Array of examples, indexable with an integer index array.
        y: Array of targets aligned with ``X`` along the first axis.
        batch_size: Target number of rows per batch; must be at least 1.

    Yields:
        ``(X_batch, y_batch)`` pairs selected with the same row indices.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    n_samples = len(X)
    if n_samples == 0:
        # Nothing to iterate over; np.array_split cannot split into 0 sections.
        return

    rnd_idx = np.random.permutation(n_samples)
    # A dataset smaller than one batch still produces a single (short) batch
    # instead of asking array_split for zero sections.
    n_batches = max(1, n_samples // batch_size)

    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]

# Step size handed to the SGD optimizer created below.
learning_rate = 0.01

def loss_sparse(labels, logits):
    """Return the batch mean of the sparse categorical cross-entropy.

    Args:
        labels: Integer class ids, one per example.
        logits: Model outputs for the batch. Despite the name, the value is
            forwarded to ``sparse_categorical_crossentropy`` without
            ``from_logits=True``; the callers in this notebook pass the
            output of the model's softmax layer.

    Returns:
        A scalar tensor: the per-example losses reduced with ``tf.reduce_mean``.
    """
    per_example = tf.losses.sparse_categorical_crossentropy(labels, logits)
    return tf.reduce_mean(per_example)

# One SGD optimizer instance, used by the hand-written training steps and also
# passed to model.compile() in the later cells.
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

def run_training(X, y):
    """Apply one optimizer step on the batch ``(X, y)``.

    Uses the module-level ``model``, ``loss_sparse`` and ``optimizer``.

    Args:
        X: Batch of input rows fed to ``model``.
        y: Integer class labels for ``X``.

    Returns:
        The scalar loss tensor computed on this batch before the update.
    """
    with tf.GradientTape() as tape:
        # training=True is required in a hand-written loop: without it the
        # BatchNormalization layers run in inference mode, normalising with
        # moving statistics that are never updated.
        pred = model(X, training=True)
        loss = loss_sparse(labels=y, logits=pred)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss


In [None]:
num_epochs = 20
batch_size = 64

for epoch in range(num_epochs):
    # Sum every mini-batch loss so the printed value describes the whole
    # epoch rather than whichever batch happened to be processed last.
    loss_total = 0.0
    n_steps = 0
    for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size=batch_size):
        loss_total += float(run_training(X_batch, y_batch))
        n_steps += 1
    # NaN (instead of an arbitrary placeholder) if no batch was produced.
    loss = loss_total / n_steps if n_steps else float('nan')
    print('Epoch %d Loss %.4f' % (epoch + 1, loss))

Epoch 1 Loss 0.0035
Epoch 2 Loss 0.0027
Epoch 3 Loss 0.0106
Epoch 4 Loss 0.0063
Epoch 5 Loss 0.0056
Epoch 6 Loss 0.0045
Epoch 7 Loss 0.0020
Epoch 8 Loss 0.0049
Epoch 9 Loss 0.0039
Epoch 10 Loss 0.0328
Epoch 11 Loss 0.0159
Epoch 12 Loss 0.0020
Epoch 13 Loss 0.0027
Epoch 14 Loss 0.0013
Epoch 15 Loss 0.0023
Epoch 16 Loss 0.0035
Epoch 17 Loss 0.0065
Epoch 18 Loss 0.0025
Epoch 19 Loss 0.0097
Epoch 20 Loss 0.0044


In [None]:

# Attach a loss and an accuracy metric so the model trained by the manual loop
# can be scored with Keras, then print its structure and validation results.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()
model.evaluate(X_valid, y_valid)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
batch_normalization (BatchNo (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
Total params: 268,210
Trainable params: 267,410
Non-trainable params: 800
_________________________________________________________________


[0.0704062283039093, 0.9815999865531921]

In [None]:
def run_clipping(X, y, threshold=1.0):
    """Apply one optimizer step on ``(X, y)`` with element-wise gradient clipping.

    Uses the module-level ``model``, ``loss_sparse`` and ``optimizer``.

    Args:
        X: Batch of input rows fed to ``model``.
        y: Integer class labels for ``X``.
        threshold: Every gradient component is clipped to
            ``[-threshold, threshold]`` before the update.

    Returns:
        The scalar loss tensor computed on this batch before the update.
    """
    with tf.GradientTape() as tape:
        # training=True keeps the BatchNormalization layers in training mode
        # (batch statistics used, moving averages updated).
        pred = model(X, training=True)
        loss = loss_sparse(labels=y, logits=pred)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    capped_gvs = [
        (tf.clip_by_value(grad, -threshold, threshold), var)
        for grad, var in zip(gradients, variables)
        if grad is not None  # variables that did not contribute to the loss
    ]
    # Apply the clipped pairs; the raw gradients must not be used here or the
    # clipping above has no effect.
    optimizer.apply_gradients(capped_gvs)
    return loss



In [None]:
num_epochs = 20
batch_size = 64

for epoch in range(num_epochs):
    # Sum every mini-batch loss so the printed value describes the whole
    # epoch rather than whichever batch happened to be processed last.
    loss_total = 0.0
    n_steps = 0
    for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size=batch_size):
        loss_total += float(run_clipping(X_batch, y_batch))
        n_steps += 1
    # NaN (instead of an arbitrary placeholder) if no batch was produced.
    loss = loss_total / n_steps if n_steps else float('nan')
    print('Epoch %d Loss %.4f' % (epoch + 1, loss))

Epoch 1 Loss 0.5080
Epoch 2 Loss 0.1910
Epoch 3 Loss 0.3124
Epoch 4 Loss 0.4402
Epoch 5 Loss 0.0700
Epoch 6 Loss 0.2153
Epoch 7 Loss 0.3704
Epoch 8 Loss 0.1970
Epoch 9 Loss 0.1724
Epoch 10 Loss 0.2472
Epoch 11 Loss 0.2102
Epoch 12 Loss 0.2067
Epoch 13 Loss 0.1124
Epoch 14 Loss 0.1965
Epoch 15 Loss 0.0911
Epoch 16 Loss 0.3126
Epoch 17 Loss 0.1364
Epoch 18 Loss 0.1215
Epoch 19 Loss 0.0974
Epoch 20 Loss 0.1528


In [None]:
# Score the model again after the gradient-clipping epochs. The compile call
# repeats the configuration used in the earlier evaluation cell.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()
model.evaluate(X_valid, y_valid)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
batch_normalization (BatchNo (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
Total params: 268,210
Trainable params: 267,410
Non-trainable params: 800
_________________________________________________________________


[0.1351381242275238, 0.9620000123977661]

In [None]:
# Full-batch variant: each iteration performs a single optimizer step computed
# on the entire training set instead of iterating over mini-batches.
for epoch in range(num_epochs):
    loss = run_training(X_train, y_train)
    # Print 1-based epoch numbers, like the mini-batch loops earlier in the notebook.
    print("Epoch %d, loss %.4f" % (epoch + 1, loss))

Epoch 0, loss 0.1302
Epoch 1, loss 0.1302
Epoch 2, loss 0.1301
Epoch 3, loss 0.1300
Epoch 4, loss 0.1300
Epoch 5, loss 0.1299
Epoch 6, loss 0.1299
Epoch 7, loss 0.1298
Epoch 8, loss 0.1298
Epoch 9, loss 0.1298
Epoch 10, loss 0.1297
Epoch 11, loss 0.1297
Epoch 12, loss 0.1297
Epoch 13, loss 0.1296
Epoch 14, loss 0.1296
Epoch 15, loss 0.1296
Epoch 16, loss 0.1296
Epoch 17, loss 0.1295
Epoch 18, loss 0.1295
Epoch 19, loss 0.1295


In [None]:
# Score the model after the full-batch steps. The compile call repeats the
# configuration used in the earlier evaluation cells.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()
model.evaluate(X_valid, y_valid)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
batch_normalization (BatchNo (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
Total params: 268,210
Trainable params: 267,410
Non-trainable params: 800
_________________________________________________________________


[0.13439272344112396, 0.9617999792098999]

In [None]:
# Evaluate on the validation split once more; no training step runs between
# the previous cell and this one.
model.evaluate(X_valid, y_valid)



[0.13439272344112396, 0.9617999792098999]

In [None]:
# Continue training the same model with Keras' built-in loop (default fit
# settings apart from the epoch count) instead of the hand-written steps.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()
model.fit(X_train, y_train, epochs=20)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
batch_normalization (BatchNo (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
Total params: 268,210
Trainable params: 267,410
Non-trainable params: 800
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epo

<tensorflow.python.keras.callbacks.History at 0x7f754e22f6a0>

In [None]:
# Validation loss and accuracy after the model.fit() run in the previous cell.
model.evaluate(X_valid, y_valid)



[0.0672856867313385, 0.9805999994277954]