In [37]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from functools import partial

In [2]:
# Show the public names exposed by keras.initializers (names starting with "_" are skipped)
list(filter(lambda attr: not attr.startswith("_"), dir(keras.initializers)))

['Constant',
 'GlorotNormal',
 'GlorotUniform',
 'HeNormal',
 'HeUniform',
 'Identity',
 'Initializer',
 'LecunNormal',
 'LecunUniform',
 'Ones',
 'Orthogonal',
 'RandomNormal',
 'RandomUniform',
 'TruncatedNormal',
 'VarianceScaling',
 'Zeros',
 'constant',
 'deserialize',
 'get',
 'glorot_normal',
 'glorot_uniform',
 'he_normal',
 'he_uniform',
 'identity',
 'lecun_normal',
 'lecun_uniform',
 'ones',
 'orthogonal',
 'random_normal',
 'random_uniform',
 'serialize',
 'truncated_normal',
 'variance_scaling',
 'zeros']

In [3]:
# Load Fashion MNIST, divide the pixel values by 255 and hold out the first
# 5,000 training samples as a validation set.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0

X_valid = X_train_full[:5000]
X_train = X_train_full[5000:]
y_valid = y_train_full[:5000]
y_train = y_train_full[5000:]

In [4]:
# Seed both NumPy and TensorFlow so that runs are repeatable
np.random.seed(42)
tf.random.set_seed(42)

In [5]:
# Custom initializer built on VarianceScaling (uniform distribution, fan_avg mode, scale 2)
init = keras.initializers.VarianceScaling(
    scale=2.0,
    mode="fan_avg",
    distribution="uniform",
)

# Leaky ReLU as a standalone layer
leaky_relu = keras.layers.LeakyReLU(alpha=0.2)

# Dense layer using SELU together with LeCun-normal initialization
layer = keras.layers.Dense(
    10,
    activation="selu",
    kernel_initializer="lecun_normal",
)

In [6]:
# Model with He-normal initialization and Leaky ReLU activation layers
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(100, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(10, activation="softmax"),
])

In [7]:
# Compile with plain SGD and train for 10 epochs.
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
tf.random.set_seed(42)
np.random.seed(42)

# Model with LeCun-normal initialization and SELU activation
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax"),
])

# Standardize the inputs per pixel, using statistics of the training set only
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)
X_train_scaled = (X_train - pixel_means) / pixel_stds
X_valid_scaled = (X_valid - pixel_means) / pixel_stds
X_test_scaled = (X_test - pixel_means) / pixel_stds

# Compile the model (`learning_rate` replaces the deprecated `lr` alias)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)

# Train on the standardized inputs
history = model.fit(X_train_scaled, y_train, epochs=10, validation_data=(X_valid_scaled, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Model with PReLU activation layers and He-normal initialization

tf.random.set_seed(42)
np.random.seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal"),
    keras.layers.PReLU(),
    keras.layers.Dense(100, kernel_initializer="he_normal"),
    keras.layers.PReLU(),
    keras.layers.Dense(10, activation="softmax"),
])

# `learning_rate` replaces the deprecated `lr` alias
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# ReLU activation with He-normal initialization
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

# `learning_rate` replaces the deprecated `lr` alias
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
## Training with Batch Normalization after the activation function
# A BatchNormalization layer follows the Flatten layer and each hidden Dense
# layer (whose ELU activation is applied inside the Dense layer itself).
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(10, activation="softmax"),
])

# `learning_rate` replaces the deprecated `lr` alias
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# To sum up, each batch-normalized layer holds four parameter vectors:
#   * gamma (the output scale vector) and beta (the output offset vector) are
#     learned through regular backpropagation;
#   * moving_mean (the final input mean vector) and moving_variance (the final
#     input variance vector) are estimated using an exponential moving average
#     and are the statistics used for prediction.
# List the name and trainable flag of every variable of the first BN layer.
[(variable.name, variable.trainable) for variable in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [13]:
## Training with Batch Normalization before the activation function
# Each hidden block is Dense (no activation, no bias) -> BatchNormalization -> ELU.
# The Dense bias is dropped because the BatchNormalization layer that follows
# has its own offset (beta) parameter.
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    # BN must come before the activation here too (the two layers were swapped)
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),
    keras.layers.Dense(10, activation="softmax"),
])

# `learning_rate` replaces the deprecated `lr` alias
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Gradient clipping with clipvalue (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.SGD(learning_rate=1e-3, clipvalue=1.0)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Gradient clipping with clipnorm (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.SGD(learning_rate=1e-3, clipnorm=1.0)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
## Reusing a pretrained model - create the datasets for base task A and new task B


def split_dataset(X, y):
    """Split a labelled dataset into task A (8 classes) and task B (binary).

    Task B keeps only the samples labelled 5 or 6 (sandals or shirts) and its
    target is 1.0 when the label is 6 (shirt), else 0.0. Task A keeps all the
    other samples; labels 7, 8, 9 are shifted down to 5, 6, 7 so that the
    task-A labels are contiguous.

    Args:
        X: array of samples, indexed along its first axis.
        y: NumPy array of integer class labels, one per sample.

    Returns:
        ((X_A, y_A), (X_B, y_B)); y_B is a float32 array. `y` is not modified.
    """
    belongs_to_b = np.logical_or(y == 5, y == 6)
    belongs_to_a = np.logical_not(belongs_to_b)

    # Boolean indexing returns a copy, so the in-place shift leaves `y` untouched
    labels_a = y[belongs_to_a]
    labels_a[labels_a > 6] -= 2

    labels_b = (y[belongs_to_b] == 6).astype(np.float32)
    return ((X[belongs_to_a], labels_a), (X[belongs_to_b], labels_b))

# Split the training, validation and test sets into their task-A and task-B parts
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)

# Keep only the first 200 task-B training samples
X_train_B, y_train_B = X_train_B[:200], y_train_B[:200]

In [17]:
# Train base model A (8-class task)
tf.random.set_seed(42)
np.random.seed(42)

model_A = keras.models.Sequential()
model_A.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_A.add(keras.layers.Dense(n_hidden, activation="selu"))
model_A.add(keras.layers.Dense(8, activation="softmax"))

# `learning_rate` replaces the deprecated `lr` alias
model_A.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)

history = model_A.fit(X_train_A, y_train_A, epochs=20, validation_data=(X_valid_A, y_valid_A))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
# Persist model A to disk so its layers can be reloaded later
model_A.save(filepath="my_model_A.h5")

In [19]:
### Training Model B from scratch (binary task, same hidden architecture as model A)
model_B = keras.models.Sequential()
model_B.add(keras.layers.Flatten(input_shape=[28, 28]))

for n_hidden in (300, 100, 50, 50, 50):
    model_B.add(keras.layers.Dense(n_hidden, activation="selu"))

model_B.add(keras.layers.Dense(1, activation="sigmoid"))

# `learning_rate` replaces the deprecated `lr` alias
model_B.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)

history = model_B.fit(X_train_B, y_train_B, epochs=20, validation_data=(X_valid_B, y_valid_B))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
# Training Model B by reusing layers from Model A

# Reuse every layer of model A except its output layer, then add a new binary output
model_A = keras.models.load_model("my_model_A.h5")
model_B_on_A = keras.models.Sequential(model_A.layers[:-1])
model_B_on_A.add(keras.layers.Dense(1, activation="sigmoid"))

# model_B_on_A shares its layers with model_A, so training it also changes
# model_A; keep an untouched copy of model A (same architecture and weights).
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Freeze the reused layers so that only the new output layer is trained at first
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# The model must be compiled after changing `trainable`.
# `learning_rate` replaces the deprecated `lr` alias.
model_B_on_A.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)

In [21]:
# Phase 1: train only the new output layer (the reused layers are frozen)
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4, validation_data=(X_valid_B, y_valid_B))

# Phase 2: unfreeze the reused layers and fine-tune the whole network
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# Recompile so the change of `trainable` takes effect.
# `learning_rate` replaces the deprecated `lr` alias.
model_B_on_A.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)

history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16, validation_data=(X_valid_B, y_valid_B))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [22]:
# Test-set loss and accuracy of model B trained from scratch
model_B.evaluate(x=X_test_B, y=y_test_B)



[0.1408407837152481, 0.9704999923706055]

In [23]:
# Test-set loss and accuracy of model B built on the layers reused from model A
model_B_on_A.evaluate(x=X_test_B, y=y_test_B)



[0.06832191348075867, 0.9929999709129333]

In [24]:
# Momentum optimization
np.random.seed(42)
tf.random.set_seed(42)

# `learning_rate` replaces the deprecated `lr` alias
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Nesterov Accelerated Gradient (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Adagrad optimization (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.Adagrad(learning_rate=0.001)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# RMSProp (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Adam optimization (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
# Adamax optimization (`learning_rate` replaces the deprecated `lr` alias)
optimizer = keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Nadam optimization (`learning_rate` replaces the deprecated `lr` alias)

optimizer = keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
# Learning Rate Schedule - Power Scheduling
#
#     lr = lr0 / (1 + steps / s)**c
#
# Keras uses c = 1 and s = 1 / decay.

# `learning_rate` replaces the deprecated `lr` alias; `decay` is kept as-is
optimizer = keras.optimizers.SGD(learning_rate=0.01, decay=1e-4)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
## Learning Schedule - Exponential decay

def exponential_decay(lr0, s):
    """Build an epoch -> learning-rate function with exponential decay.

    Args:
        lr0: learning rate returned for epoch 0.
        s: number of epochs over which the learning rate is divided by 10.

    Returns:
        A function of one argument (the epoch index) that returns
        lr0 * 0.1**(epoch / s).
    """
    def schedule(epoch):
        exponent = epoch / s
        return lr0 * 0.1**exponent

    return schedule

# Schedule starting at 0.01 and dividing the learning rate by 10 every 20 epochs
exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)

# The callback calls the schedule function with the epoch index
lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    validation_data=(X_valid_scaled, y_valid),
    callbacks=[lr_scheduler],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
# Exponential decay that updates the learning rate at every batch (training step) instead of once per epoch

K = keras.backend

class ExponentialDecay(keras.callbacks.Callback):
    """Callback that decays the optimizer's learning rate at every batch.

    Before each batch the current learning rate is multiplied by
    0.1**(1 / s), so it is divided by 10 every `s` batches.

    Args:
        s: number of batches (training steps) over which the learning rate
            is divided by 10.
    """

    def __init__(self, s=40000):
        super().__init__()
        self.s = s

    def on_batch_begin(self, batch, logs=None):
        # Note: the `batch` argument is reset at each epoch
        lr = K.get_value(self.model.optimizer.lr)
        # Use the instance's own `s`; reading a global `s` would ignore the
        # constructor argument and depend on notebook state.
        K.set_value(self.model.optimizer.lr, lr * 0.1**(1 / self.s))

    def on_epoch_end(self, epoch, logs=None):
        # Record the current learning rate in the epoch logs
        logs = logs or {}
        logs['lr'] = K.get_value(self.model.optimizer.lr)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax"),
])

lr0 = 0.01
# `learning_rate` replaces the deprecated `lr` alias
optimizer = keras.optimizers.Nadam(learning_rate=lr0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
n_epochs = 10

s = 20 * len(X_train) // 32 # number of steps in 20 epochs (batch size = 32)

# Divide the learning rate by 10 every `s` training steps
exp_decay = ExponentialDecay(s)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid_scaled, y_valid),
    callbacks=[exp_decay],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# Piece Wise constant learning rate
def piecewise_constant_fn(epoch):
    """Return the learning rate to use for the given epoch index.

    The schedule is piecewise constant: 0.01 for epochs 0-4, 0.005 for
    epochs 5-14 and 0.001 from epoch 15 onwards.

    Args:
        epoch: epoch index.

    Returns:
        The learning rate for that epoch (always strictly positive).
    """
    if epoch < 5:
        return 0.01
    elif epoch < 15:
        return 0.005
    else:
        # Was 0.00: a zero learning rate would stop the weights from updating
        return 0.001
    
    
# The callback calls piecewise_constant_fn with the epoch index
lr_scheduler = keras.callbacks.LearningRateScheduler(piecewise_constant_fn)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax"),
])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)
n_epochs = 10
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid_scaled, y_valid),
    callbacks=[lr_scheduler],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Performance scheduling - change the learning rate when the monitored metric stops improving
tf.random.set_seed(42)
np.random.seed(42)

# Reduce learning rate when a metric has stopped improving.
# factor: factor by which the learning rate will be reduced. new_lr = lr * factor
# patience: number of epochs without improvement before the reduction is applied
lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax"),
])

# `learning_rate` replaces the deprecated `lr` alias
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum=0.9)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

n_epochs = 10
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid_scaled, y_valid),
    callbacks=[lr_scheduler],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# Learning-rate scheduling the easy way: hand a schedule object to the optimizer

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax"),
])

# This approach updates the learning rate at each step rather than at each epoch
s = 20 * len(X_train) // 32  # number of steps in 20 epochs (batch size = 32)

# Exponential decay schedule:
#   initial_learning_rate * decay_rate ^ (step / decay_steps)
learning_rate = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=s,
    decay_rate=0.1,
)

# The optimizer takes the schedule in place of a constant learning rate
optimizer = keras.optimizers.SGD(learning_rate)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

n_epochs = 10

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid_scaled, y_valid),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
#### L1 & L2 Regularization (this example applies an L2 penalty of 0.01 to the kernels)

# functools.partial pre-fills the arguments shared by every layer so they are
# not repeated; individual calls can still override them.
RegularizedDense = partial(
    keras.layers.Dense,
    activation="elu",
    kernel_initializer="he_normal",
    kernel_regularizer=keras.regularizers.l2(0.01),
)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    RegularizedDense(300),
    RegularizedDense(100),
    # The output layer overrides the activation and the initializer
    RegularizedDense(10, activation="softmax", kernel_initializer="glorot_uniform"),
])

optimizer = keras.optimizers.SGD()
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

n_epochs = 3
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid_scaled, y_valid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [39]:
### Dropout regularization: a Dropout layer (rate 0.2) precedes every Dense layer

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(10, activation="softmax"),
])

optimizer = keras.optimizers.SGD()
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

n_epochs = 3
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid_scaled, y_valid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
