In [14]:
from tensorflow import keras
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint
import os 
import matplotlib.pyplot as plt

In [3]:
# Change the kernel initializer in the hope of speeding up training and avoiding
# exploding or vanishing gradients.
Dense(
    10,
    activation='relu',
    kernel_initializer='he_normal',
)

# A different one: variance scaling with scale=2, based on fan_avg, drawn from a
# uniform distribution.
he_avg_init = keras.initializers.VarianceScaling(
    scale=2.,
    mode='fan_avg',
    distribution='uniform',
)
Dense(
    10,
    activation='sigmoid',
    kernel_initializer=he_avg_init,
)

<tensorflow.python.keras.layers.core.Dense at 0x7f8860da6850>

In [5]:
# Typical performance ranking: SELU > ELU > leaky ReLU > ReLU > tanh > logistic.
# Leaky ReLU is added as its own layer, directly after the Dense layer it follows.

model = keras.models.Sequential()
# ... model input goes before this point ...
model.add(Dense(10, kernel_initializer='he_normal'))
model.add(keras.layers.LeakyReLU(alpha=0.2))
# ... rest of the model goes after this point ...

In [6]:
# SELU needs a specific initializer: pair it with LeCun normal initialization.
layers = Dense(
    10,
    activation='selu',
    kernel_initializer='lecun_normal',
)

In [8]:
# Batch normalization on images: a BatchNormalization layer follows the
# flattened input and each hidden Dense layer.

model = keras.models.Sequential()
model.add(Flatten(input_shape=[28, 28]))
model.add(BatchNormalization())
model.add(Dense(300, activation='elu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(100, activation='elu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(10, activation='softmax'))

In [9]:
# print each layer's output shape and parameter count
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 784)               3136      
_________________________________________________________________
dense_5 (Dense)              (None, 300)               235500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_6 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_7 (Dense)              (None, 10)               

In [12]:
# List every variable of the first BatchNormalization layer (model.layers[1])
# together with its trainable flag.
[
    (variable.name, variable.trainable)
    for variable in model.layers[1].variables
]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [15]:
# When BatchNormalization is placed between a layer and its activation, the
# activation function has to be a separate layer. The hidden Dense layers are
# created with use_bias=False; the BatchNormalization layer that follows each
# of them has its own offset (beta).
model = keras.models.Sequential()
model.add(Flatten(input_shape=[28, 28]))
model.add(BatchNormalization())
model.add(Dense(300, use_bias=False, kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(Dense(100, use_bias=False, kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(Dense(10, activation='softmax'))

In [16]:
# print each layer's output shape and parameter count for the rebuilt model
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 784)               3136      
_________________________________________________________________
dense_8 (Dense)              (None, 300)               235200    
_________________________________________________________________
batch_normalization_4 (Batch (None, 300)               1200      
_________________________________________________________________
activation (Activation)      (None, 300)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 100)               30000     
_________________________________________________________________
batch_normalization_5 (Batch (None, 100)              

In [23]:
# Another approach is gradient clipping, to avoid exploding gradients.
# clipvalue clips every gradient component to [-1.0, 1.0]. Components are clipped
# independently, so the direction of the gradient vector can change.
optimizer = keras.optimizers.SGD(clipvalue=1.0)
model.compile(loss='mse', optimizer=optimizer)

# To clip while preserving direction, use clipnorm on its own: a gradient whose
# L2 norm exceeds 1.0 is rescaled as a whole. It must not be combined with
# clipvalue -- component-wise clipping would change the direction again, and
# recent Keras releases reject optimizers that set more than one clipping argument.
optimizer = keras.optimizers.SGD(clipnorm=1.0)
model.compile(loss='mse', optimizer=optimizer)
# saved so the transfer-learning cell can reload it as 'my_model_A.h5'
model.save('my_model_A.h5')

In [27]:
# Transfer learning, assuming a pretrained model saved as "my_model_A.h5".

model_A = keras.models.load_model('my_model_A.h5')

# Work on a clone so the layers of model A itself are not modified; clone_model
# is followed by an explicit copy of the weights.
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Reuse every layer except the output layer, then append a new single-unit
# sigmoid output layer.
model_B_on_A = keras.models.Sequential(
    [*model_A_clone.layers[:-1], Dense(1, activation='sigmoid')]
)

In [28]:
# Freeze every layer except the output layer to avoid destroying the reused
# weights; the new output layer (the last one) stays trainable.

for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

model_B_on_A.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

In [30]:
# Load MNIST, hold out the first `split_idx` training samples for validation,
# and divide all pixel values by 255.
split_idx = 10000
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

x_valid, x_train = x_train[:split_idx] / 255.0, x_train[split_idx:] / 255.0
y_valid, y_train = y_train[:split_idx], y_train[split_idx:]
x_test = x_test / 255.0

In [33]:
# Train for a few epochs with the reused layers frozen, then unfreeze them and
# continue training.
# NOTE(review): y_train comes straight from the MNIST loader (digit labels), while
# the model ends in a single sigmoid unit trained with binary cross-entropy --
# confirm whether the labels are supposed to be binarized first.

history = model_B_on_A.fit(x_train, y_train, epochs=4, validation_data=(x_valid, y_valid))

# Unfreeze every reused layer. The loop variable has to be the same name the body
# assigns to; otherwise the body only touches a stale `layer` left over from an
# earlier cell and the remaining layers stay frozen.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# Lower the learning rate to avoid destroying the old weights, and compile again
# after changing the trainable flags so the new optimizer and flags are used.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)  # was 1e-2
model_B_on_A.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model_B_on_A.fit(x_train, y_train, epochs=16, validation_data=(x_valid, y_valid))

# this does not work super well for dense DNN, but it will be revisited for CNN

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
