## Vanishing/Exploding Gradients

In [1]:
from tensorflow import keras

In [2]:
# ReLU activation paired with He-normal kernel initialization.
keras.layers.Dense(
    units=10,
    activation="relu",
    kernel_initializer="he_normal",
)

<keras.layers.core.dense.Dense at 0x1f812ceb670>

In [4]:
# Variance-scaling initializer with scale 2, normalized by the fan average
# (mode="fan_avg") and sampled from a uniform distribution.
he_avg_init = keras.initializers.VarianceScaling(
    scale=2,
    mode="fan_avg",
    distribution="uniform",
)
# The initializer object is passed directly instead of a string name.
keras.layers.Dense(
    units=10,
    activation="relu",
    kernel_initializer=he_avg_init,
)

<keras.layers.core.dense.Dense at 0x1f812dadb50>

In [5]:
# Leaky ReLU is supplied as a layer instance (alpha=0.2) rather than by name.
leaky_relu = keras.layers.LeakyReLU(alpha=0.2)
keras.layers.Dense(
    units=10,
    activation=leaky_relu,
    kernel_initializer="he_normal",
)

<keras.layers.core.dense.Dense at 0x1f812dadd30>

In [6]:
# SELU activation paired with LeCun-normal kernel initialization.
keras.layers.Dense(
    units=10,
    activation="selu",
    kernel_initializer="lecun_normal",
)

<keras.layers.core.dense.Dense at 0x1f812dad970>

In [8]:
# Classifier for 28x28 inputs with a BatchNormalization layer in front of
# every Dense layer; the hidden layers apply ELU inside the Dense layer itself.
model = keras.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(10, activation="softmax"))

In [9]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 784)               0         
                                                                 
 batch_normalization_3 (Batc  (None, 784)              3136      
 hNormalization)                                                 
                                                                 
 dense_7 (Dense)             (None, 300)               235500    
                                                                 
 batch_normalization_4 (Batc  (None, 300)              1200      
 hNormalization)                                                 
                                                                 
 dense_8 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_5 (Batc  (None, 100)             

In [10]:
# Name and trainable flag of each variable held by layer 1, the first
# BatchNormalization layer (layer 0 is the Flatten layer).
[(v.name, v.trainable) for v in model.layers[1].variables]

[('batch_normalization_3/gamma:0', True),
 ('batch_normalization_3/beta:0', True),
 ('batch_normalization_3/moving_mean:0', False),
 ('batch_normalization_3/moving_variance:0', False)]

In [11]:
# Inspect the `updates` attribute of the first BatchNormalization layer.
# NOTE(review): the recorded output is an empty list preceded by what looks
# like a warning echo — `updates` may be deprecated in this Keras version; confirm.
model.layers[1].updates

  model.layers[1].updates


[]

In [2]:
from tensorflow import keras

In [3]:
# Same architecture, but BatchNormalization now sits between each hidden
# Dense layer and its ELU activation. The hidden Dense layers are created
# without a bias term (use_bias=False).
model = keras.Sequential(
    [
        keras.layers.Flatten(input_shape=[28, 28]),
        keras.layers.BatchNormalization(),
        # Hidden block 1: linear -> batch norm -> ELU
        keras.layers.Dense(units=300, kernel_initializer="he_normal", use_bias=False),
        keras.layers.BatchNormalization(),
        keras.layers.Activation("elu"),
        # Hidden block 2: linear -> batch norm -> ELU
        keras.layers.Dense(units=100, kernel_initializer="he_normal", use_bias=False),
        keras.layers.BatchNormalization(),
        keras.layers.Activation("elu"),
        # Output layer
        keras.layers.Dense(units=10, activation="softmax"),
    ]
)

In [4]:
# Gradient clipping: clipvalue=1.0 clips each gradient component to [-1.0, 1.0].
optimizer = keras.optimizers.SGD(clipvalue=1.0)
# `model` ends in a 10-unit softmax layer, so it is compiled with a
# classification loss (the same one used for the softmax classifier model_A
# later in this notebook) rather than "mse".
# NOTE(review): assumes integer class labels, as with Fashion-MNIST — confirm.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)

## Reusing Pretrained Layers

In [54]:
from tensorflow import keras
import numpy as np

def get_data_without_5_6(data, labels):
    """Drop every sample labelled 5 or 6 and renumber the remaining labels.

    Labels greater than 5 that survive the filter are shifted down by 2, so
    with labels 0-9 the result uses the contiguous range 0-7
    (0-4 unchanged, 7 -> 5, 8 -> 6, 9 -> 7).

    Args:
        data: Array-like of samples, indexed along its first axis.
        labels: Array-like of integer class labels, one per sample.

    Returns:
        Tuple ``(new_data, new_labels)`` of NumPy arrays with the kept samples
        and their renumbered labels, in the original order. When nothing is
        kept, ``new_data`` still has the per-sample shape of ``data``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    keep = (labels != 5) & (labels != 6)
    # Boolean-mask indexing returns a copy, so the in-place shift below never
    # modifies the caller's label array.
    new_labels = labels[keep]
    new_labels[new_labels > 5] -= 2
    return data[keep], new_labels
        
# Load Fashion-MNIST and build task A from every class except 5 and 6.
(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_A, y_train_A = get_data_without_5_6(X_train, y_train)
X_test_A, y_test_A = get_data_without_5_6(X_test, y_test)

# The first 30000 filtered samples are used for training and the rest for
# validation. The validation slices are taken first because the training
# names are rebound on the following line.
X_val_A, y_val_A = X_train_A[30000:], y_train_A[30000:]
X_train_A, y_train_A = X_train_A[:30000], y_train_A[:30000]

In [55]:


# Task-A classifier: 8 output units, one per class left after removing
# classes 5 and 6 (labels are renumbered to 0-7 by get_data_without_5_6).
model_A = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(8, activation="softmax"),
])
# `metrics` is passed as a list, matching the model_B_on_A compile call in
# this notebook and the documented form of the argument.
model_A.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="sgd",
    metrics=["accuracy"],
)
# Train for up to 100 epochs; stop once val_loss has not improved for 10
# epochs and restore the weights from the best epoch.
model_A.fit(
    X_train_A,
    y_train_A,
    epochs=100,
    validation_data=(X_val_A, y_val_A),
    callbacks=[
        keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=10,
            restore_best_weights=True,
        )
    ],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


<keras.callbacks.History at 0x2a46bfaf5e0>

In [30]:
# Evaluate model_A on the task-A test split; returns [loss, accuracy].
model_A.evaluate(X_test_A, y_test_A)



[0.2615906894207001, 0.918624997138977]

In [44]:
def get_data_withonly_5_6(data, labels):
    """Keep only samples labelled 5 or 6 and relabel them as a binary target.

    Class 5 becomes 1 and class 6 becomes 0.

    Args:
        data: Array-like of samples, indexed along its first axis.
        labels: Array-like of integer class labels, one per sample.

    Returns:
        Tuple ``(new_data, new_labels)`` of NumPy arrays with the kept samples
        and their 0/1 labels, in the original order. When nothing is kept,
        ``new_data`` still has the per-sample shape of ``data``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    keep = (labels == 5) | (labels == 6)
    # Among the kept samples, "is class 5" is exactly the binary target.
    new_labels = (labels[keep] == 5).astype(int)
    return data[keep], new_labels

# Task B: binary problem built from classes 5 and 6 only.
X_train_B, y_train_B = get_data_withonly_5_6(X_train, y_train)
X_test_B, y_test_B = get_data_withonly_5_6(X_test, y_test)

# The first 10000 samples are used for training and the rest for validation.
# The validation slices are taken first because the training names are
# rebound on the following line.
X_val_B, y_val_B = X_train_B[10000:], y_train_B[10000:]
X_train_B, y_train_B = X_train_B[:10000], y_train_B[:10000]

In [45]:
# Task-B baseline trained from scratch: a binary classifier with a single
# sigmoid output unit.
model_B = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation="sigmoid"),
])
# `metrics` is passed as a list, matching the model_B_on_A compile call in
# this notebook and the documented form of the argument.
model_B.compile(
    loss="binary_crossentropy",
    optimizer="sgd",
    metrics=["accuracy"],
)
# Train for up to 100 epochs; stop once val_loss has not improved for 10
# epochs and restore the weights from the best epoch.
model_B.fit(
    X_train_B,
    y_train_B,
    epochs=100,
    validation_data=(X_val_B, y_val_B),
    callbacks=[
        keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=10,
            restore_best_weights=True,
        )
    ],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


<keras.callbacks.History at 0x2a46bd6a1c0>

In [46]:
# Evaluate the from-scratch model_B on the task-B test split; returns [loss, accuracy].
model_B.evaluate(X_test_B, y_test_B)



[0.005876780953258276, 0.9980000257492065]

In [56]:
# Evaluate model_A on the task-A test split again; returns [loss, accuracy].
# NOTE(review): the execution count shows this ran after model_A was retrained
# (In [55]), so the numbers differ from the earlier evaluate cell.
model_A.evaluate(X_test_A, y_test_A)



[0.25178515911102295, 0.9257500171661377]

In [58]:
# Transfer learning: reuse every layer of model_A except its output layer.
# The list holds model_A's own layer objects (not copies), so the two models
# share those layers. A fresh single-unit sigmoid layer is added for task B.
model_B_on_A = keras.Sequential(layers=model_A.layers[:-1])
model_B_on_A.add(keras.layers.Dense(units=1, activation="sigmoid"))

In [59]:
# Make a separate copy of model_A: clone the model, then copy model_A's
# current weights into the clone.
# NOTE(review): model_A_clone is not used in the visible cells — model_B_on_A
# is built from model_A's own layers, not from this clone; confirm intent.
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [60]:
# Freeze every layer except the new output layer so only that layer is
# updated during training.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# The model is compiled after the trainable flags have been set.
model_B_on_A.compile(
    optimizer="sgd",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [61]:
# Short 4-epoch run on task B, keeping the returned History object.
history_b_on_A = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    epochs=4,
    validation_data=(X_val_B, y_val_B),
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# for layer in model_B_on_A.layers