In [1]:
import datetime
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import backend as K

In [2]:
now = datetime.datetime.now
now

<function datetime.now(tz=None)>

In [3]:
# set some parameters
batch_size = 128
num_classes = 5
epochs = 5

In [4]:
# set some more parameters
img_rows, img_cols = 28, 28
filters = 32
pool_size = 2
kernel_size = 3

In [5]:
## This just handles some variability in how the input data is loaded

if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

In [6]:
## To simplify things, write a function to include all the training steps
## As input, function takes a model, training set, test set, and the number of classes
## Inside the model object will be the state about which layers we are freezing and which we are training

def train_model(model, train, test, num_classes):
    x_train = train[0].reshape((train[0].shape[0],) + input_shape)
    x_test = test[0].reshape((test[0].shape[0],) + input_shape)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(train[1], num_classes)
    y_test = keras.utils.to_categorical(test[1], num_classes)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    t = now()
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(x_test, y_test))
    print('Training time: %s' % (now() - t))

    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

In [7]:
# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# create two datasets: one with digits below 5 and one with 5 and above
x_train_lt5 = x_train[y_train < 5]
y_train_lt5 = y_train[y_train < 5]
x_test_lt5 = x_test[y_test < 5]
y_test_lt5 = y_test[y_test < 5]

x_train_gte5 = x_train[y_train >= 5]
y_train_gte5 = y_train[y_train >= 5] - 5
x_test_gte5 = x_test[y_test >= 5]
y_test_gte5 = y_test[y_test >= 5] - 5

In [8]:
# Define the "feature" layers.  These are the early layers that we expect will "transfer"
# to a new problem.  We will freeze these layers during the fine-tuning process

feature_layers = [
    Conv2D(filters, kernel_size,
           padding='valid',
           input_shape=input_shape),
    Activation('relu'),
    Conv2D(filters, kernel_size),
    Activation('relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),
    Flatten(),
]

  super().__init__(


In [9]:
# Define the "classification" layers.  These are the later layers that predict the specific classes from the features
# learned by the feature layers.  This is the part of the model that needs to be re-trained for a new problem

classification_layers = [
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax')
]

In [10]:
# We create our model by combining the two sets of layers as follows
model = Sequential(feature_layers + classification_layers)

In [11]:
model.summary()

In [12]:
# Now, let's train our model on the digits 5,6,7,8,9

train_model(model,
            (x_train_gte5, y_train_gte5),
            (x_test_gte5, y_test_gte5), num_classes)

x_train shape: (29404, 28, 28, 1)
29404 train samples
4861 test samples
Epoch 1/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 937ms/step - accuracy: 0.2324 - loss: 1.6017 - val_accuracy: 0.3606 - val_loss: 1.5765
Epoch 2/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 757ms/step - accuracy: 0.2860 - loss: 1.5783 - val_accuracy: 0.4715 - val_loss: 1.5475
Epoch 3/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 707ms/step - accuracy: 0.3578 - loss: 1.5520 - val_accuracy: 0.5653 - val_loss: 1.5162
Epoch 4/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 706ms/step - accuracy: 0.4163 - loss: 1.5233 - val_accuracy: 0.6283 - val_loss: 1.4814
Epoch 5/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 702ms/step - accuracy: 0.4641 - loss: 1.4940 - val_accuracy: 0.6735 - val_loss: 1.4420
Training time: 0:15:28.479507
Test score: 1.4419806003570557
Test accuracy: 0.6735239624977112


### Freezing Layers
Keras allows layers to be "frozen" during the training process.  That is, some layers would have their weights updated during the training process, while others would not.  This is a core part of transfer learning, the ability to train just the last one or several layers.

Note also, that a lot of the training time is spent "back-propagating" the gradients back to the first layer.  Therefore, if we only need to compute the gradients back a small number of layers, the training time is much quicker per iteration.  This is in addition to the savings gained by being able to train on a smaller data set.


In [13]:
# Freeze only the feature layers
for l in feature_layers:
    l.trainable = False

In [14]:
model.summary()

In [15]:
train_model(model,
            (x_train_lt5, y_train_lt5),
            (x_test_lt5, y_test_lt5), num_classes)

x_train shape: (30596, 28, 28, 1)
30596 train samples
5139 test samples
Epoch 1/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 182ms/step - accuracy: 0.2944 - loss: 1.5786 - val_accuracy: 0.4215 - val_loss: 1.5296
Epoch 2/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 179ms/step - accuracy: 0.3705 - loss: 1.5328 - val_accuracy: 0.5421 - val_loss: 1.4809
Epoch 3/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 234ms/step - accuracy: 0.4444 - loss: 1.4870 - val_accuracy: 0.6569 - val_loss: 1.4354
Epoch 4/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 187ms/step - accuracy: 0.5166 - loss: 1.4447 - val_accuracy: 0.7577 - val_loss: 1.3902
Epoch 5/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 204ms/step - accuracy: 0.5827 - loss: 1.4039 - val_accuracy: 0.8146 - val_loss: 1.3461
Training time: 0:05:40.660087
Test score: 1.3461971282958984
Test accuracy: 0.8145553469657898


In [16]:
# Create layers and define the model as above
feature_layers2 = [
    Conv2D(filters, kernel_size,
           padding='valid',
           input_shape=input_shape),
    Activation('relu'),
    Conv2D(filters, kernel_size),
    Activation('relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),
    Flatten(),
]

classification_layers2 = [
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax')
]
model2 = Sequential(feature_layers2 + classification_layers2)
model2.summary()

In [17]:
# Now, let's train our model on the digits 0,1,2,3,4
train_model(model2,
            (x_train_lt5, y_train_lt5),
            (x_test_lt5, y_test_lt5), num_classes)

x_train shape: (30596, 28, 28, 1)
30596 train samples
5139 test samples
Epoch 1/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 815ms/step - accuracy: 0.2568 - loss: 1.5910 - val_accuracy: 0.3298 - val_loss: 1.5658
Epoch 2/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 795ms/step - accuracy: 0.3206 - loss: 1.5659 - val_accuracy: 0.3977 - val_loss: 1.5379
Epoch 3/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 772ms/step - accuracy: 0.3807 - loss: 1.5396 - val_accuracy: 0.5349 - val_loss: 1.5070
Epoch 4/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 769ms/step - accuracy: 0.4341 - loss: 1.5120 - val_accuracy: 0.6425 - val_loss: 1.4720
Epoch 5/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 768ms/step - accuracy: 0.4992 - loss: 1.4780 - val_accuracy: 0.7262 - val_loss: 1.4316
Training time: 0:15:50.900094
Test score: 1.4311786890029907
Test accuracy: 0.7262113094329834


In [18]:
#Freeze layers
for l in feature_layers2:
    l.trainable = False

In [19]:
model2.summary()

In [20]:
train_model(model2,
            (x_train_gte5, y_train_gte5),
            (x_test_gte5, y_test_gte5), num_classes)

x_train shape: (29404, 28, 28, 1)
29404 train samples
4861 test samples
Epoch 1/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 196ms/step - accuracy: 0.2481 - loss: 1.6105 - val_accuracy: 0.3588 - val_loss: 1.5816
Epoch 2/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 194ms/step - accuracy: 0.2753 - loss: 1.5873 - val_accuracy: 0.3789 - val_loss: 1.5577
Epoch 3/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 193ms/step - accuracy: 0.3098 - loss: 1.5669 - val_accuracy: 0.4176 - val_loss: 1.5339
Epoch 4/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 192ms/step - accuracy: 0.3532 - loss: 1.5425 - val_accuracy: 0.4627 - val_loss: 1.5102
Epoch 5/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 194ms/step - accuracy: 0.3994 - loss: 1.5206 - val_accuracy: 0.5340 - val_loss: 1.4863
Training time: 0:03:48.996546
Test score: 1.4862498044967651
Test accuracy: 0.534046471118927
