<a href="https://colab.research.google.com/github/Mark-Barbaric/IBM_Machine_Learning_Certificate/blob/dl_wee5/Deep_Learning_and_Reinforcement_Learning/week5/transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p style="text-align:center">
    <a href="https://skills.network" target="_blank">
    <img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/assets/logos/SN_web_lightmode.png" width="200" alt="Skills Network Logo">
    </a>
</p>


# Machine Learning Foundation

## Course 5, Part g: Transfer Learning DEMO


For this exercise, we will use the well-known MNIST digit data. To illustrate the power and concept of transfer learning, we will train a CNN on just the digits 5,6,7,8,9.  Then we will train just the last layer(s) of the network on the digits 0,1,2,3,4 and see how well the features learned on 5-9 help with classifying 0-4.




In [23]:
import datetime
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
#from tensorflow import keras
#from tensorflow.keras.datasets import mnist
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
#from tensorflow.keras.layers import Conv2D, MaxPooling2D
#from tensorflow.keras import backend as K

In [24]:
now = datetime.datetime.now

In [25]:
batch_size = 128
num_classes = 5
epochs = 5

In [26]:
img_rows, img_cols = 28, 28
filters = 32
pool_size = 2
kernel_size = 3

In [27]:
if K.image_data_format() == "channels_first":
  input_shape = (1, img_rows, img_cols)
else:
  input_shape = (img_rows, img_cols, 1)

In [28]:
def train_model(model, train, test, num_classes):
  x_train = train[0].reshape((train[0].shape[0],) + input_shape)
  x_test = test[0].reshape((test[0].shape[0],) + input_shape)
  x_train = x_train.astype('float')
  x_test = x_test.astype('float')
  x_train /= 255
  x_test /= 255

  print('x_train shape:', x_train.shape)
  print(x_train.shape[0], 'train samples')
  print(x_test.shape[0], 'test samples')

  # convert class vectors to binary class matrices
  y_train = keras.utils.to_categorical(train[1], num_classes)
  y_test = keras.utils.to_categorical(test[1], num_classes)

  model.compile(loss='categorical_crossentropy',
                optimizer='adadelta',
                metrics=['accuracy'])

  t = now()
  model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(x_test, y_test))
  print('Training time: %s' % (now() - t))

  score = model.evaluate(x_test, y_test, verbose=0)
  print('Test score:', score[0])
  print('Test accuracy:', score[1])

In [29]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [30]:
x_train_lt5 = x_train[y_train < 5]
y_train_lt5 = y_train[y_train < 5]
x_test_lt5 = x_test[y_test < 5]
y_test_lt5 = y_test[y_test < 5]

In [31]:
x_train_gte5 = x_train[y_train >= 5]
y_train_gte5 = y_train[y_train >= 5] - 5
x_test_gte5 = x_test[y_test >= 5]
y_test_gte5 = y_test[y_test >= 5] - 5

In [32]:
feature_layers = [
    Conv2D(
        filters,
        kernel_size,
        input_shape=input_shape
    ),
    Activation('relu'),
    Conv2D(
        filters,
        kernel_size
    ),
    Activation('relu'),
    MaxPooling2D(
        pool_size=pool_size
    ),
    Dropout(0.25),
    Flatten()
]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [33]:
classification_layers = [
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax')
]

In [34]:
model = Sequential(
    feature_layers + classification_layers
)

In [35]:
model.summary()

In [36]:
train_model(
    model,
    (x_train_gte5, y_train_gte5),
    (x_test_gte5, y_test_gte5),
    num_classes
)

x_train shape: (29404, 28, 28, 1)
29404 train samples
4861 test samples
Epoch 1/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 222ms/step - accuracy: 0.2028 - loss: 1.6132 - val_accuracy: 0.2343 - val_loss: 1.5961
Epoch 2/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 210ms/step - accuracy: 0.2380 - loss: 1.5976 - val_accuracy: 0.3197 - val_loss: 1.5781
Epoch 3/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 202ms/step - accuracy: 0.2836 - loss: 1.5805 - val_accuracy: 0.4149 - val_loss: 1.5594
Epoch 4/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 202ms/step - accuracy: 0.3407 - loss: 1.5619 - val_accuracy: 0.5102 - val_loss: 1.5390
Epoch 5/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 204ms/step - accuracy: 0.3930 - loss: 1.5430 - val_accuracy: 0.5931 - val_loss: 1.5160
Training time: 0:06:17.793810
Test score: 1.515951156616211
Test accuracy: 0.5930878520011902


### Freezing Layers
Keras allows layers to be "frozen" during the training process.  That is, some layers would have their weights updated during the training process, while others would not.  This is a core part of transfer learning, the ability to train just the last one or several layers.

Note also, that a lot of the training time is spent "back-propagating" the gradients back to the first layer.  Therefore, if we only need to compute the gradients back a small number of layers, the training time is much quicker per iteration.  This is in addition to the savings gained by being able to train on a smaller data set.


In [37]:
for l in feature_layers:
  l.trainable = False

In [38]:
model.summary()

In [39]:
train_model(model,
            (x_train_lt5, y_train_lt5),
            (x_test_lt5, y_test_lt5), num_classes)

x_train shape: (30596, 28, 28, 1)
30596 train samples
5139 test samples
Epoch 1/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 79ms/step - accuracy: 0.2886 - loss: 1.5949 - val_accuracy: 0.4055 - val_loss: 1.5684
Epoch 2/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 0.3493 - loss: 1.5668 - val_accuracy: 0.5114 - val_loss: 1.5389
Epoch 3/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 74ms/step - accuracy: 0.4106 - loss: 1.5401 - val_accuracy: 0.5986 - val_loss: 1.5092
Epoch 4/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 77ms/step - accuracy: 0.4767 - loss: 1.5124 - val_accuracy: 0.6589 - val_loss: 1.4785
Epoch 5/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 74ms/step - accuracy: 0.5382 - loss: 1.4826 - val_accuracy: 0.7034 - val_loss: 1.4473
Training time: 0:01:42.210869
Test score: 1.4472777843475342
Test accuracy: 0.703444242477417


Note that after a single epoch, we are already achieving results on classifying 0-4 that are comparable to those achieved on 5-9 after 5 full epochs.  This despite the fact the we are only "fine-tuning" the last layer of the network, and all the early layers have never seen what the digits 0-4 look like.

Also, note that even though nearly all (590K/600K) of the *parameters* were trainable, the training time per epoch was still much reduced.  This is because the unfrozen part of the network was very shallow, making backpropagation faster.


## Exercise
- Now we will write code to reverse this training process.  That is, train on the digits 0-4, then finetune only the last layers on the digits 5-9.


In [40]:
feature_layers2 = [
    Conv2D(
        filters,
        kernel_size,
        padding='valid',
        input_shape=input_shape
    ),
    Activation('relu'),
    Conv2D(
        filters,
        kernel_size
    ),
    Activation('relu'),
    MaxPooling2D(
        pool_size=pool_size
    ),
    Dropout(0.25),
    Flatten()
]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [41]:
classification_layers2 = [
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax')
]

In [42]:
model2 = Sequential(feature_layers2 + classification_layers2)
model2.summary()

In [43]:
train_model(model2,
            (x_train_lt5, y_train_lt5),
            (x_test_lt5, y_test_lt5), num_classes)

x_train shape: (30596, 28, 28, 1)
30596 train samples
5139 test samples
Epoch 1/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 210ms/step - accuracy: 0.2500 - loss: 1.6003 - val_accuracy: 0.4287 - val_loss: 1.5695
Epoch 2/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 207ms/step - accuracy: 0.3451 - loss: 1.5685 - val_accuracy: 0.5133 - val_loss: 1.5347
Epoch 3/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 201ms/step - accuracy: 0.4319 - loss: 1.5362 - val_accuracy: 0.6260 - val_loss: 1.4964
Epoch 4/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 206ms/step - accuracy: 0.5098 - loss: 1.4983 - val_accuracy: 0.7182 - val_loss: 1.4527
Epoch 5/5
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 200ms/step - accuracy: 0.5715 - loss: 1.4589 - val_accuracy: 0.7856 - val_loss: 1.4018
Training time: 0:04:40.914432
Test score: 1.4018278121948242
Test accuracy: 0.7855613827705383


In [44]:
for l in feature_layers2:
    l.trainable = False

In [45]:
model2.summary()

In [46]:
train_model(model2,
            (x_train_gte5, y_train_gte5),
            (x_test_gte5, y_test_gte5), num_classes)

x_train shape: (29404, 28, 28, 1)
29404 train samples
4861 test samples
Epoch 1/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 92ms/step - accuracy: 0.2787 - loss: 1.5889 - val_accuracy: 0.4230 - val_loss: 1.5555
Epoch 2/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 73ms/step - accuracy: 0.3330 - loss: 1.5577 - val_accuracy: 0.4939 - val_loss: 1.5233
Epoch 3/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 79ms/step - accuracy: 0.3865 - loss: 1.5277 - val_accuracy: 0.5631 - val_loss: 1.4918
Epoch 4/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 74ms/step - accuracy: 0.4251 - loss: 1.5031 - val_accuracy: 0.6272 - val_loss: 1.4612
Epoch 5/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 73ms/step - accuracy: 0.4879 - loss: 1.4732 - val_accuracy: 0.6762 - val_loss: 1.4308
Training time: 0:01:40.777884
Test score: 1.4307806491851807
Test accuracy: 0.6761983036994934
