In [1]:
import pandas as pd
import numpy as np
import scipy as sp

import keras
from keras import backend as K
from keras.datasets import mnist
from keras.models import Sequential, Model, load_model
from keras.layers import Flatten, Dense, Dropout, Lambda, Activation
from keras.optimizers import SGD, Adam
from keras.losses import categorical_crossentropy
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D

import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load mnist data & normalize to 0-1
# 1D data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28*28).astype('float32')
x_test = x_test.reshape(-1, 28*28).astype('float32')
x_train /= 255.
x_test /= 255. 

# 2D data
(x_train_2D, _), (x_test_2D, _) = mnist.load_data()
x_train_2D = x_train_2D.reshape(x_train_2D.shape[0], 1, 28, 28).astype('float32')
x_test_2D = x_test_2D.reshape(x_test_2D.shape[0], 1, 28, 28).astype('float32')
x_train_2D /= 255.
x_test_2D /= 255.

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(60000, 784) (60000, 10) (10000, 784) (10000, 10)


### Teacher Net 
1. DENSE 1200-1200-10 with HEAVY REGULARIZATION
2. CNN

In [3]:
k_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None)
k_constraint = keras.constraints.MaxNorm(max_value=15, axis=0)

mnist_dense = Sequential()
mnist_dense.add(Dense(1200, name='hidden_1', input_shape=(28*28, ), activation='relu', kernel_initializer=k_init, kernel_constraint=k_constraint))
mnist_dense.add(Dropout(0.7, name='dropout_1'))
mnist_dense.add(Dense(1200, name='hidden_2', activation='relu', kernel_initializer=k_init, kernel_constraint=k_constraint))
mnist_dense.add(Dropout(0.7, name='dropout_2'))
mnist_dense.add(Dense(10, name='logit'))
mnist_dense.add(Activation('softmax', name='softmax'))

mnist_dense.compile(loss=categorical_crossentropy, optimizer=Adam(0.0001), metrics=['accuracy'])

mnist_dense.fit(x_train, y_train, batch_size=100, epochs=20, verbose=1, validation_data=(x_test, y_test))

loss, accuracy = mnist_dense.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('DENSE TECHERT NET - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))

mnist_dense.save('./models/mnist_teacher_dense.h5')

In [4]:
mnist_dense = load_model('./models/mnist_teacher_dense.h5')
loss, accuracy = mnist_dense.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('DENSE TECHERT NET - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))

DENSE TECHERT NET - On test set:
loss = 0.055868843965092674, accuracy = 0.9832, #errors = 168


In [5]:
# fit a CNN teacher net
mnist_cnn = Sequential()

mnist_cnn.add(Conv2D(128, (3, 3), activation='relu', padding='same', input_shape=(1, 28, 28), name='conv_1'))
mnist_cnn.add(Conv2D(128, (3, 3), activation='relu', padding='same', name='conv_2'))
mnist_cnn.add(MaxPooling2D(pool_size=(2, 2), padding='same', name='pool_1'))
mnist_cnn.add(Dropout(0.25, name='dropout_1'))

mnist_cnn.add(Flatten())
mnist_cnn.add(Dense(10, name='logit'))
mnist_cnn.add(Activation('softmax', name='softmax'))

mnist_cnn.compile(loss=categorical_crossentropy, optimizer=Adam(lr=0.0005), metrics=['accuracy'])
mnist_cnn.fit(x_train_2D, y_train, batch_size=128, epochs=20, verbose=1, validation_data=(x_test_2D, y_test))

mnist_cnn.save('./models/mnist_teacher_cnn.h5')

In [6]:
mnist_cnn = load_model('./models/mnist_teacher_cnn.h5')
loss, accuracy = mnist_cnn.evaluate(x_test_2D, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('CNN TEACHER NET - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

CNN TEACHER NET - On test set:
loss = 0.030440742732951186, accuracy = 0.9903, #errors = 97



### Student Net 1
- NO DISTILLATION BASELINE

In [7]:
def MNIST_StudentNet(n_hidden, T):
    '''
    Function to build a studnet net
    '''
    model = Sequential()
    model.add(Dense(n_hidden, name='hidden_1', input_shape=(28*28, ), activation='relu'))
    model.add(Dense(n_hidden, name='hidden_2', activation='relu'))
    model.add(Dense(10, name='logit'))
    model.add(Lambda(lambda x: x / T, name='logit_soft'))
    model.add(Activation('softmax', name='softmax'))
    model.compile(loss=categorical_crossentropy, optimizer=Adam(), metrics=['accuracy'])
    return model

In [8]:
# train baseline student net - NO distillation
mnist_student_basline = MNIST_StudentNet(n_hidden=20, T=1)
mnist_student_basline.compile(loss=categorical_crossentropy, optimizer=Adam(), metrics=['accuracy'])
mnist_student_basline.fit(x_train, y_train, batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

# baseline student net model evaluation
loss, accuracy = mnist_student_basline.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('STUDENT BASELINE - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
STUDENT BASELINE - On test set:
loss = 0.15354005151256278, accuracy = 0.9632, #errors = 368


### Student Net 2
- DISTILLED --- DENSE TEACHER

In [9]:
def get_layer_output(model, layer_name):
    output = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
    return output

# compute 'soft target'
T = 3
teacher_logit = get_layer_output(mnist_dense, 'logit')
logit_train = teacher_logit.predict(x_train)
y_train_soft = K.softmax(logit_train / T).eval(session=K.get_session())

# train student net distilled from the dense teacher net
mnist_student_distilled = MNIST_StudentNet(n_hidden=20, T=T)
mnist_student_distilled.compile(loss=categorical_crossentropy, optimizer=Adam(), metrics=['accuracy'])
mnist_student_distilled.fit(x_train, y_train_soft, 
                            batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

# distilled student net model evaluation
loss, accuracy = mnist_student_distilled.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('DISTILLED STUDENT - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
DISTILLED STUDENT - On test set:
loss = 0.23310526382923127, accuracy = 0.9654, #errors = 345


### Student Net 3
- DISTILLED --- DENSE TEACHER + WEIGHTED HARD/SOFT TARGET

In [10]:
def avg_mix_loss(y_true, y_pred, w, T):    
    # split hard & soft targets
    y_hard, y_soft = y_true[:, :10], y_true[:, 10:]
    
    # convert logits to predicted values
    y_hard_pred = K.softmax(y_pred) # hard target
    y_soft_pred = K.softmax(y_pred / T) # soft target
    
    # compute weighted avg of the 2 parts of losses
    avg_loss = w * categorical_crossentropy(y_hard, y_hard_pred) + categorical_crossentropy(y_soft, y_soft_pred)
    
    return avg_loss

In [15]:
n_hidden = 20
T = 3
w = 0.7 / (T**2)

# apply both hard & soft targets to learn
logit_test = teacher_logit.predict(x_test)
y_test_soft = K.softmax(logit_test / T).eval(session=K.get_session())
y_hard_soft_train = np.concatenate((y_train, y_train_soft), axis=1)
y_hard_soft_test = np.concatenate((y_test, y_test_soft), axis=1)

# fit the student net distilled from the dense teacher net with the hard-soft weighted avg loss
mnist_student_mix = Sequential()
mnist_student_mix.add(Dense(n_hidden, name='hidden_1', input_shape=(28*28, ), activation='relu'))
mnist_student_mix.add(Dense(n_hidden, name='hidden_2', activation='relu'))
mnist_student_mix.add(Dense(10, name='logit'))

# y_pred at the end of 10-node dense layer is the logit_pred
mnist_student_mix.compile(loss=lambda y_true, y_pred: avg_mix_loss(y_true, y_pred, w, T), 
                          optimizer=Adam(), metrics=['accuracy'])

mnist_student_mix.fit(x_train, y_hard_soft_train, 
                      batch_size=100, epochs=50, verbose=1, validation_data=(x_test, y_hard_soft_test))


# distilled student net with mix-hard-soft loss model evaluation
loss, accuracy = mnist_student_mix.evaluate(x_test, y_hard_soft_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('DISTILLED STUDENT with WEIGHTED HARD/SOFT TARGET - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
DISTILLED STUDENT with WEIGHTED HARD/SOFT TARGET - On test set:
loss = 0.6234446800231933, accuracy = 0.9676, #errors = 323


### Student Net 4
- DISTILLED --- CNN TEACHER

In [17]:
# compute 'soft target' of the CNN teacher net
T = 3
teacher_logit_cnn = get_layer_output(mnist_cnn, 'logit')
logit_train_cnn = teacher_logit_cnn.predict(x_train_2D)
y_train_soft_cnn = K.softmax(logit_train_cnn / T).eval(session=K.get_session())

# train student net distilled from the CNN teacher net
mnist_student_distilled_cnn = MNIST_StudentNet(n_hidden=20, T=T)
mnist_student_distilled_cnn.fit(x_train, y_train_soft_cnn, 
                                batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

# student net model evalutation
loss, accuracy = mnist_student_distilled_cnn.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('DISTILLED STUDENT from CNN TEACHER - On test set:')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
DISTILLED STUDENT from CNN TEACHER - On test set:
loss = 0.1805808451652527, accuracy = 0.966, #errors = 340


### Summary  performances of all distilled student nets

In [2]:
student_summary = {
    'baseline': {'loss':0.15354005151256278, 'accuracy': 0.9632, 'num_error': 368},
    'dense_distilled': {'loss': 0.23310526382923127, 'accuracy': 0.9654, 'num_error': 345},
    'dense_distilled_mix_loss': {'loss': 0.6234446800231933, 'accuracy': 0.9676, 'num_error': 323},
    'cnn_distilled': {'loss': 0.1805808451652527, 'accuracy': 0.966, 'num_error': 340}
}

print('=== Student Net Performance on Full Test Set ===')
df_student_summary = pd.DataFrame().from_dict(student_summary).T
display(df_student_summary)

=== Student Net Performance on Full Test Set ===


Unnamed: 0,accuracy,loss,num_error
baseline,0.9632,0.15354,368.0
cnn_distilled,0.966,0.180581,340.0
dense_distilled,0.9654,0.233105,345.0
dense_distilled_mix_loss,0.9676,0.623445,323.0


---

## Experiment with omitting digit 3 in the transfer set

In [18]:
idx = (y_train[:,3] == 1)
x_train_omit3 = x_train[~idx,:]
y_train_omit3 = y_train[~idx,:]
y_train_soft_omit3 = y_train_soft[~idx,:]

In [19]:
# student net - NO distillation - digit 3 omitted in the transfer set
hard_omit3 = MNIST_StudentNet(n_hidden=20, T=1)
hard_omit3.fit(x_train_omit3, y_train_omit3, 
               batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

Train on 53869 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x13afab240>

In [20]:
# student net - WITH distillation - digit 3 omitted in the transfer set
soft_omit3 = MNIST_StudentNet(n_hidden=20, T=3)
soft_omit3.fit(x_train_omit3, y_train_soft_omit3, 
               batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

Train on 53869 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1433e84e0>

In [32]:
n_hidden = 20
T = 5
w = 0.5 / (T**2) 

# apply both hard & soft targets to learn
y_hard_soft_train_omit3 = np.concatenate((y_train_omit3, y_train_soft_omit3), axis=1)

# student net - WITH distillation & weighted hard soft loss - digit 3 omitted in the transfer set
mix_omit3 = Sequential()
mix_omit3.add(Dense(n_hidden, name='hidden_1', input_shape=(28*28, ), activation='relu'))
mix_omit3.add(Dense(n_hidden, name='hidden_2', activation='relu'))
mix_omit3.add(Dense(10, name='logit'))

# y_pred at the end of 10-node dense layer is the logit_pred
mix_omit3.compile(loss=lambda y_true, y_pred: avg_mix_loss(y_true, y_pred, w, T), 
                  optimizer=Adam(), metrics=['accuracy'])

mix_omit3.fit(x_train_omit3, y_hard_soft_train_omit3, 
              batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_hard_soft_test))


Train on 53869 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1525cce80>

In [33]:
print('=== Overall Accuracy on Test set === \n')

loss, accuracy = hard_omit3.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('NO DISTILLATION')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

print('WITH DISTILLATION')
loss, accuracy = soft_omit3.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

print('WITH DISTILLATION & WEIGHTED HARD-SOFT TARGET')
loss, accuracy = mix_omit3.evaluate(x_test, y_hard_soft_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))


=== Overall Accuracy on Test set === 

NO DISTILLATION
loss = 1.76703179179905, accuracy = 0.8678, #errors = 1321

WITH DISTILLATION
loss = 0.3264509074926376, accuracy = 0.9179, #errors = 820

WITH DISTILLATION & WEIGHTED HARD-SOFT TARGET
loss = 0.6823878009796143, accuracy = 0.93, #errors = 699


In [34]:
print('=== Digit-3 Only Accuracy on Test set === \n')

idx2 = y_test[:,3] == 1
x_test_3 = x_test[idx2, :]
y_test_3 = y_test[idx2, :]
y_test_soft_3 = y_test[idx2,:]
y_hard_soft_test_3 = np.concatenate((y_test_3, y_test_soft_3), axis=1)

loss, accuracy = hard_omit3.evaluate(x_test_3, y_test_3, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('NO DISTILLATION')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

loss, accuracy = soft_omit3.evaluate(x_test_3, y_test_3, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('WITH DISTILLATION')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

loss, accuracy = mix_omit3.evaluate(x_test_3, y_hard_soft_test_3, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('WITH DISTILLATION & WEIGHTED HARD-SOFT TARGET')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))


=== Digit-3 Only Accuracy on Test set === 

NO DISTILLATION
loss = 16.111788532521466, accuracy = 0.0, #errors = 10000

WITH DISTILLATION
loss = 1.2265173893163699, accuracy = 0.4623762377417914, #errors = 5376

WITH DISTILLATION & WEIGHTED HARD-SOFT TARGET
loss = 1.1009060543362457, accuracy = 0.5722772281948882, #errors = 4277


In [36]:
# tune bias
soft_omit3_bias = MNIST_StudentNet(n_hidden=20, T=3)
bias = soft_omit3_bias.layers[2].get_weights()[1]
bias[3] = 3.5
K.set_value(soft_omit3_bias.layers[2].bias, bias)

soft_omit3_bias.fit(x_train_omit3, y_train_soft_omit3, 
                    batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

Train on 53869 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x153fe1240>

In [47]:
print('=== BIAS TUNED - Overall Accuracy on Test set === \n')
loss, accuracy = soft_omit3_bias.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

print('=== BIAS TUNED - Digit-3 Only Accuracy on Test set === \n')
loss, accuracy = soft_omit3_bias.evaluate(x_test_3, y_test_3, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))


=== BIAS TUNED - Overall Accuracy on Test set === 

loss = 0.32033734719753265, accuracy = 0.9229, #errors = 770

=== BIAS TUNED - Digit-3 Only Accuracy on Test set === 

loss = 1.1231612833419649, accuracy = 0.5257425743459475, #errors = 4742


### Summary performances of distilled student nets which have NEVER seen digit 3

In [3]:
omit3_summary_overall = {
    'baseline': {'loss': 1.76703179179905, 'accuracy': 0.8678, 'num_error': 1321},
    'dense_distilled': {'loss': 0.3264509074926376, 'accuracy': 0.9179, 'num_error': 820},
    'dense_distilled_mix_loss': {'loss': 0.6823878009796143, 'accuracy': 0.93, 'num_error': 699},
    'bias_tuned': {'loss': 0.32033734719753265, 'accuracy': 0.9229, 'num_error': 770}
}
omit3_summary_3only = {
    'baseline': {'loss': 16.111788532521466, 'accuracy': 0.0, 'num_error': 10000},
    'dense_distilled': {'loss': 1.2265173893163699, 'accuracy': 0.4623762377417914, 'num_error': 5376},
    'dense_distilled_mix_loss': {'loss': 1.1009060543362457, 'accuracy': 0.5722772281948882, 'num_error': 4277},
    'bias_tuned': {'loss': 1.1231612833419649, 'accuracy': 0.5257425743459475, 'num_error': 4742}
}

df_omit3_summary_overall = pd.DataFrame().from_dict(omit3_summary_overall).T
df_omit3_summary_3only = pd.DataFrame().from_dict(omit3_summary_3only).T

print('=== Omit3 Distillation Performance on Full Test Set ===')
display(df_omit3_summary_overall)
print()

print('=== Omit3 Distillation Performance on Digit 3 Test Samples ===')
display(df_omit3_summary_3only)
print()

=== Omit3 Distillation Performance on Full Test Set ===


Unnamed: 0,accuracy,loss,num_error
baseline,0.8678,1.767032,1321.0
bias_tuned,0.9229,0.320337,770.0
dense_distilled,0.9179,0.326451,820.0
dense_distilled_mix_loss,0.93,0.682388,699.0



=== Omit3 Distillation Performance on Digit 3 Test Samples ===


Unnamed: 0,accuracy,loss,num_error
baseline,0.0,16.111789,10000.0
bias_tuned,0.525743,1.123161,4742.0
dense_distilled,0.462376,1.226517,5376.0
dense_distilled_mix_loss,0.572277,1.100906,4277.0





---

## Experiment with only keeping digit 7 and 8 in the transfer set

In [38]:
# keep 7 and 8 in transfer set
idx78 = [True if y[7]==1 or y[8] == 1 else False for y in y_train]

x_train_78 = x_train[idx78,:]
y_train_soft_78 = y_train_soft[idx78,:]
y_train_78 = y_train[idx78,:]

In [39]:
# train hard target model with training set omitting digit 3
hard_78 = MNIST_StudentNet(n_hidden=20, T=1)
hard_78.fit(x_train_78, y_train_78, batch_size=128, epochs=50, verbose=1, validation_data=(x_test, y_test))

Train on 12116 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x153fbe400>

In [40]:
# train soft target model with training set omitting digit 3
soft_78 = MNIST_StudentNet(n_hidden=20, T=3)
soft_78.fit(x_train_78, y_train_soft_78, batch_size=128, epochs=100, verbose=1, validation_data=(x_test, y_test))

Train on 12116 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

<keras.callbacks.History at 0x15681ada0>

In [43]:
n_hidden = 20
T = 5
w = 0.5 / (T**2) 

# apply both hard & soft targets to learn
y_hard_soft_train_78 = np.concatenate((y_train_78, y_train_soft_78), axis=1)

# student net - WITH distillation & weighted hard soft loss - digit 3 omitted in the transfer set
mix_78 = Sequential()
mix_78.add(Dense(n_hidden, name='hidden_1', input_shape=(28*28, ), activation='relu'))
mix_78.add(Dense(n_hidden, name='hidden_2', activation='relu'))
mix_78.add(Dense(10, name='logit'))

# y_pred at the end of 10-node dense layer is the logit_pred
mix_78.compile(loss=lambda y_true, y_pred: avg_mix_loss(y_true, y_pred, w, T), 
               optimizer=Adam(), metrics=['accuracy'])

mix_78.fit(x_train_78, y_hard_soft_train_78, 
           batch_size=128, epochs=100, verbose=1, validation_data=(x_test, y_hard_soft_test))


Train on 12116 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

<keras.callbacks.History at 0x15519a668>

In [44]:
soft_78_bias = MNIST_StudentNet(n_hidden=20, T=3)
bias = soft_78_bias.layers[2].get_weights()[1]
bias[7] = 1
bias[8] = 1
K.set_value(soft_78_bias.layers[2].bias, bias)

soft_78_bias.fit(x_train_78, y_train_soft_78, 
                 batch_size=128, epochs=100,verbose=1, validation_data=(x_test, y_test))

Train on 12116 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

<keras.callbacks.History at 0x1591e5f60>

In [48]:
print('=== Overall Accuracy on Test set === \n')

loss, accuracy = hard_78.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('NO DISTILLATION')
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

print('WITH DISTILLATION')
loss, accuracy = soft_78.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

print('WITH DISTILLATION & WEIGHTED HARD-SOFT TARGET')
loss, accuracy = mix_78.evaluate(x_test, y_hard_soft_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))
print()

print('WITH DISTILLATION & BIAS TUNED')
loss, accuracy = soft_78_bias.evaluate(x_test, y_test, verbose=0)
num_errors = int((1 - accuracy) * len(x_test))
print('loss = {}, accuracy = {}, #errors = {}'.format(loss, accuracy, num_errors))


=== Overall Accuracy on Test set === 

NO DISTILLATION
loss = 11.931765022277832, accuracy = 0.1986, #errors = 8014

WITH DISTILLATION
loss = 1.07515452003479, accuracy = 0.6182, #errors = 3818

WITH DISTILLATION & WEIGHTED HARD-SOFT TARGET
loss = 1.2918906003952026, accuracy = 0.6241, #errors = 3759

WITH DISTILLATION & BIAS TUNED
loss = 1.050957187271118, accuracy = 0.6302, #errors = 3698


### Summary performaces of distilled student nets which have ONLY SEEN digit 7 & 8

In [4]:
keep78_summary = {
    'baseline': {'loss': 11.931765022277832, 'accuracy': 0.1986, 'num_error': 8014},
    'dense_distilled': {'loss': 1.07515452003479, 'accuracy': 0.6182, 'num_error': 3818},
    'dense_distilled_mix_loss': {'loss': 1.2918906003952026, 'accuracy': 0.6241, 'num_error': 3759},
    'bias_tuned': {'loss': 1.050957187271118, 'accuracy': 0.6302, 'num_error': 3698}
}

print('=== Only 7 & 8 Distillation Performance on Full Test Set ===')
df_keep78_summary = pd.DataFrame().from_dict(keep78_summary).T
display(df_keep78_summary)

=== Only 7 & 8 Distillation Performance on Full Test Set ===


Unnamed: 0,accuracy,loss,num_error
baseline,0.1986,11.931765,8014.0
bias_tuned,0.6302,1.050957,3698.0
dense_distilled,0.6182,1.075155,3818.0
dense_distilled_mix_loss,0.6241,1.291891,3759.0
