In [1]:
import numpy as np
import scipy as sp

import keras
from keras import backend as K
from keras.datasets import mnist
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Lambda, Activation
from keras.optimizers import SGD, Adam
from keras.losses import categorical_crossentropy

import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load mnist data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# normalize to 0-1
x_train /= 255.
x_test /= 255.

# flatten training & test data to 1D
img_rows, img_cols = 28, 28
input_dim = img_rows * img_cols
x_train = x_train.reshape(-1, input_dim)
x_test = x_test.reshape(-1, input_dim)
input_shape = (input_dim, )

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(60000, 784) (60000, 10) (10000, 784) (10000, 10)


### Teacher Net: 
- 1200-1200-10
- heavily regularized

In [3]:
# k_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None)
# k_constraint = keras.constraints.MaxNorm(max_value=15, axis=0)

# mnist_teacher = Sequential()
# mnist_teacher.add(Dense(1200, name='hidden_1', input_shape=input_shape, activation='relu', kernel_initializer=k_init, kernel_constraint=k_constraint))
# mnist_teacher.add(Dropout(0.7, name='dropout_1'))
# mnist_teacher.add(Dense(1200, name='hidden_2', activation='relu', kernel_initializer=k_init, kernel_constraint=k_constraint))
# mnist_teacher.add(Dropout(0.7, name='dropout_2'))
# mnist_teacher.add(Dense(10, name='logit'))
# mnist_teacher.add(Activation('softmax', name='softmax'))

# mnist_teacher.compile(loss=categorical_crossentropy,
#                       optimizer=Adam(0.0001),
# #                       optimizer=SGD(lr=0.0001, momentum=0.9, decay=0.002),
#                       metrics=['accuracy'])

# mnist_teacher.fit(x_train, y_train, batch_size=100, epochs=20, verbose=1, validation_data=(x_test, y_test))

# score = mnist_teacher.evaluate(x_test, y_test, verbose=0)
# print('Test loss:', score[0])
# print('Test accuracy:', score[1])
# print('# Errors on Test set: ', int((1-score[1])*len(x_test)))

Train on 60000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.055868843965092674
Test accuracy: 0.9832
# Errors on Test set:  168


In [4]:
mnist_teacher.save('./models/mnist_teacher.h5')

In [5]:
mnist_teacher = load_model('./models/mnist_teacher.h5')
score = mnist_teacher.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('# Errors on Test set: ', int((1-score[1])*len(x_test)))

Test loss: 0.055868843965092674
Test accuracy: 0.9832
# Errors on Test set:  168


### Student Net 1: 
- 800-800-10
- no regularization
- T = 1

In [9]:
T = 1

mnist_student = Sequential()
mnist_student.add(Dense(20, name='hidden_1', input_shape=input_shape, activation='relu'))
mnist_student.add(Dense(20, name='hidden_2', activation='relu'))
mnist_student.add(Dense(10, name='logit'))
mnist_student.add(Lambda(lambda x: x / T, name='logit_soft'))
mnist_student.add(Activation('softmax', name='softmax'))

mnist_student.compile(loss=categorical_crossentropy,
                      optimizer=Adam(),
                      metrics=['accuracy'])

mnist_student.fit(x_train, y_train, batch_size=100, epochs=50, verbose=1, validation_data=(x_test, y_test))

score = mnist_student.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('# Errors on Test set: ', int((1-score[1])*len(x_test)))

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 0.17114703205557308
Test accuracy: 0.9593
# Errors on Test set:  406


### Student Net 2: 
- 800-800-10
- no regularization
- T = 20
- learning the same temporature adjusted soft target from the teacher net

In [12]:
def get_layer_output(model, layer_name):
    output = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
    return output

model_logit = get_layer_output(mnist_teacher, 'logit')
logit_train = model_logit.predict(x_train)
logit_test = model_logit.predict(x_test)

In [13]:
# compute the soft target from the teacher network
T = 3

# exp_logit_train = np.exp(logit_train / T)
# y_train_soft = exp_logit_train / exp_logit_train.sum(axis=1).reshape(-1, 1)

y_train_soft = K.softmax(logit_train / T).eval(session=K.get_session())

mnist_student_soft = Sequential()
mnist_student_soft.add(Dense(20, name='hidden_1', input_shape=input_shape, activation='relu'))
mnist_student_soft.add(Dense(20, name='hidden_2', activation='relu'))
mnist_student_soft.add(Dense(10, name='logit'))
mnist_student_soft.add(Lambda(lambda x: x / T, name='logit_soft'))
mnist_student_soft.add(Activation('softmax', name='softmax'))

mnist_student_soft.compile(loss=categorical_crossentropy,
                           optimizer=Adam(),
                           metrics=['accuracy'])

mnist_student_soft.fit(x_train, y_train_soft, 
                       batch_size=100, epochs=50, verbose=1, validation_data=(x_test, y_test))

score = mnist_student_soft.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('# Errors on Test set: ', int((1-score[1])*len(x_test)))

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 0.2311389872312546
Test accuracy: 0.9656
# Errors on Test set:  343


### Student Net 3: 
- 800-800-10
- no regularization
- T = 20
- learning the same temporature adjusted soft target from the teacher net
- using weighted avg cross entropy loss of both hard & soft target

In [10]:
y_train_soft = K.softmax(logit_train / T).eval(session=K.get_session())
y_test_soft = K.softmax(logit_test / T).eval(session=K.get_session())
y_train_soft.shape, y_test_soft.shape

((60000, 10), (10000, 10))

In [11]:
T = 5
w = 1 / (T**2)

def avg_mix_loss(y_true, y_pred):    
    # split hard & soft targets
    y_hard, y_soft = y_true[:, :10], y_true[:, 10:]
    
    # convert logits to predicted values
    y_hard_pred = K.softmax(y_pred) # hard target
    y_soft_pred = K.softmax(y_pred / T) # soft target
    
    # compute weighted avg of the 2 parts of losses
    avg_loss = w * categorical_crossentropy(y_hard, y_hard_pred) + categorical_crossentropy(y_soft, y_soft_pred)
    
    return avg_loss

mnist_student_mix = Sequential()
mnist_student_mix.add(Dense(20, name='hidden_1', input_shape=input_shape, activation='relu'))
mnist_student_mix.add(Dense(20, name='hidden_2', activation='relu'))
mnist_student_mix.add(Dense(10, name='logit'))

# y_pred at the end of 10-node dense layer is the logit_pred
mnist_student_mix.compile(loss=lambda y_true, y_pred: avg_mix_loss(y_true, y_pred), 
                          optimizer=Adam(),
                          metrics=['accuracy'])

# apply both hard & soft targets to learn
y_hard_soft_train = np.concatenate((y_train, y_train_soft), axis=1)
y_hard_soft_test = np.concatenate((y_test, y_test_soft), axis=1)

mnist_student_mix.fit(x_train, y_hard_soft_train, 
                      batch_size=100, epochs=50, verbose=1, validation_data=(x_test, y_hard_soft_test))

score = mnist_student_mix.evaluate(x_test, y_hard_soft_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('# Errors on Test set: ', int((1-score[1])*len(x_test)))


Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 0.11989445998165757
Test accuracy: 0.9634
# Errors on Test set:  365
