In [None]:
# Author: Naveen Lalwani
# Script to distill knowledge of LeNet-300-100 model trained on MNIST to student model

import tensorflow as tf
import numpy as np
import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.models import Model
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import RMSprop, SGD, Adam

In [None]:
# Preprocessing for smaller model
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 784)
x_test = x_test.reshape(-1, 784)

# Normalization
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# One Hot Encoding
y_train = to_categorical(y_train.astype('float32'))
y_test = to_categorical(y_test.astype('float32'))

In [None]:
# Teacher Model: LeNet-300-100
def lenet_300_100_model():
    inputs = layers.Input(shape = (784,))
    
    x = layers.Dense(300, activation='relu', name='FC1')(inputs)
    
    x = layers.Dense(100, activation='relu', name='FC2')(x)

    x = layers.Dense(10, name='logits')(x)
    preds = layers.Activation('softmax', name='Softmax')(x)

    model = Model(inputs=inputs, outputs=preds)
    model.summary()
    return model

#**Build Model LeNet-300-100**

In [62]:
model = lenet_300_100_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 784)               0         
_________________________________________________________________
FC1 (Dense)                  (None, 300)               235500    
_________________________________________________________________
FC2 (Dense)                  (None, 100)               30100     
_________________________________________________________________
logits (Dense)               (None, 10)                1010      
_________________________________________________________________
Softmax (Activation)         (None, 10)                0         
Total params: 266,610
Trainable params: 266,610
Non-trainable params: 0
_________________________________________________________________


In [63]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size = 512) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f1f42cd3c18>

In [64]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

Test Loss: 0.08111305869612843
Test Accuracy: 0.9759


In [None]:
getSoftmaxKnowledge = Model(inputs=model.input, outputs=model.get_layer("logits").output)
model_logits = getSoftmaxKnowledge.predict(x_train)

In [None]:
# Defining function described by Geoffrey Hinton in his paper of Knowledge Distillation
def softmax_with_temperature(logits, temperature):
    logits = logits / temperature
    return (np.exp(logits) / np.sum(np.exp(logits)))

In [None]:
# Temperature is a hyperparameter
temperature = 3
softened_train_prob = softmax_with_temperature(model_logits, temperature)

In [None]:
# Model Definition for the Student Model
def build_small_model():
    inputs = layers.Input(shape = (784,))
    
    x = layers.Dense(50, activation='relu', name='FC1')(inputs)
    
    x = layers.Dense(10, name='logits')(x)
    
    preds = layers.Activation('softmax', name='Softmax')(x)
  
    model = Model(inputs=inputs, outputs=preds)
    model.summary()
    return model

In [123]:
small_model = build_small_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        (None, 784)               0         
_________________________________________________________________
FC1 (Dense)                  (None, 50)                39250     
_________________________________________________________________
logits (Dense)               (None, 10)                510       
_________________________________________________________________
Softmax (Activation)         (None, 10)                0         
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
_________________________________________________________________


# **Distilling Knowledge in the student model**

In [125]:
# Optimization = Adam
# Loss = Cross Entropy loss
# Epochs = 50
# Trained with dark knowledge

small_model.compile(optimizer='adam', loss= 'categorical_crossentropy', metrics=['categorical_accuracy'])
small_model.fit(x_train, softened_train_prob, epochs=50, batch_size=128)

test_loss, test_acc = small_model.evaluate(x_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.32816381804943084
Test Accuracy: 0.9676


In [None]:
small_model.save('model_50_LeNet-300-100_Distilled.h5')
model.save('model_LeNet-300-100.h5')