<a href="https://colab.research.google.com/github/Gradsmith/Deep-Learning-HW3/blob/main/DL2022_HW3_P2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input

try:
    from classification_models.tfkeras import Classifiers
except:
    !pip install image-classifiers
    from classification_models.tfkeras import Classifiers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting image-classifiers
  Downloading image_classifiers-1.0.0-py3-none-any.whl (19 kB)
Collecting keras-applications<=1.0.8,>=1.0.7
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 5.7 MB/s 
Installing collected packages: keras-applications, image-classifiers
Successfully installed image-classifiers-1.0.0 keras-applications-1.0.8


# Part A)

In [3]:
# load a pretrained resnet50 model on imagenet and detach the FC layer
resnet50_pretrained = ResNet50(input_shape=(32,32,3), weights='imagenet', include_top=False)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [4]:
# freeze the resnet50 conv layers to avoid weight changes during the linear tuning
for layer in resnet50_pretrained.layers:
    layer.trainable = False

In [5]:
# add a FC layer and GA pooling and create the teacher model
model_teacher = tf.keras.Sequential()
model_teacher.add(resnet50_pretrained)
model_teacher.add(tf.keras.layers.GlobalAveragePooling2D())
model_teacher.add(tf.keras.layers.Dense(10))

In [6]:
# set the optimizer, loss and metrics
model_teacher.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['sparse_categorical_accuracy']
)

In [7]:
# prepare the cifar10 dataset and apply the resnet specific preprocessing
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

x_train = preprocess_input(x_train)
x_test = preprocess_input(x_test)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [8]:
# tuning the teacher model
model_teacher.fit(x_train, y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff426039820>

In [9]:
# evaluate the teacher
loss_train, acc_train = model_teacher.evaluate(x_train, y_train, batch_size=32)
loss_test, acc_test = model_teacher.evaluate(x_test, y_test, batch_size=32)



In [10]:
print(f'Test Loss: {loss_test:.3f}')
print(f'Train Loss: {loss_train:.3f}\n')

print(f'Test Accuracy: {acc_test*100:.3f} %')
print(f'Train Accuracy: {acc_train*100:.3f} %')

Test Loss: 1.621
Train Loss: 1.011

Test Accuracy: 62.420 %
Train Accuracy: 71.044 %


# Part B)

In [11]:
# load a untrained resnet18 model and detach the FC layer
ResNet18, _ = Classifiers.get('resnet18')
resnet18 = ResNet18(input_shape=(32,32,3), weights=None, include_top=False)

In [12]:
# add a FC layer and GA pooling and create the student model
model_student = tf.keras.Sequential()
model_student.add(resnet18)
model_student.add(tf.keras.layers.GlobalAveragePooling2D())
model_student.add(tf.keras.layers.Dense(10))

# clone the student model to train from scratch in part c
model_student_scratch = tf.keras.models.clone_model(model_student)

# clone another student model to use in part d
model_student_new = tf.keras.models.clone_model(model_student)

In [13]:
# distill teacher to student with a distiller class
# code adapted from https://keras.io/examples/vision/knowledge_distillation/

class Distiller(tf.keras.Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [14]:
# initialize and compile distiller
distiller = Distiller(student=model_student, teacher=model_teacher)
distiller.compile(
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
    student_loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=tf.keras.losses.KLDivergence(),
    alpha=0.7,
    temperature=2,
)

In [15]:
# distill teacher's knowledge to student
distiller.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff4ef58fc40>

In [16]:
# evaluate the student
acc_train, student_loss_train = distiller.evaluate(x_train, y_train)
acc_test, student_loss_test = distiller.evaluate(x_test, y_test)



In [17]:
print(f'Student Test Loss: {student_loss_test:.3f}')
print(f'Student Train Loss: {student_loss_train:.3f}\n')

print(f'Test Accuracy: {acc_test*100:.3f} %')
print(f'Train Accuracy: {acc_train*100:.3f} %')

Student Test Loss: 0.428
Student Train Loss: 0.543

Test Accuracy: 77.170 %
Train Accuracy: 93.878 %


# Part C)

In [18]:
# set the optimizer, loss and metrics
model_student_scratch.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['sparse_categorical_accuracy']
)

In [19]:
# training a student model from scratch
model_student_scratch.fit(x_train, y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff3bdf187c0>

In [20]:
# evaluate the student
loss_train, acc_train = model_student_scratch.evaluate(x_train, y_train, batch_size=32)
loss_test, acc_test = model_student_scratch.evaluate(x_test, y_test, batch_size=32)



In [21]:
print(f'Test Loss: {loss_test:.3f}')
print(f'Train Loss: {loss_train:.3f}\n')

print(f'Test Accuracy: {acc_test*100:.3f} %')
print(f'Train Accuracy: {acc_train*100:.3f} %')

Test Loss: 0.853
Train Loss: 0.107

Test Accuracy: 77.110 %
Train Accuracy: 96.396 %


As can be seen, the test accuracy of the distilled model is slightly higher than the student model that we trained from scratch. This is due to the fact that the distilled model learned its knowledge (layer weights) from a teacher model. However, the train accuracy of the student model that we trained from scratch is roughly 3% higher, which suggests that this student model is overfitted to the training data given the huge difference between the test and train accuracy.

# Part D)

In [22]:
# unfreeze layers for fine tuning the whole model
for layer in model_teacher.layers:
    layer.trainable = True

In [23]:
# set a small learning rate for fine tuning
model_teacher.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-4, momentum=0.9),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['sparse_categorical_accuracy']
)

In [24]:
# fine tune the teacher model
model_teacher.fit(x_train, y_train, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff4265d3520>

In [25]:
# evaluate the teacher
loss_train, acc_train = model_teacher.evaluate(x_train, y_train, batch_size=32)
loss_test, acc_test = model_teacher.evaluate(x_test, y_test, batch_size=32)



In [26]:
print(f'Test Loss: {loss_test:.3f}')
print(f'Train Loss: {loss_train:.3f}\n')

print(f'Test Accuracy: {acc_test*100:.3f} %')
print(f'Train Accuracy: {acc_train*100:.3f} %')

Test Loss: 0.874
Train Loss: 0.536

Test Accuracy: 73.370 %
Train Accuracy: 83.550 %


As expected, the teacher model accuracy has increased. Now we distill the teacher's knowledge to a new student.

In [27]:
# initialize and compile distiller
distiller = Distiller(student=model_student_new, teacher=model_teacher)
distiller.compile(
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
    student_loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=tf.keras.losses.KLDivergence(),
    alpha=0.7,
    temperature=2,
)

In [28]:
# distill teacher's knowledge to student
distiller.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff374875550>

In [29]:
# evaluate the student
acc_train, student_loss_train = distiller.evaluate(x_train, y_train)
acc_test, student_loss_test = distiller.evaluate(x_test, y_test)



In [30]:
print(f'Student Test Loss: {student_loss_test:.3f}')
print(f'Student Train Loss: {student_loss_train:.3f}\n')

print(f'Test Accuracy: {acc_test*100:.3f} %')
print(f'Train Accuracy: {acc_train*100:.3f} %')

Student Test Loss: 0.438
Student Train Loss: 0.744

Test Accuracy: 78.700 %
Train Accuracy: 95.146 %


Compared to the other student model, which gained its knowledge from the previous teacher, the test and train accuracy of the new student model has increased by about 2%. So it can be said that the better we train the teacher, the higher accuracy we can expect from the student after the distillation.

In [31]:
# save the models
model_teacher.save('./Models_P2/model_teacher.h5')
model_student_scratch.save('./Models_P2/model_student_scratch.h5')
model_student.compile()
model_student.save('./Models_P2/model_student.h5')
model_student_new.compile()
model_student_new.save('./Models_P2/model_student_new.h5')