In [1]:
import torch
import torch.nn as nn

import torch.optim as optim
from torch.optim import SGD
import torch.nn.functional as F

from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split, Subset

from torchinfo import summary

import torchvision.models as models

import os

#import models
from mobiface_like_v2 import MobiFace
from backbone import get_model
import model


import torchvision


from facenet_pytorch import MTCNN, InceptionResnetV1

from torch.cuda.amp import autocast, GradScaler

import time

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.neighbors import NearestNeighbors


import numpy as np
import csv
import requests 

import matplotlib.pyplot as plt

import time
import datetime
import random

MobiFace(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (depthwise_conv): DepthwiseSeparableConv2d(
    (depthwise_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
    (pointwise_conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
  )
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bottleneck_block1): BottleneckBlock(
    (conv1): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
    (depthwise_conv): DepthwiseSeparableConv2d(
      (depthwise_conv): Conv2d(32, 32, kernel_size=(3, 3), stri

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Seed every random number generator the notebook relies on, so that a
# Restart & Run All reproduces the same data split and weight initialisation.
seed = 42

# Python's built-in RNG: the test/validation/train split further down is drawn
# with random.sample / random.shuffle, so it must be seeded as well - otherwise
# the split (and therefore every reported metric) changes on each run.
random.seed(seed)

# NumPy's global RNG, for any NumPy-based randomness used later on.
np.random.seed(seed)

# PyTorch CPU generator (weight initialisation, DataLoader shuffling, ...).
torch.manual_seed(seed)

# PyTorch generators of all CUDA devices.
torch.cuda.manual_seed_all(seed)

In [3]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [4]:

# Number of output classes of the classification head.
num_classes = 10575

# Build the student network, then replace its `fc` layer with a linear
# classifier mapping 512 input features to `num_classes` logits on `device`.
student = MobiFace()
student.fc = nn.Linear(512, num_classes).to(device)

In [5]:
# Layer-by-layer overview of the student for a single 3x112x112 input.
summary(student, input_size=(1, 3, 112, 112))

Layer (type:depth-idx)                        Output Shape              Param #
MobiFace                                      [1, 10575]                --
├─Conv2d: 1-1                                 [1, 64, 56, 56]           1,728
├─BatchNorm2d: 1-2                            [1, 64, 56, 56]           128
├─PReLU: 1-3                                  [1, 64, 56, 56]           1
├─DepthwiseSeparableConv2d: 1-4               [1, 64, 56, 56]           --
│    └─Conv2d: 2-1                            [1, 64, 56, 56]           576
│    └─Conv2d: 2-2                            [1, 64, 56, 56]           4,096
│    └─BatchNorm2d: 2-3                       [1, 64, 56, 56]           128
│    └─PReLU: 2-4                             [1, 64, 56, 56]           1
├─BatchNorm2d: 1-5                            [1, 64, 56, 56]           128
├─PReLU: 1-6                                  [1, 64, 56, 56]           (recursive)
├─BottleneckBlock: 1-7                        [1, 64, 28, 28]           --
│  

In [6]:
# Load the fine-tuned teacher network from disk (the file holds the whole
# pickled module, not just a state dict).
# map_location puts every stored tensor on the device selected earlier, so a
# checkpoint saved on a GPU still loads when the notebook falls back to the CPU.
# NOTE(security): torch.load unpickles arbitrary Python objects - only load
# checkpoint files that come from a trusted source.
# NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True, which
# rejects full-model pickles; pass weights_only=False there if loading fails.
teacher = torch.load("full_webcassia_finetuned_v2.pth", map_location=device)


In [7]:
# Put the teacher in evaluation mode (inference behaviour for layers such as
# BatchNorm). eval() returns the module itself, so as the last expression of
# the cell its architecture is displayed below.
teacher.eval()

InceptionResnetV1(
  (conv2d_1a): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2a): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2b): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (maxpool_3a): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d_3b): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_4a): 

In [8]:
# Layer-by-layer overview of the teacher for a single 3x112x112 input.
summary(teacher, input_size=(1, 3, 112, 112))

Layer (type:depth-idx)                        Output Shape              Param #
InceptionResnetV1                             [1, 10575]                --
├─BasicConv2d: 1-1                            [1, 32, 55, 55]           --
│    └─Conv2d: 2-1                            [1, 32, 55, 55]           (864)
│    └─BatchNorm2d: 2-2                       [1, 32, 55, 55]           (64)
│    └─ReLU: 2-3                              [1, 32, 55, 55]           --
├─BasicConv2d: 1-2                            [1, 32, 53, 53]           --
│    └─Conv2d: 2-4                            [1, 32, 53, 53]           (9,216)
│    └─BatchNorm2d: 2-5                       [1, 32, 53, 53]           (64)
│    └─ReLU: 2-6                              [1, 32, 53, 53]           --
├─BasicConv2d: 1-3                            [1, 64, 53, 53]           --
│    └─Conv2d: 2-7                            [1, 64, 53, 53]           (18,432)
│    └─BatchNorm2d: 2-8                       [1, 64, 53, 53]           (128)

Knowledge Distillation (KD) Process

In [9]:
# Image preprocessing pipeline, applied in this order to every loaded image:
#   1. resize to 112x112 (the input size used for both networks in the summaries),
#   2. random horizontal flip (data augmentation),
#   3. conversion to a tensor,
#   4. per-channel normalisation with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((112, 112)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

In [10]:
# Dataset location. The commented-out path points at a small sample for quick tests.
# NOTE(review): these are absolute, machine-specific paths - adjust them when
# running the notebook on another machine.
#root_dir= "C:\\Users\\mathe\\OneDrive\\Área de Trabalho\\master\\TFM\\dataset\\faces_webface_112x112\\small_sample"
root_dir= "C:\\Users\\mathe\\OneDrive\\Área de Trabalho\\master\\TFM\\dataset\\faces_webface_112x112\\images"

batch_size = 128

# ImageFolder dataset over `root_dir`; `transform` is applied to each image on load.
dataset = datasets.ImageFolder(root=root_dir, transform=transform)

# Number of images held out for testing
num_test_images = 2000

# Total number of images
total_images = len(dataset)

# Draw the held-out test indices without replacement (uses Python's `random`)
test_indices = random.sample(range(total_images), num_test_images)

# Everything that is not a test image is available for validation and training
remaining_indices = set(range(total_images)) - set(test_indices)


# Shuffle the remaining indices once so the validation/training split is random
remaining_indices = list(remaining_indices)
random.shuffle(remaining_indices)


# 30% of the remaining images go to validation, the rest to training
val_size = int(0.30 * len(remaining_indices))
train_size = len(remaining_indices) - val_size

# Indices for validation and training sets (disjoint slices of the shuffled list)
val_indices = remaining_indices[:val_size]
train_indices = remaining_indices[val_size:]


# Create Subset datasets.
# NOTE(review): all three subsets wrap the same `dataset` and therefore share its
# transform, so the random horizontal flip is also applied to validation/test images.
test_dataset = Subset(dataset, test_indices)
val_dataset = Subset(dataset, val_indices)
train_dataset = Subset(dataset, train_indices)


# Training batches are reshuffled at every epoch; with shuffle=False the model
# would see exactly the same batches in the same order in all epochs.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# Validation order does not affect the metrics, so it stays deterministic.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [11]:
# Re-resolve the compute device (same rule as earlier in the notebook):
# the GPU when CUDA is available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [12]:
# Role-specific aliases for the two networks, as used by the training loop.
student_model, teacher_model = student, teacher

# Move both networks to the selected device. The student is moved last so the
# module returned by .to() is what the cell displays.
teacher_model.to(device)
student_model.to(device)

MobiFace(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (depthwise_conv): DepthwiseSeparableConv2d(
    (depthwise_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
    (pointwise_conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
  )
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bottleneck_block1): BottleneckBlock(
    (conv1): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
    (depthwise_conv): DepthwiseSeparableConv2d(
      (depthwise_conv): Conv2d(32, 32, kernel_size=(3, 3), stri

In [13]:
# ---------------------------------------------------------------------------
# Knowledge-distillation training of the student against the frozen teacher.
# Per epoch: train on train_loader, evaluate on val_loader, save a checkpoint.
# ---------------------------------------------------------------------------
start_time = time.time()
formatted_time = datetime.datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')
print(formatted_time)  # log when training started (the end time is printed after the loop)

# Temperature used to soften both logit distributions before comparing them
temperature = 4.0  # You can adjust this value based on your needs
# Hard-label loss, soft-label (distillation) loss and the student's optimizer.
# The KL-divergence module is created once here instead of once per batch.
criterion = nn.CrossEntropyLoss()
criterion_distillation = nn.KLDivLoss(reduction='batchmean')
optimizer_student = optim.Adam(student_model.parameters(), lr=0.001)


def kd_total_loss(student_logits, teacher_logits, targets):
    """Return the combined training objective for one batch.

    The objective is the sum of
      * the cross-entropy between the student logits and the ground-truth
        `targets`, and
      * the KL divergence between the temperature-softened student
        distribution (log-softmax of logits / temperature) and the
        temperature-softened teacher distribution (softmax of logits /
        temperature), averaged over the batch.

    Args:
        student_logits: raw class scores produced by the student.
        teacher_logits: raw class scores produced by the teacher for the same inputs.
        targets: ground-truth class indices for the batch.

    Returns:
        A scalar tensor: classification loss + distillation loss.
    """
    # NOTE(review): the usual formulation (Hinton et al.) multiplies the
    # distillation term by temperature ** 2 and weights the two terms; the
    # unscaled, unweighted sum is kept here so the loss values stay comparable
    # with earlier runs of this notebook.
    loss_distillation = criterion_distillation(
        F.log_softmax(student_logits / temperature, dim=1),
        F.softmax(teacher_logits / temperature, dim=1),
    )
    loss_classification = criterion(student_logits, targets)
    return loss_classification + loss_distillation


# The teacher only supplies targets; make sure it is in evaluation mode even if
# this cell is run without the earlier teacher.eval() cell having been executed.
teacher_model.eval()

# Training loop
num_epochs = 15  # Adjust as needed
for epoch in range(num_epochs):
    student_model.train()

    # Running totals over the current epoch
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Teacher forward pass: no gradients are needed for the teacher
        with torch.no_grad():
            outputs_teacher = teacher_model(inputs)

        # Student forward pass
        optimizer_student.zero_grad()
        outputs_student = student_model(inputs)

        # Classification loss + distillation loss
        loss = kd_total_loss(outputs_student, outputs_teacher, labels)

        # Backward and optimize
        loss.backward()
        optimizer_student.step()

        running_loss += loss.item()
        _, predicted = outputs_student.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        # Print statistics every 100 batches; loss and accuracy are running
        # values accumulated since the start of the epoch
        if batch_idx % 100 == 0:  # Adjust the interval for printing
            batch_accuracy = 100 * correct / total
            print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], '
                  f'Loss: {running_loss / (batch_idx + 1):.4f}, correct :{correct}, total: {total}, Batch Accuracy: {batch_accuracy:.2f}%')

    # Validation phase
    student_model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)

            # Forward passes of teacher and student (already inside no_grad)
            val_outputs_teacher = teacher_model(val_inputs)
            val_outputs_student = student_model(val_inputs)

            # Accumulate the same objective as in training so that the reported
            # validation loss is meaningful (it was previously never updated
            # and always printed as 0.0000)
            val_loss += kd_total_loss(val_outputs_student, val_outputs_teacher, val_labels).item()

            _, val_predicted = val_outputs_student.max(1)
            val_total += val_labels.size(0)
            val_correct += val_predicted.eq(val_labels).sum().item()

    # Calculate validation accuracy
    val_accuracy = 100 * val_correct / val_total

    # Print validation statistics (loss is the mean over validation batches)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {val_loss / len(val_loader):.4f}, '
          f'Validation Accuracy: {val_accuracy:.2f}%')

    # Save a resumable checkpoint after every epoch
    checkpoint = {
        'epoch': epoch,
        'student_model_state_dict': student_model.state_dict(),
        'optimizer_student_state_dict': optimizer_student.state_dict(),
        'val_accuracy': val_accuracy,
    }
    torch.save(checkpoint, f'checkpoint_epoch_kd_v2_{epoch}.pt')

end_time = time.time()

formatted_time = datetime.datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')
print(formatted_time)

# Total wall-clock duration of the cell, in seconds, written to a text file
execution_time = end_time - start_time

with open("execution_time_kd_v2.txt", "w") as file:
    file.write(f"Execution time: {execution_time} seconds")

Epoch [1/15], Batch [1/2673], Loss: 10.2471, correct :0, total: 128, Batch Accuracy: 0.00%
Epoch [1/15], Batch [101/2673], Loss: 10.0705, correct :19, total: 12928, Batch Accuracy: 0.15%
Epoch [1/15], Batch [201/2673], Loss: 9.9320, correct :58, total: 25728, Batch Accuracy: 0.23%
Epoch [1/15], Batch [301/2673], Loss: 9.8176, correct :114, total: 38528, Batch Accuracy: 0.30%
Epoch [1/15], Batch [401/2673], Loss: 9.7114, correct :194, total: 51328, Batch Accuracy: 0.38%
Epoch [1/15], Batch [501/2673], Loss: 9.6043, correct :300, total: 64128, Batch Accuracy: 0.47%
Epoch [1/15], Batch [601/2673], Loss: 9.5028, correct :435, total: 76928, Batch Accuracy: 0.57%
Epoch [1/15], Batch [701/2673], Loss: 9.4096, correct :604, total: 89728, Batch Accuracy: 0.67%
Epoch [1/15], Batch [801/2673], Loss: 9.3211, correct :820, total: 102528, Batch Accuracy: 0.80%
Epoch [1/15], Batch [901/2673], Loss: 9.2346, correct :1063, total: 115328, Batch Accuracy: 0.92%
Epoch [1/15], Batch [1001/2673], Loss: 9.15

In [14]:
# Persist the distilled student in two forms:
#   * the whole pickled module (convenient, but tied to the class definition),
#   * only its state dict (the weights).
torch.save(student_model, f='KD_full_mobiFace_like_v2_4.pth')
torch.save(student_model.state_dict(), f='KD_dict_mobiFace_live_v2_4.pth')