In [1]:
import torch
import torch.nn as nn

import torch.optim as optim
from torch.optim import SGD
import torch.nn.functional as F

from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split

from torchinfo import summary

import torchvision.models as models

import os

#import models
from mobiface_like_v2 import MobiFace
from backbone import get_model
import model


import torchvision


from facenet_pytorch import MTCNN, InceptionResnetV1

from torch.cuda.amp import autocast, GradScaler

import time

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.neighbors import NearestNeighbors


import numpy as np
import csv
import requests 

MobiFace(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (depthwise_conv): DepthwiseSeparableConv2d(
    (depthwise_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
    (pointwise_conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
  )
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bottleneck_block1): BottleneckBlock(
    (conv1): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
    (depthwise_conv): DepthwiseSeparableConv2d(
      (depthwise_conv): Conv2d(32, 32, kernel_size=(3, 3), stri

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set a random seed for reproducibility
seed = 42
torch.manual_seed(seed)

# If using GPU, also set the seed for GPU
torch.cuda.manual_seed_all(seed)


In [3]:
student = MobiFace()
#student = student.to(dtype=torch.float16,device=torch.cuda.current_device())

In [4]:
summary(student, (1,3,112,112))

Layer (type:depth-idx)                        Output Shape              Param #
MobiFace                                      [1, 512]                  --
├─Conv2d: 1-1                                 [1, 64, 56, 56]           1,728
├─BatchNorm2d: 1-2                            [1, 64, 56, 56]           128
├─PReLU: 1-3                                  [1, 64, 56, 56]           1
├─DepthwiseSeparableConv2d: 1-4               [1, 64, 56, 56]           --
│    └─Conv2d: 2-1                            [1, 64, 56, 56]           576
│    └─Conv2d: 2-2                            [1, 64, 56, 56]           4,096
│    └─BatchNorm2d: 2-3                       [1, 64, 56, 56]           128
│    └─PReLU: 2-4                             [1, 64, 56, 56]           1
├─BatchNorm2d: 1-5                            [1, 64, 56, 56]           128
├─PReLU: 1-6                                  [1, 64, 56, 56]           (recursive)
├─BottleneckBlock: 1-7                        [1, 64, 28, 28]           --
│  

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
teacher =  torch.load("full_webcassia_finetuned_v2.pth") 



In [7]:
teacher.eval()

InceptionResnetV1(
  (conv2d_1a): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2a): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2b): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (maxpool_3a): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d_3b): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_4a): 

In [8]:
summary(teacher, (1,3,112,112))

Layer (type:depth-idx)                        Output Shape              Param #
InceptionResnetV1                             [1, 10575]                --
├─BasicConv2d: 1-1                            [1, 32, 55, 55]           --
│    └─Conv2d: 2-1                            [1, 32, 55, 55]           (864)
│    └─BatchNorm2d: 2-2                       [1, 32, 55, 55]           (64)
│    └─ReLU: 2-3                              [1, 32, 55, 55]           --
├─BasicConv2d: 1-2                            [1, 32, 53, 53]           --
│    └─Conv2d: 2-4                            [1, 32, 53, 53]           (9,216)
│    └─BatchNorm2d: 2-5                       [1, 32, 53, 53]           (64)
│    └─ReLU: 2-6                              [1, 32, 53, 53]           --
├─BasicConv2d: 1-3                            [1, 64, 53, 53]           --
│    └─Conv2d: 2-7                            [1, 64, 53, 53]           (18,432)
│    └─BatchNorm2d: 2-8                       [1, 64, 53, 53]           (128)

In [9]:
teacher.eval()

InceptionResnetV1(
  (conv2d_1a): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2a): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2b): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (maxpool_3a): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d_3b): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_4a): 

In [10]:
student.eval()

MobiFace(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (depthwise_conv): DepthwiseSeparableConv2d(
    (depthwise_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
    (pointwise_conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
  )
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bottleneck_block1): BottleneckBlock(
    (conv1): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
    (depthwise_conv): DepthwiseSeparableConv2d(
      (depthwise_conv): Conv2d(32, 32, kernel_size=(3, 3), stri

In [11]:
num_classes = 10575
student.fc = nn.Linear(in_features=512, out_features=num_classes).to(device)

KD Process

In [12]:
# Define data transformations
transform = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

In [21]:
# directory for test
#root_dir= "C:\\Users\\mathe\\OneDrive\\Área de Trabalho\\master\\TFM\\dataset\\faces_webface_112x112\\small_sample"
root_dir= "C:\\Users\\mathe\\OneDrive\\Área de Trabalho\\master\\TFM\\dataset\\faces_webface_112x112\\images"


batch_size = 64

# Create ImageFolder dataset
dataset = datasets.ImageFolder(root=root_dir, transform=transform)

# Create DataLoader for training
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Split the dataset into training and validation sets
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size


train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [16]:
student_model = student
teacher_model = teacher

teacher_model.to(device)
student_model.to(device)

MobiFace(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (depthwise_conv): DepthwiseSeparableConv2d(
    (depthwise_conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
    (pointwise_conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
  )
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bottleneck_block1): BottleneckBlock(
    (conv1): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): PReLU(num_parameters=1)
    (depthwise_conv): DepthwiseSeparableConv2d(
      (depthwise_conv): Conv2d(32, 32, kernel_size=(3, 3), stri

In [24]:
# Criterion, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()

# Test accuracy
teacher.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = teacher(inputs)
        test_loss = criterion(outputs, labels)

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

accuracy = total_correct / total_samples if total_samples > 0 else 0.0
print(f'Test Loss: {test_loss.item()}, Accuracy: {accuracy}')

Test Loss: 0.687118649482727, Accuracy: 0.9198032448297415


In [17]:
# Define the temperature parameter for knowledge distillation
temperature = 4.0  # You can adjust this value based on your needs
# Define the loss function and optimizer for training the student model
criterion = nn.CrossEntropyLoss()
optimizer_student = optim.Adam(student_model.parameters(), lr=0.001)

# Training loop
num_epochs = 10  # Adjust as needed
for epoch in range(num_epochs):
    student_model.train()

    #define running loss
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass for the teacher model (assuming it's already trained)
        with torch.no_grad():
            outputs_teacher = teacher_model(inputs)

        # Forward pass for the student model
        optimizer_student.zero_grad()
        outputs_student = student_model(inputs)

        #Calculates the knowledge distillation loss using the Kullback-Leibler (KL) Divergence loss.It measures the difference between two probability distributions. In this case, it calculates the KL Divergence between the log-softmax predictions of the student model and the softmax predictions of the teacher model.

        loss_distillation = nn.KLDivLoss(reduction='batchmean')(F.log_softmax(outputs_student / temperature, dim=1), #This part computes the logarithm of the softmax function applied to the output predictions of the student model divided by the temperature. The temperature is a hyperparameter that controls the smoothness of the probability distribution.
                                           F.softmax(outputs_teacher / temperature, dim=1)) #Similarly, this part computes the softmax function applied to the output predictions of the teacher model divided by the temperature.

        # Calculate the classification loss
        loss_classification = criterion(outputs_student, labels)

        # Total loss
        loss = loss_classification + loss_distillation

        # Backward and optimize
        loss.backward()
        optimizer_student.step()

        running_loss += loss.item()
        _, predicted = outputs_student.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()


        # Print batch statistics
        if batch_idx % 100 == 0:  # Adjust the interval for printing
            batch_accuracy = 100 * correct / total
            print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], '
                  f'Loss: {running_loss / (batch_idx + 1):.4f}, correct :{correct}, total: {total}, Batch Accuracy: {batch_accuracy:.2f}%')

    # Validation phase
    student_model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        for val_batch_idx, (val_inputs, val_labels) in enumerate(val_loader):
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)

            # Forward pass for the teacher model
            with torch.no_grad():
                val_outputs_teacher = teacher_model(val_inputs)

            # Forward pass for the student model
            val_outputs_student = student_model(val_inputs)

            # Calculate the classification loss for validation
            val_loss_classification = criterion(val_outputs_student, val_labels)

            # Calculate the knowledge distillation loss for validation
            val_loss_distillation = nn.KLDivLoss(reduction='batchmean')(
                F.log_softmax(val_outputs_student / temperature, dim=1),
                F.softmax(val_outputs_teacher / temperature, dim=1)
            )

            # Total loss for validation
            val_loss_batch = val_loss_classification + val_loss_distillation

            val_loss += val_loss_batch.item()

            _, val_predicted = val_outputs_student.max(1)
            val_total += val_labels.size(0)
            val_correct += val_predicted.eq(val_labels).sum().item()

        # Calculate validation accuracy
        val_accuracy = 100 * val_correct / val_total

        # Print validation statistics
        print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {val_loss / len(val_loader):.4f}, '
            f'Validation Accuracy: {val_accuracy:.2f}%')
            
            
    # Print training statistics
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}, '
          f'Accuracy: {100 * correct / total:.2f}%')



Epoch [1/10], Batch [1/5367], Loss: 10.1743, correct :0, total: 64, Batch Accuracy: 0.00%
Epoch [1/10], Batch [101/5367], Loss: 10.1203, correct :9, total: 6464, Batch Accuracy: 0.14%
Epoch [1/10], Batch [201/5367], Loss: 10.0228, correct :19, total: 12864, Batch Accuracy: 0.15%
Epoch [1/10], Batch [301/5367], Loss: 9.9555, correct :39, total: 19264, Batch Accuracy: 0.20%
Epoch [1/10], Batch [401/5367], Loss: 9.8972, correct :63, total: 25664, Batch Accuracy: 0.25%
Epoch [1/10], Batch [501/5367], Loss: 9.8459, correct :86, total: 32064, Batch Accuracy: 0.27%
Epoch [1/10], Batch [601/5367], Loss: 9.7932, correct :118, total: 38464, Batch Accuracy: 0.31%
Epoch [1/10], Batch [701/5367], Loss: 9.7374, correct :153, total: 44864, Batch Accuracy: 0.34%
Epoch [1/10], Batch [801/5367], Loss: 9.6820, correct :194, total: 51264, Batch Accuracy: 0.38%
Epoch [1/10], Batch [901/5367], Loss: 9.6289, correct :249, total: 57664, Batch Accuracy: 0.43%
Epoch [1/10], Batch [1001/5367], Loss: 9.5810, corr

In [22]:
# Evaluation loop
student_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = student_model(inputs)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print(f'Test Accuracy: {100 * correct / total:.2f}%')

Test Accuracy: 77.95%


In [23]:
torch.save(student_model, 'KD_full_mobiFace_like_v2.pth') 
torch.save(student_model.state_dict(), 'KD_dict_mobiFace_live_v2.pth') 