<a href="https://colab.research.google.com/github/Gustave-MB/my-torch/blob/main/HW3/P2/ArcFace_and_SphereFace_Starter_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torchsummary import summary
import torchvision #This library is used for image-based operations (Augmentations)

import os
import gc
from tqdm import tqdm
import math
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import glob

# Pick the compute device once: use the GPU when CUDA is usable, else the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print("Device: ", DEVICE)

# ArcFace Loss

[ArcFace: Additive Angular Margin Loss for Deep
Face Recognition](https://arxiv.org/pdf/1801.07698.pdf) [equation 3]

ArcFace Loss tries to maximize the geodesic distance on the hypersphere between features of different classes, making the features more separable. Here is a blog post that explains ArcFace Loss in detail: [link](https://medium.com/analytics-vidhya/face-recognition-and-arcface-additive-angular-margin-loss-for-deep-face-recognition-44abc56916c#:~:text=The%20ArcFace%20loss%20maximizes%20the,implemented%20with%20negligible%20computational%20overhead)

$$L_{afl} = - \log \frac{e^{s \cos(\theta_{y_i} + m)}}{e^{s \cos(\theta_{y_i} + m)} + \sum_{j=1,j \neq y_i}^N e^{s \cos(\theta_j)}}$$

Play around with the `margin` and `scaler` hyperparameters as they are instrumental to the performance of this loss in fine tuning your model.



In [None]:
class ArcFaceModel(torch.nn.Module):
    '''
    Wrap a trained classification model with an ArcFace (additive angular margin) head.

    The forward pass returns scaled logits ``s * cos(theta + m * one_hot(labels))``,
    where ``theta`` is the angle between the L2-normalised embedding and each
    L2-normalised class weight vector. Pass these logits to a cross-entropy loss
    to obtain the ArcFace loss.

    To train in a standard training loop make sure to modify the train function so you pass in the inputs and the labels
    i.e. output = model(images, labels)

    Args:
        model: backbone module. It must expose ``cls_layer.weight`` (used to
            initialise the head) and accept ``model(x, return_feats=True)``
            to return embeddings instead of logits.
        margin: additive angular margin ``m`` (radians) applied to the target class.
        scaler: scale ``s`` multiplied into the final cosine values.
        embedding_size: embedding dimension. If left as ``NotImplemented`` it is
            taken from ``model.cls_layer.weight.shape[1]``.
        num_classes: number of classes. If left as ``NotImplemented`` it is
            taken from ``model.cls_layer.weight.shape[0]``.
    '''
    def __init__(self, model, margin=0.5, scaler=64, embedding_size=NotImplemented, num_classes=NotImplemented):
        super(ArcFaceModel, self).__init__()

        # The head is initialised from model.cls_layer.weight, which must have the
        # shape of a Linear(embedding_size, num_classes) weight, i.e.
        # (num_classes, embedding_size). Use it to fill in any size left unset.
        if embedding_size is NotImplemented or num_classes is NotImplemented:
            inferred_classes, inferred_embedding = model.cls_layer.weight.shape
            if embedding_size is NotImplemented:
                embedding_size = inferred_embedding
            if num_classes is NotImplemented:
                num_classes = inferred_classes

        self.embedding_size = embedding_size
        self.num_classes = num_classes

        # small number to avoid invalid arcCos values
        self.eps = 1e-7

        # hyperparameters
        self.margin = margin
        self.scaler = scaler

        # load classification model
        self.model = model

        # Initializing the arcface linear layer with the weights of the classifier from the trained CNN.
        # bias=False: the logits must depend only on the angle between embedding and class weight.
        self.AFL_linear = torch.nn.Linear(embedding_size, num_classes, bias=False)
        with torch.no_grad():
            self.AFL_linear.weight.copy_(self.model.cls_layer.weight)

        # Initializing utility functions for normalization, arcCos, cos and onehot encoding
        self.normalizer = torch.nn.functional.normalize
        self.arcCos = torch.acos
        self.cos = torch.cos
        self.one_hot = torch.nn.functional.one_hot

    def forward(self, x, labels):
        '''
        Compute ArcFace logits for a batch.

        Args:
            x: input batch forwarded to the backbone.
            labels: integer class indices, one per sample
                (assumed shape (batch,), values in [0, num_classes)).

        Returns:
            Tensor of scaled logits with one column per class; feed it to
            cross-entropy loss in the training loop.
        '''
        # Get face embedding. Note that we pass return_feats=True to get the image's features and not the final logits.
        embedding = self.model(x, return_feats=True)

        # L2-normalise each embedding so that ||x|| = 1.
        embedding = self.normalizer(embedding, dim=-1)

        # L2-normalise each class weight vector (one row per class) so that ||W|| = 1.
        # This is done functionally instead of re-assigning a new torch.nn.Parameter:
        # replacing the Parameter object on every forward pass would leave an
        # already-constructed optimizer holding the stale parameter, so the head
        # would never be updated. Here gradients flow to the original parameter.
        weight = self.normalizer(self.AFL_linear.weight, dim=1)

        # With ||W|| = ||x|| = 1 the dot product Wx equals cos(theta).
        cosine = torch.nn.functional.linear(embedding, weight)

        # We clamp the values to be a little higher than -1 and a little lower than one so we don't get nan values when we call arccos
        cosine = torch.clamp(cosine, min=-1.0+self.eps, max=1.0-self.eps)

        # Recover the angle theta = arccos(cos(theta)).
        theta = self.arcCos(cosine)

        # One-hot encode the labels (one_hot requires an integer64 tensor).
        one_hot_labels = self.one_hot(labels.long(), num_classes=self.num_classes)
        # Mask holding m at the target-class position and 0 elsewhere.
        margin_mask = one_hot_labels.to(theta.dtype) * self.margin
        # Add the margin to the target-class angle only.
        theta_m = theta + margin_mask

        # calculate the cosine value for theta with margin added and scale with self.scaler
        logits = self.scaler * self.cos(theta_m) # this value is then passed to crossEntropyLoss in train loop to calculate arcface loss

        return logits

# SphereFace Loss
[SphereFace: Deep Hypersphere Embedding for Face Recognition](https://arxiv.org/pdf/1704.08063.pdf)

[SphereFace Revived:
Unifying Hyperspherical Face Recognition](https://arxiv.org/pdf/2109.05565.pdf)

$$L_{sfl} = - \log \frac{e^{s \cos(m\theta_{y_i})}}{e^{s \cos(m\theta_{y_i})} + \sum_{j=1,j \neq y_i}^N e^{s \cos(\theta_j)}}$$

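Notice that the only difference between the ArcFace loss and the SphereFace loss is that the target term $e^{s \cos(\theta_{y_i} + m)}$ becomes $e^{s \cos(m\theta_{y_i})}$: the margin is multiplied with the angle instead of added to it. You should be able to implement this based on the comments in the ArcFace loss, updating the `margin_mask` variable accordingly (because the mask is now multiplied with $\theta$, non-target positions must hold 1 rather than 0).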

Play around with the `margin` and `scaler` hyperparameters as they are instrumental to the performance of this loss in fine tuning your model.

Please note that this is a basic version of SphereFace loss. As you can read in the above listed papers, there are several modifications you can make to it.


In [None]:
class SphereFaceModel(torch.nn.Module):
    '''
    Wrap a trained classification model with a basic SphereFace (multiplicative angular margin) head.

    The forward pass returns scaled logits ``s * cos(theta * mask)``, where
    ``theta`` is the angle between the L2-normalised embedding and each
    L2-normalised class weight vector, and ``mask`` is ``m`` at the target class
    and 1 elsewhere. Pass these logits to a cross-entropy loss to obtain the
    SphereFace loss.

    To train in a standard training loop make sure to modify the train function so you pass in the inputs and the labels
    i.e. output = model(images, labels)

    NOTE: because the margin multiplies the angle, a value of ``m < 1`` (such as
    the default 0.5) makes ``cos(m * theta) >= cos(theta)`` on [0, pi], i.e. it
    relaxes the target instead of penalising it. The SphereFace paper uses
    ``m >= 1``; pass such a value explicitly. For ``m > 1`` the product
    ``m * theta`` can exceed pi, where cosine is no longer monotonic; this basic
    version does not correct for that.

    Args:
        model: backbone module. It must expose ``cls_layer.weight`` (used to
            initialise the head) and accept ``model(x, return_feats=True)``
            to return embeddings instead of logits.
        margin: multiplicative angular margin ``m`` applied to the target class.
        scaler: scale ``s`` multiplied into the final cosine values.
        embedding_size: embedding dimension. If left as ``NotImplemented`` it is
            taken from ``model.cls_layer.weight.shape[1]``.
        num_classes: number of classes. If left as ``NotImplemented`` it is
            taken from ``model.cls_layer.weight.shape[0]``.
    '''
    def __init__(self, model, margin=0.5, scaler=64, embedding_size=NotImplemented, num_classes=NotImplemented):
        super(SphereFaceModel, self).__init__()

        # The head is initialised from model.cls_layer.weight, which must have the
        # shape of a Linear(embedding_size, num_classes) weight, i.e.
        # (num_classes, embedding_size). Use it to fill in any size left unset.
        if embedding_size is NotImplemented or num_classes is NotImplemented:
            inferred_classes, inferred_embedding = model.cls_layer.weight.shape
            if embedding_size is NotImplemented:
                embedding_size = inferred_embedding
            if num_classes is NotImplemented:
                num_classes = inferred_classes

        self.embedding_size = embedding_size
        self.num_classes = num_classes

        # small number to avoid invalid arcCos values
        self.eps = 1e-7

        # hyperparameters
        self.margin = margin
        self.scaler = scaler

        # load classification model
        self.model = model

        # Initializing the angular-margin linear layer with the weights of the classifier from the trained CNN.
        # bias=False: the logits must depend only on the angle between embedding and class weight.
        self.AFL_linear = torch.nn.Linear(embedding_size, num_classes, bias=False)
        with torch.no_grad():
            self.AFL_linear.weight.copy_(self.model.cls_layer.weight)

        # Initializing utility functions for normalization, arcCos, cos and onehot encoding
        self.normalizer = torch.nn.functional.normalize
        self.arcCos = torch.acos
        self.cos = torch.cos
        self.one_hot = torch.nn.functional.one_hot

    def forward(self, x, labels):
        '''
        Compute SphereFace logits for a batch.

        Args:
            x: input batch forwarded to the backbone.
            labels: integer class indices, one per sample
                (assumed shape (batch,), values in [0, num_classes)).

        Returns:
            Tensor of scaled logits with one column per class; feed it to
            cross-entropy loss in the training loop.
        '''
        # Get face embedding. Note that we pass return_feats=True to get the image's features and not the final logits.
        embedding = self.model(x, return_feats=True)

        # L2-normalise each embedding so that ||x|| = 1.
        embedding = self.normalizer(embedding, dim=-1)

        # L2-normalise each class weight vector (one row per class) so that ||W|| = 1.
        # This is done functionally instead of re-assigning a new torch.nn.Parameter:
        # replacing the Parameter object on every forward pass would leave an
        # already-constructed optimizer holding the stale parameter, so the head
        # would never be updated. Here gradients flow to the original parameter.
        weight = self.normalizer(self.AFL_linear.weight, dim=1)

        # With ||W|| = ||x|| = 1 the dot product Wx equals cos(theta).
        cosine = torch.nn.functional.linear(embedding, weight)

        # We clamp the values to be a little higher than -1 and a little lower than one so we don't get nan values when we call arccos
        cosine = torch.clamp(cosine, min=-1.0+self.eps, max=1.0-self.eps)

        # Recover the angle theta = arccos(cos(theta)).
        theta = self.arcCos(cosine)

        # One-hot encode the labels (one_hot requires an integer64 tensor).
        one_hot_labels = self.one_hot(labels.long(), num_classes=self.num_classes).to(theta.dtype)
        # Mask holding m at the target-class position and 1 elsewhere: the mask is
        # multiplied with theta, so 1 (not 0) leaves non-target angles unchanged.
        margin_mask = one_hot_labels * (self.margin - 1.0) + 1.0
        # Multiply the target-class angle by the margin.
        theta_m = theta * margin_mask

        # calculate the cosine value for theta with margin multiplied and scale with self.scaler
        logits = self.scaler * self.cos(theta_m) # this value is then passed to crossEntropyLoss in train loop to calculate sphereface loss

        return logits
