In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell execution, so edits to the project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Définition du seuil

source: [Fine tuning the threshold in face recognition](https://sefiks.com/2020/05/22/fine-tuning-the-threshold-in-face-recognition/)

## Génération des identités

In [None]:
import os
import pandas as pd
import itertools
import cv2
from chefboost import Chefboost as chef
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from matplotlib import pyplot as plt
import multiprocessing as mp
# When the notebook is started from the `tests` folder, move the working
# directory one level up — presumably so that the `services.*` imports below
# resolve from the project root (TODO confirm the expected layout).
# os.path is used rather than stripping the literal "\\tests" substring: the
# substring approach only works with Windows separators and would also alter
# any other path component that merely starts with "tests".
current_dir = os.getcwd()
new_dir = (
    os.path.dirname(current_dir)
    if os.path.basename(current_dir) == "tests"
    else current_dir
)
os.chdir(new_dir)

from services.faces.face_detector import FaceDetector
from services.faces.comparators.yolo_comparator import YoloComparator
from services.images.image_editor import ImageEditor

In [None]:
# Root folder of the evaluation images, passed to get_person_files() below.
# NOTE(review): machine-specific absolute Windows path — adjust it (or load it
# from configuration) before running the notebook on another machine.
dataset_root = "C:\\Users\\Naofel\\Downloads\\VGG-Face2\\exp10\\val"

def get_person_files(dataset_root) -> dict:
    """Map every identity folder under ``dataset_root`` to its JPEG files.

    Every directory met while walking ``dataset_root`` is treated as one
    identity, keyed by the directory's own name (not its full path, so two
    directories sharing a name at different depths share one key and the one
    visited last wins).

    Args:
        dataset_root: Path of the folder that contains one sub-folder per person.

    Returns:
        dict: ``{person_name: [relative_path, ...]}`` where each relative path
        is ``<person_name><separator><file>``. Only files whose name ends with
        ``.jpg`` are kept; a folder without any is mapped to an empty list.
        A ``dataset_root`` that does not exist yields an empty dict.
    """
    person_files = {}

    for root, dirs, _files in os.walk(dataset_root):
        # Sorting in place fixes the traversal order, so the identity order
        # (and every pair list derived from it) is reproducible across runs.
        dirs.sort()
        for person_name in dirs:
            person_dir = os.path.join(root, person_name)
            person_files[person_name] = [
                # os.path.join instead of a hard-coded "\\" keeps the relative
                # paths usable on non-Windows systems as well.
                os.path.join(person_name, file_name)
                for file_name in sorted(os.listdir(person_dir))
                if file_name.endswith(".jpg")
            ]

    return person_files

# Build the identity -> image list mapping for the whole dataset.
identities = get_person_files(dataset_root)

# NOTE(review): this prints every collected path, which can flood the cell
# output on a large dataset.
print(identities)

## Génération de paires positives

In [None]:
# Positive pairs: every unordered pair of distinct images that belong to the
# same identity. itertools.combinations yields them in the same (i < j) order
# as two nested index loops would.
same_person_pairs = [
    [first_image, second_image]
    for person_images in identities.values()
    for first_image, second_image in itertools.combinations(person_images, 2)
]

positives = pd.DataFrame(same_person_pairs, columns=["file_x", "file_y"])
positives["decision"] = "Yes"

positives

## Génération de paires négatives

In [None]:
# Negative pairs: every image of one identity paired with every image of each
# *other* identity. The number of pairs grows quadratically with the number of
# identities.
samples_list = list(identities.values())

negatives = []
# itertools.combinations visits each unordered pair of identities exactly once.
# The pairs must be appended for every such combination: appending only after
# all combinations of one identity have been visited would keep nothing but
# its pairs with the last identity.
for first_person_files, second_person_files in itertools.combinations(samples_list, 2):
    for file_x, file_y in itertools.product(first_person_files, second_person_files):
        negatives.append([file_x, file_y])

negatives = pd.DataFrame(negatives, columns = ["file_x", "file_y"])
negatives["decision"] = "No"

negatives

## Merge pairs

In [None]:
# Stack the positive and negative pairs into a single frame with a fresh
# 0..n-1 index. `file_x` / `file_y` are left exactly as produced by the pair
# generation cells.
df: pd.DataFrame = pd.concat([positives, negatives]).reset_index(drop = True)

df

## Distances between pairs

### Comparisons

In [None]:
# Instantiate the project's face comparator and face detector.
# NOTE(review): both classes come from the `services` package and their
# constructors are not visible here — presumably they load the underlying
# models, so this cell may be slow; confirm.
face_comparator = YoloComparator()
face_detector = FaceDetector()

In [None]:
# Sanity check: compare the number of computed distances with the number of pairs.
# NOTE(review): `distances` (and the `distance` column of `df` used by the
# following cells) is not created by any cell visible in this part of the
# notebook — the cell that computes the pair distances appears to be missing,
# so this fails on a fresh kernel. TODO restore it.
print(len(distances))
print(len(df))

In [None]:
# Drop the pairs whose distance is below 0.1: the notebook treats them as
# probably showing the same person (presumably near-duplicate images or
# mislabelled pairs — TODO confirm the intent of this cut-off).

print("Before:", len(df))
# .copy() detaches the filtered frame from the one it was sliced from, so the
# columns assigned to `df` in later cells do not raise SettingWithCopyWarning.
df = df[df["distance"] >= 0.1].copy()
print("After:", len(df))

### Analyze

In [None]:
# Distances of the pairs labelled "Yes" (same person) and "No" (different people).
yes_distances = df.loc[df.decision == "Yes", "distance"]
no_distances = df.loc[df.decision == "No", "distance"]

# Mean and standard deviation of each group, rounded to 4 decimals.
tp_mean, tp_std = round(yes_distances.mean(), 4), round(yes_distances.std(), 4)
fp_mean, fp_std = round(no_distances.mean(), 4), round(no_distances.std(), 4)

print(f"True positive mean: {tp_mean} - True positive std: {tp_std}")
print(f"False positive mean: {fp_mean} - False positive std: {fp_std}")

In [None]:
# Kernel density estimate of the distances for both pair types on shared axes.
# Each curve is labelled and the figure gets a title, axis labels and a legend,
# so the two distributions can be told apart when the notebook is skimmed.
fig, ax = plt.subplots()
df.loc[df.decision == "Yes", "distance"].plot.kde(ax=ax, label='Same person ("Yes")')
df.loc[df.decision == "No", "distance"].plot.kde(ax=ax, label='Different people ("No")')
ax.set(title="Distance distribution per pair type", xlabel="distance", ylabel="density")
ax.legend();

### Threshold calculation

In [None]:
# Candidate thresholds: the mean distance of the "Yes" pairs plus 2 and 3 of
# their standard deviations, rounded to 4 decimals. For a normal distribution
# the ±2σ and ±3σ intervals cover 95.45% and 99.73% of the values.
sigma_2_threshold, sigma_3_threshold = (
    round(tp_mean + n_sigma * tp_std, 4) for n_sigma in (2, 3)
)
print(f"2 Sigma threshold: {sigma_2_threshold}")
print(f"3 Sigma threshold: {sigma_3_threshold}")

In [None]:
# Fit a C4.5 decision tree on the single `distance` feature.
# This generates an output folder and a rules.py file containing the
# calculated threshold.

config = {'algorithm': 'C4.5'}
# The target column is renamed to "Decision" (presumably the name chefboost
# uses by default for the target — confirm against the installed version).
# rename() already returns a new frame, so no extra copy is needed.
tmp_df = df[['distance', 'decision']].rename(columns={"decision": "Decision"})
print(tmp_df)
# target_label must name the target column as it exists in tmp_df, i.e. after
# the rename above; 'decision' no longer exists in that frame.
model = chef.fit(df=tmp_df, config=config, target_label='Decision')

In [None]:
# Distance threshold attributed to the decision tree.
# NOTE(review): hard-coded value — presumably copied by hand from the rules.py
# file generated by the chefboost fit; it has to be updated whenever the model
# is re-fitted on different data.
decision_tree_threshold = 0.3336793708127679

### Verification

In [None]:
# `prediction` is initialised to "No" for every row; the cells visible here do
# not read it afterwards.
df["prediction"] = "No"

# One prediction column per candidate threshold: a pair is predicted 'Yes'
# when its distance does not exceed the threshold, 'No' when it does.
prediction_thresholds = {
    'decision_tree_prediction': decision_tree_threshold,
    'sigma_2_prediction': sigma_2_threshold,
    'sigma_3_prediction': sigma_3_threshold,
}
for prediction_column, cutoff in prediction_thresholds.items():
    df.loc[df.distance <= cutoff, prediction_column] = 'Yes'
    df.loc[df.distance > cutoff, prediction_column] = 'No'

df

### Evaluation

In [None]:
# Confusion matrix of each threshold against the ground-truth `decision`.
# The label order is pinned to ["No", "Yes"] so that every matrix is 2x2 even
# when one class is absent from the data, and so that flattening a matrix
# always yields (tn, fp, fn, tp) with "Yes" as the positive class.
CM_LABELS = ["No", "Yes"]
decision_tree_cm = confusion_matrix(df.decision.values, df.decision_tree_prediction.values, labels=CM_LABELS)
sigma_2_cm = confusion_matrix(df.decision.values, df.sigma_2_prediction.values, labels=CM_LABELS)
sigma_3_cm = confusion_matrix(df.decision.values, df.sigma_3_prediction.values, labels=CM_LABELS)

def plot_cm_with_metrics(cm, title, threshold):
    """Draw a confusion matrix next to the metrics derived from it.

    Args:
        cm: Confusion matrix; it is flattened and unpacked as
            (tn, fp, fn, tp), so it must hold exactly four cells.
        title: Title shown above the heatmap.
        threshold: Threshold that produced the predictions; it is only
            displayed alongside the metrics.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    tn, fp, fn, tp = cm.ravel()
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tn + fp + fn + tp)
    f1 = 2 * (precision * recall) / (precision + recall)

    n_rows, n_cols = cm.shape[0], cm.shape[1]
    plt.figure(figsize=(n_rows + 2, n_cols))

    # Left panel: annotated heatmap of the matrix.
    plt.subplot(1, 2, 1)
    heatmap(cm, annot=True, fmt="d", cmap="RdYlBu_r")
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(title)

    # Right panel: the metrics written as text on an axis-less panel,
    # listed from top to bottom.
    plt.subplot(1, 2, 2)
    plt.axis('off')
    metric_lines = (
        (0.8, f"Recall: {recall:.3f}"),
        (0.6, f"Precision: {precision:.3f}"),
        (0.4, f"Accuracy: {accuracy:.3f}"),
        (0.2, f"F1: {f1:.3f}"),
        (0, f"Threshold: {threshold:.4f}"),
    )
    for y_position, line in metric_lines:
        plt.text(0, y_position, line, fontsize=10, ha='left')

    plt.tight_layout()
    plt.show()

# Render one confusion-matrix figure per candidate threshold, in the order
# decision tree, 2-sigma, 3-sigma.
for matrix, figure_title, cutoff in (
    (decision_tree_cm, "Decision tree confusion matrix", decision_tree_threshold),
    (sigma_2_cm, "2-Sigma confusion matrix", sigma_2_threshold),
    (sigma_3_cm, "3-Sigma confusion matrix", sigma_3_threshold),
):
    plot_cm_with_metrics(matrix, figure_title, threshold=cutoff)