In [1]:
import warnings

# Installs a global filter that ignores every warning category from here on.
# NOTE(review): this also hides pandas/numpy deprecation notices raised by later
# cells — confirm the blanket suppression is intended rather than a targeted filter.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

In [2]:
# Read in the survey data
# Later cells read the "dataset", "model", "prediction" and "label" columns of this frame.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle("../data/survey.pkl")

In [3]:
# Calculate Accuracy for each Model and Dataset
# Accuracy is the share of rows whose prediction equals the label. It is computed as a
# vectorised group mean of a boolean "correct" flag instead of a per-group Python lambda
# passed to groupby.apply (slower, and deprecated on recent pandas when the lambda sees
# the grouping columns).
# Result layout: one row per model, one column per dataset; (dataset, model) pairs that
# have no rows come out as NaN after unstack().
accs = (
    df.assign(correct=df["prediction"] == df["label"])
    .groupby(["dataset", "model"])["correct"]
    .mean()
    .unstack()
    .T
)
print(accs.to_markdown(floatfmt=".2%"))

| model      |   base |   calfw |   cplfw |    lfw |   mlfw |   sllfw |   xqlfw |
|:-----------|-------:|--------:|--------:|-------:|-------:|--------:|--------:|
| AIMax      | 85.00% |  95.97% |  92.53% | 99.80% | 89.48% |  97.45% |  94.43% |
| AIMean     | 85.00% |  95.72% |  92.38% | 99.82% | 88.60% |  97.12% |  94.58% |
| AIMin      | 75.00% |  94.05% |  88.97% | 99.60% | 76.72% |  95.15% |  87.78% |
| ArcFace*   | 67.50% |  93.85% |  88.37% | 99.55% | 73.53% |  94.90% |  93.27% |
| FaceTrans* | 82.50% |  94.93% |  91.58% | 99.73% | 85.63% |  96.78% |  95.12% |
| Human      | 67.25% |  66.23% |  68.38% |   nan% | 64.48% |    nan% |  70.19% |
| HumanMax   | 70.00% |    nan% |    nan% |   nan% |   nan% |    nan% |    nan% |
| HumanMean  | 70.00% |    nan% |    nan% |   nan% |   nan% |    nan% |    nan% |
| ProdPoly   | 82.50% |  96.03% |  92.75% | 99.80% | 91.30% |  97.47% |  86.90% |


In [4]:
# Make Evaluation with TP FP etc.
# Flag each row with its confusion-matrix outcome. Rows whose prediction or label is
# neither 0 nor 1 end up False in all four columns.
df["tp"] = (df["prediction"] == 1) & (df["label"] == 1)
df["fp"] = (df["prediction"] == 1) & (df["label"] == 0)
df["tn"] = (df["prediction"] == 0) & (df["label"] == 0)
df["fn"] = (df["prediction"] == 0) & (df["label"] == 1)

# Count the outcomes for each Dataset and Model.
# Selecting the four flag columns and using the built-in group sum avoids handing the
# grouping columns to a Python lambda via groupby.apply (deprecated on recent pandas)
# and keeps the aggregation vectorised. reset_index() turns the (dataset, model) group
# keys back into ordinary columns.
metrics = df.groupby(["dataset", "model"])[["tp", "fp", "tn", "fn"]].sum().reset_index()

# Print Metrics
print(metrics.to_markdown())

|    | dataset   | model      |   tp |   fp |   tn |   fn |
|---:|:----------|:-----------|-----:|-----:|-----:|-----:|
|  0 | base      | AIMax      |   15 |    1 |   19 |    5 |
|  1 | base      | AIMean     |   15 |    1 |   19 |    5 |
|  2 | base      | AIMin      |   14 |    4 |   16 |    6 |
|  3 | base      | ArcFace*   |   12 |    5 |   15 |    8 |
|  4 | base      | FaceTrans* |   15 |    2 |   18 |    5 |
|  5 | base      | Human      |  614 |  200 | 1000 |  586 |
|  6 | base      | HumanMax   |    8 |    0 |   20 |   12 |
|  7 | base      | HumanMean  |    8 |    0 |   20 |   12 |
|  8 | base      | ProdPoly   |   13 |    0 |   20 |    7 |
|  9 | calfw     | AIMax      | 2777 |   19 | 2981 |  223 |
| 10 | calfw     | AIMean     | 2773 |   30 | 2970 |  227 |
| 11 | calfw     | AIMin      | 2729 |   86 | 2914 |  271 |
| 12 | calfw     | ArcFace*   | 2729 |   98 | 2902 |  271 |
| 13 | calfw     | FaceTrans* | 2772 |   76 | 2924 |  228 |
| 14 | calfw     | Human      |  188 |  

In [5]:
# Score every (dataset, model) row from its confusion counts: accuracy, precision,
# sensitivity (i.e. recall), F1 and false-negative rate.
n_correct = metrics["tp"] + metrics["tn"]
n_total = n_correct + metrics["fp"] + metrics["fn"]
n_pred_pos = metrics["tp"] + metrics["fp"]
n_actual_pos = metrics["tp"] + metrics["fn"]

metrics["accuracy"] = n_correct / n_total
metrics["precision"] = metrics["tp"] / n_pred_pos
metrics["sensitivity"] = metrics["tp"] / n_actual_pos
# Harmonic mean of precision and sensitivity; NaN when both are zero (0 / 0).
metrics["f1"] = (2 * metrics["precision"] * metrics["sensitivity"]) / (metrics["precision"] + metrics["sensitivity"])
metrics["FNR"] = 1 - metrics["sensitivity"]

# Keep only the identifying columns and the derived scores
metrics_clean = metrics.drop(columns=["tp", "fp", "tn", "fn"])

# Print the rows of the base Dataset and copy the same rows to the clipboard
# NOTE(review): to_clipboard() needs a system clipboard backend and may fail on a
# headless kernel — confirm this cell is only run locally.
base_metrics = metrics_clean[metrics_clean["dataset"] == "base"]
print(base_metrics.to_markdown(floatfmt=".2%"))
base_metrics.to_clipboard()

|    | dataset   | model      |   accuracy |   precision |   sensitivity |     f1 |    FNR |
|---:|:----------|:-----------|-----------:|------------:|--------------:|-------:|-------:|
|  0 | base      | AIMax      |     85.00% |      93.75% |        75.00% | 83.33% | 25.00% |
|  1 | base      | AIMean     |     85.00% |      93.75% |        75.00% | 83.33% | 25.00% |
|  2 | base      | AIMin      |     75.00% |      77.78% |        70.00% | 73.68% | 30.00% |
|  3 | base      | ArcFace*   |     67.50% |      70.59% |        60.00% | 64.86% | 40.00% |
|  4 | base      | FaceTrans* |     82.50% |      88.24% |        75.00% | 81.08% | 25.00% |
|  5 | base      | Human      |     67.25% |      75.43% |        51.17% | 60.97% | 48.83% |
|  6 | base      | HumanMax   |     70.00% |     100.00% |        40.00% | 57.14% | 60.00% |
|  7 | base      | HumanMean  |     70.00% |     100.00% |        40.00% | 57.14% | 60.00% |
|  8 | base      | ProdPoly   |     82.50% |     100.00% |        65.0