In [2]:
import pathlib
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from compare_w_reference import compare_w_ref
import json
import numpy as np
import sys
sys.path.append('../')
from helper.display_cluster import show_cluster, split_ids, locate_and_plot_image, plot_overview_cluster, faceId_to_ogId

# load the clustering

## DBSCAN

In [11]:
# Evaluate every DBSCAN clustering result found in `df_folder` against the
# reference clusters and append one row of metrics per configuration to a
# summary CSV. Configurations already present in the summary are skipped, so
# the cell can be re-run after an interruption.
reference_clusters_path = pathlib.Path("../reference_clusters")

base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

src_folder = base_path / "img"
faces_folder = base_path / "faces"
df_folder = base_path / "results_dbscan" / "df"

# only clustering files produced with this model are evaluated in this run
model_name = "Facenet"

# ==================== DBSCAN ====================
clustering_algo = "DBSCAN"
distance_metric = "cosine"

# The summary file depends only on the filters above, so build its path once.
summary_path = pathlib.Path(f"compared_w_40k_{clustering_algo}_{distance_metric}_{model_name}.csv")

# loop over all files in df_folder
for cluster_path in tqdm(list(df_folder.glob("*.csv"))):
    # file stem layout: <prefix>_<model>_<algo>_<metric>_<min_samples>_<threshold>
    try:
        c_model_name, c_clustering_algo, c_distance_metric, c_min_samples, c_threshold = cluster_path.stem.split("_")[1:]
        c_min_samples = int(c_min_samples)
        c_threshold = float(c_threshold)
    except ValueError:
        # Wrong number of name parts or non-numeric parameters: the file uses
        # the naming scheme of another algorithm. Only ValueError is caught so
        # that genuine errors (and Ctrl-C) are no longer silently swallowed.
        continue

    # keep only the requested algorithm / distance metric / model
    if c_clustering_algo != clustering_algo:
        continue
    if c_distance_metric != distance_metric:
        continue
    if c_model_name != model_name:
        continue

    # Read the existing summary once; it serves both the "already computed"
    # check and the append further down.
    summary_exists = summary_path.exists()
    if summary_exists:
        df_summary = pd.read_csv(summary_path)

        already_computed = (
            (df_summary["clustering_algo"] == c_clustering_algo)
            & (df_summary["model_name"] == c_model_name)
            & (df_summary["distance_metric"] == c_distance_metric)
            & (df_summary["min_samples"] == c_min_samples)
            & (df_summary["threshold"] == c_threshold)
        ).any()

        if already_computed:
            continue

    # read the cluster file
    df = pd.read_csv(cluster_path, usecols=["image", "cluster_label"])
    df["face_id"] = df["image"].apply(pathlib.Path).apply(lambda x: x.stem)

    total_tp, total_fn, total_fp, df_stats = compare_w_ref(reference_clusters_path, df, faces_folder=faces_folder, src_folder=src_folder)

    # Guard every ratio against an empty denominator; such a configuration
    # scores 0.0 instead of raising ZeroDivisionError / producing NaN.
    predicted_positives = total_tp + total_fp
    actual_positives = total_tp + total_fn
    total_precision = total_tp / predicted_positives if predicted_positives else 0.0
    total_recall = total_tp / actual_positives if actual_positives else 0.0
    precision_plus_recall = total_precision + total_recall
    total_f1 = 2 * (total_precision * total_recall) / precision_plus_recall if precision_plus_recall else 0.0

    summary = {
        "model_name": c_model_name,
        "clustering_algo": c_clustering_algo,
        "distance_metric": c_distance_metric,
        "min_samples": c_min_samples,
        "threshold": c_threshold,
        "total_tp": total_tp,
        "total_fn": total_fn,
        "total_fp": total_fp,
        "precision": total_precision,
        "recall": total_recall,
        "f1": total_f1
    }

    new_row = pd.DataFrame([summary])
    if summary_exists:
        df_summary = pd.concat([df_summary, new_row], ignore_index=True)
    else:
        df_summary = new_row

    # persist after every file so an interrupted run loses at most one result
    df_summary.to_csv(summary_path, index=False)

100%|██████████| 180/180 [02:35<00:00,  1.16it/s]


## DBSCAN + AHC

In [26]:
# Evaluate every DBSCAN + AHC clustering result found in `df_folder` against
# the reference clusters and append one row of metrics per configuration to a
# summary CSV. Configurations already present in the summary are skipped, so
# the cell can be re-run after an interruption.
reference_clusters_path = pathlib.Path("../reference_clusters")

base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

src_folder = base_path / "img"
faces_folder = base_path / "faces"
df_folder = base_path / "results_dbscan" / "dbscan_ahc"

# The file names parsed below carry no model field; every result is recorded
# under this model name.
model_name = "Facenet512"

# ==================== DBSCAN + AHC ====================
clustering_algo = "DBSCAN"
distance_metric = "cosine"

# The summary file depends only on the distance metric filter, so build its
# path once.
summary_path = pathlib.Path(f"compared_w_40k_DBSCAN+AHC_{distance_metric}.csv")

# loop over all files in df_folder
for cluster_path in tqdm(list(df_folder.glob("*.csv"))):
    # file stem layout, e.g. cluster_DBSCAN_AHC_cosine_complete_0.34
    try:
        c_clustering_algo1, c_clustering_algo2, c_distance_metric, c_linkage, c_threshold = cluster_path.stem.split("_")[1:]
        c_threshold = float(c_threshold)
    except ValueError:
        # Wrong number of name parts or a non-numeric threshold: the file does
        # not follow the DBSCAN + AHC naming scheme. Only ValueError is caught
        # so that genuine errors (and Ctrl-C) are no longer silently swallowed.
        continue

    # only consider DBSCAN + AHC
    if c_clustering_algo1 != clustering_algo:
        continue
    if c_clustering_algo2 != "AHC":
        continue

    # only use cosine distance
    if c_distance_metric != distance_metric:
        continue

    # Read the existing summary once; it serves both the "already computed"
    # check and the append further down.
    summary_exists = summary_path.exists()
    if summary_exists:
        df_summary = pd.read_csv(summary_path)

        already_computed = (
            (df_summary["clustering_algo"] == "DBSCAN+AHC")
            & (df_summary["model_name"] == model_name)
            & (df_summary["distance_metric"] == c_distance_metric)
            & (df_summary["linkage"] == c_linkage)
            & (df_summary["threshold"] == c_threshold)
        ).any()

        if already_computed:
            continue

    # read the cluster file
    df = pd.read_csv(cluster_path, usecols=["image", "cluster_label"])
    df["face_id"] = df["image"].apply(pathlib.Path).apply(lambda x: x.stem)

    total_tp, total_fn, total_fp, df_stats = compare_w_ref(reference_clusters_path, df, faces_folder=faces_folder, src_folder=src_folder)

    # Guard every ratio against an empty denominator; such a configuration
    # scores 0.0 instead of raising ZeroDivisionError / producing NaN.
    predicted_positives = total_tp + total_fp
    actual_positives = total_tp + total_fn
    total_precision = total_tp / predicted_positives if predicted_positives else 0.0
    total_recall = total_tp / actual_positives if actual_positives else 0.0
    precision_plus_recall = total_precision + total_recall
    total_f1 = 2 * (total_precision * total_recall) / precision_plus_recall if precision_plus_recall else 0.0

    summary = {
        "model_name": model_name,
        "clustering_algo": "DBSCAN+AHC",
        "distance_metric": c_distance_metric,
        "linkage": c_linkage,
        "threshold": c_threshold,
        "total_tp": total_tp,
        "total_fn": total_fn,
        "total_fp": total_fp,
        "precision": total_precision,
        "recall": total_recall,
        "f1": total_f1
    }

    new_row = pd.DataFrame([summary])
    if summary_exists:
        df_summary = pd.concat([df_summary, new_row], ignore_index=True)
    else:
        df_summary = new_row

    # persist after every file so an interrupted run loses at most one result
    df_summary.to_csv(summary_path, index=False)

100%|██████████| 33/33 [01:09<00:00,  2.10s/it]


# Compare with reference

In [16]:
# distinct threshold values present in the current summary
df_summary.loc[:, "threshold"].unique()

array([0.18  , 0.19  , 0.191 , 0.192 , 0.1925, 0.195 , 0.1975, 0.2   ,
       0.21  , 0.22  , 0.23  , 0.235 , 0.24  , 0.245 , 0.25  , 0.26  ,
       0.27  , 0.28  , 0.29  , 0.3   , 0.2375])

In [20]:
# ten best configurations, ranked by precision and then by F1
df_summary.sort_values(["precision", "f1"], ascending=False).head(10)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
4,Facenet512,DBSCAN,cosine,2,0.1925,775,261,0,1.0,0.748069,0.855881
68,Facenet512,DBSCAN,cosine,3,0.1925,775,261,0,1.0,0.748069,0.855881
3,Facenet512,DBSCAN,cosine,2,0.192,774,262,0,1.0,0.747104,0.855249
67,Facenet512,DBSCAN,cosine,3,0.192,774,262,0,1.0,0.747104,0.855249
2,Facenet512,DBSCAN,cosine,2,0.191,768,268,0,1.0,0.741313,0.851441
66,Facenet512,DBSCAN,cosine,3,0.191,768,268,0,1.0,0.741313,0.851441
1,Facenet512,DBSCAN,cosine,2,0.19,763,273,0,1.0,0.736486,0.848249
65,Facenet512,DBSCAN,cosine,3,0.19,763,273,0,1.0,0.736486,0.848249
0,Facenet512,DBSCAN,cosine,2,0.18,719,317,0,1.0,0.694015,0.819373
64,Facenet512,DBSCAN,cosine,3,0.18,719,317,0,1.0,0.694015,0.819373


In [18]:
# ten best configurations, ranked by F1
df_summary.sort_values("f1", ascending=False).head(10)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
41,Facenet512,DBSCAN,cosine,5,0.24,955,81,32,0.967579,0.921815,0.944142
52,Facenet512,DBSCAN,cosine,5,0.2375,942,94,28,0.971134,0.909266,0.939182
34,Facenet512,DBSCAN,cosine,4,0.24,956,80,47,0.953141,0.92278,0.937715
32,Facenet512,DBSCAN,cosine,4,0.23,930,106,20,0.978947,0.897683,0.936556
10,Facenet512,DBSCAN,cosine,2,0.23,939,97,31,0.968041,0.906371,0.936191
74,Facenet512,DBSCAN,cosine,3,0.23,939,97,31,0.968041,0.906371,0.936191
33,Facenet512,DBSCAN,cosine,4,0.235,939,97,36,0.963077,0.906371,0.933864
39,Facenet512,DBSCAN,cosine,3,0.24,962,74,63,0.938537,0.928571,0.933527
12,Facenet512,DBSCAN,cosine,2,0.24,962,74,63,0.938537,0.928571,0.933527
0,Facenet512,DBSCAN,cosine,3,0.235,945,91,48,0.951662,0.912162,0.931493


# Strategy 0: DBSCAN

In [3]:
# Empty accumulator; later cells append one row per strategy to it.
best = pd.DataFrame(
    columns=[
        "strategy",
        "total_tp", "total_fn", "total_fp",
        "precision", "recall", "f1",
    ]
)

## DBSCAN: cosine, facenet512

In [4]:
# summary of the DBSCAN / cosine / Facenet512 evaluation
df_summary = pd.read_csv(filepath_or_buffer="compared_w_40k_DBSCAN_cosine_Facenet512.csv")

In [5]:
# five best configurations, ranked by precision and then by F1
df_summary.sort_values(["precision", "f1"], ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
4,Facenet512,DBSCAN,cosine,2,0.1925,775,261,0,1.0,0.748069,0.855881
68,Facenet512,DBSCAN,cosine,3,0.1925,775,261,0,1.0,0.748069,0.855881
3,Facenet512,DBSCAN,cosine,2,0.192,774,262,0,1.0,0.747104,0.855249
67,Facenet512,DBSCAN,cosine,3,0.192,774,262,0,1.0,0.747104,0.855249
2,Facenet512,DBSCAN,cosine,2,0.191,768,268,0,1.0,0.741313,0.851441


In [6]:
# LaTeX table of the five best configurations (precision, then F1);
# floats are printed with 4 decimal places
print(
    df_summary
    .sort_values(["precision", "f1"], ascending=False)
    .head(5)
    .to_latex(
        columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{rrrrrrrr}
\toprule
 min\_samples &  threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
           2 &     0.1925 &       775 &       261 &         0 &     1.0000 &  0.7481 & 0.8559 \\
           3 &     0.1925 &       775 &       261 &         0 &     1.0000 &  0.7481 & 0.8559 \\
           2 &     0.1920 &       774 &       262 &         0 &     1.0000 &  0.7471 & 0.8552 \\
           3 &     0.1920 &       774 &       262 &         0 &     1.0000 &  0.7471 & 0.8552 \\
           2 &     0.1910 &       768 &       268 &         0 &     1.0000 &  0.7413 & 0.8514 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["precision", "f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [7]:
# record the best-precision Facenet512 DBSCAN configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "0.512 (pre)",
        "total_tp": 775,
        "total_fn": 261,
        "total_fp": 0,
        "precision": 1.0,
        "recall": 0.748069,
        "f1": 0.855881,
    }]),
])

In [8]:
# five best configurations, ranked by F1
df_summary.sort_values("f1", ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
41,Facenet512,DBSCAN,cosine,5,0.24,955,81,32,0.967579,0.921815,0.944142
52,Facenet512,DBSCAN,cosine,5,0.2375,942,94,28,0.971134,0.909266,0.939182
34,Facenet512,DBSCAN,cosine,4,0.24,956,80,47,0.953141,0.92278,0.937715
32,Facenet512,DBSCAN,cosine,4,0.23,930,106,20,0.978947,0.897683,0.936556
10,Facenet512,DBSCAN,cosine,2,0.23,939,97,31,0.968041,0.906371,0.936191


In [9]:
# record the best-F1 Facenet512 DBSCAN configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "0.512 (f1)",
        "total_tp": 955,
        "total_fn": 81,
        "total_fp": 32,
        "precision": 0.967579,
        "recall": 0.921815,
        "f1": 0.944142,
    }]),
])

In [10]:
# LaTeX table of the five best configurations by F1;
# floats are printed with 4 decimal places
print(
    df_summary
    .sort_values("f1", ascending=False)
    .head(5)
    .to_latex(
        columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{rrrrrrrr}
\toprule
 min\_samples &  threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
           5 &     0.2400 &       955 &        81 &        32 &     0.9676 &  0.9218 & 0.9441 \\
           5 &     0.2375 &       942 &        94 &        28 &     0.9711 &  0.9093 & 0.9392 \\
           4 &     0.2400 &       956 &        80 &        47 &     0.9531 &  0.9228 & 0.9377 \\
           4 &     0.2300 &       930 &       106 &        20 &     0.9789 &  0.8977 & 0.9366 \\
           2 &     0.2300 &       939 &        97 &        31 &     0.9680 &  0.9064 & 0.9362 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


## DBSCAN: cosine, facenet(128)

In [11]:
# summary of the DBSCAN / cosine / Facenet evaluation
df_summary = pd.read_csv(filepath_or_buffer="compared_w_40k_DBSCAN_cosine_Facenet.csv")

In [12]:
# the five rows with the smallest thresholds
df_summary.sort_values("threshold").head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
3,Facenet,DBSCAN,cosine,2,0.16,579,457,24904,0.022721,0.55888,0.043667
54,Facenet,DBSCAN,cosine,5,0.16,880,156,1051991,0.000836,0.849421,0.00167
37,Facenet,DBSCAN,cosine,4,0.16,765,271,677736,0.001127,0.738417,0.002252
20,Facenet,DBSCAN,cosine,3,0.16,653,383,283567,0.002298,0.630309,0.004578
73,Facenet,DBSCAN,cosine,4,0.165,763,273,644995,0.001182,0.736486,0.002359


In [13]:
# LaTeX table of the five best configurations (precision, then F1)
print(
    df_summary
    .sort_values(["precision", "f1"], ascending=False)
    .head(5)
    .to_latex(
        columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{rrrrrrrr}
\toprule
 min\_samples &  threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
           2 &     0.1700 &       610 &       426 &         1 &     0.9984 &  0.5888 & 0.7407 \\
           2 &     0.1675 &       597 &       439 &         1 &     0.9983 &  0.5763 & 0.7307 \\
           2 &     0.1975 &       721 &       315 &         2 &     0.9972 &  0.6959 & 0.8198 \\
           2 &     0.1950 &       710 &       326 &         2 &     0.9972 &  0.6853 & 0.8124 \\
           2 &     0.1925 &       700 &       336 &         2 &     0.9972 &  0.6757 & 0.8055 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["precision", "f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [14]:
# five best configurations, ranked by precision and then by F1
df_summary.sort_values(["precision", "f1"], ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
4,Facenet,DBSCAN,cosine,2,0.17,610,426,1,0.998363,0.588803,0.740741
70,Facenet,DBSCAN,cosine,2,0.1675,597,439,1,0.998328,0.576255,0.730722
9,Facenet,DBSCAN,cosine,2,0.1975,721,315,2,0.997234,0.695946,0.819784
8,Facenet,DBSCAN,cosine,2,0.195,710,326,2,0.997191,0.685328,0.812357
7,Facenet,DBSCAN,cosine,2,0.1925,700,336,2,0.997151,0.675676,0.805524


In [15]:
# LaTeX table of the five best configurations by F1
print(
    df_summary
    .sort_values("f1", ascending=False)
    .head(5)
    .to_latex(
        columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{rrrrrrrr}
\toprule
 min\_samples &  threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
           2 &     0.2200 &       804 &       232 &         7 &     0.9914 &  0.7761 & 0.8706 \\
           3 &     0.2200 &       804 &       232 &         7 &     0.9914 &  0.7761 & 0.8706 \\
           2 &     0.2100 &       769 &       267 &         6 &     0.9923 &  0.7423 & 0.8493 \\
           2 &     0.2000 &       728 &       308 &         5 &     0.9932 &  0.7027 & 0.8231 \\
           2 &     0.1975 &       721 &       315 &         2 &     0.9972 &  0.6959 & 0.8198 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [16]:
# record the best-precision Facenet DBSCAN configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "0.128 (pre)",
        "total_tp": 610,
        "total_fn": 426,
        "total_fp": 1,
        "precision": 0.998363,
        "recall": 0.588803,
        "f1": 0.740741,
    }]),
])

In [17]:
# record the best-F1 Facenet DBSCAN configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "0.128 (f1)",
        "total_tp": 804,
        "total_fn": 232,
        "total_fp": 7,
        "precision": 0.991369,
        "recall": 0.776062,
        "f1": 0.870601,
    }]),
])

In [18]:
# five best configurations, ranked by F1
df_summary.sort_values("f1", ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
11,Facenet,DBSCAN,cosine,2,0.22,804,232,7,0.991369,0.776062,0.870601
1,Facenet,DBSCAN,cosine,3,0.22,804,232,7,0.991369,0.776062,0.870601
10,Facenet,DBSCAN,cosine,2,0.21,769,267,6,0.992258,0.742278,0.849255
0,Facenet,DBSCAN,cosine,2,0.2,728,308,5,0.993179,0.702703,0.823064
9,Facenet,DBSCAN,cosine,2,0.1975,721,315,2,0.997234,0.695946,0.819784


# Strategy 3 : DBSCAN + AHC

* Facenet512
* DBSCAN: cosine, min_samples=5, eps=0.24 (threshold) => best f1 score
* AHC: cosine + various linkage and threshold

In [19]:
# summary of the DBSCAN + AHC / cosine evaluation
df_summary = pd.read_csv(filepath_or_buffer="compared_w_40k_DBSCAN+AHC_cosine.csv")

In [20]:
# five best configurations, ranked by precision and then by F1
df_summary.sort_values(["precision", "f1"], ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,linkage,threshold,total_tp,total_fn,total_fp,precision,recall,f1
25,Facenet512,DBSCAN+AHC,cosine,average,0.24,677,359,0,1.0,0.653475,0.790426
24,Facenet512,DBSCAN+AHC,cosine,average,0.23,659,377,0,1.0,0.6361,0.777581
23,Facenet512,DBSCAN+AHC,cosine,average,0.22,629,407,0,1.0,0.607143,0.755556
22,Facenet512,DBSCAN+AHC,cosine,average,0.21,612,424,0,1.0,0.590734,0.742718
21,Facenet512,DBSCAN+AHC,cosine,average,0.2,591,445,0,1.0,0.570463,0.72649


In [21]:
# same ranking, restricted to rows using complete linkage
df_summary.loc[df_summary["linkage"].eq("complete")].sort_values(["precision", "f1"], ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,linkage,threshold,total_tp,total_fn,total_fp,precision,recall,f1
11,Facenet512,DBSCAN+AHC,cosine,complete,0.26,581,455,0,1.0,0.560811,0.718615
10,Facenet512,DBSCAN+AHC,cosine,complete,0.25,571,465,0,1.0,0.551158,0.710641
9,Facenet512,DBSCAN+AHC,cosine,complete,0.24,562,474,0,1.0,0.542471,0.703379
8,Facenet512,DBSCAN+AHC,cosine,complete,0.23,551,485,0,1.0,0.531853,0.694392
7,Facenet512,DBSCAN+AHC,cosine,complete,0.22,539,497,0,1.0,0.52027,0.684444


In [22]:
# LaTeX table of the five best configurations (precision, then F1)
print(
    df_summary
    .sort_values(["precision", "f1"], ascending=False)
    .head(5)
    .to_latex(
        columns=["linkage", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{lrrrrrrr}
\toprule
linkage &  threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
average &     0.2400 &       677 &       359 &         0 &     1.0000 &  0.6535 & 0.7904 \\
average &     0.2300 &       659 &       377 &         0 &     1.0000 &  0.6361 & 0.7776 \\
average &     0.2200 &       629 &       407 &         0 &     1.0000 &  0.6071 & 0.7556 \\
average &     0.2100 &       612 &       424 &         0 &     1.0000 &  0.5907 & 0.7427 \\
average &     0.2000 &       591 &       445 &         0 &     1.0000 &  0.5705 & 0.7265 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["precision", "f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["linkage", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [23]:
# record the best-precision DBSCAN + AHC configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "3 (pre)",
        "total_tp": 677,
        "total_fn": 359,
        "total_fp": 0,
        "precision": 1.0,
        "recall": 0.653475,
        "f1": 0.790426,
    }]),
])

In [24]:
# record the best-F1 DBSCAN + AHC configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "3 (f1)",
        "total_tp": 920,
        "total_fn": 116,
        "total_fp": 13,
        "precision": 0.986066,
        "recall": 0.888031,
        "f1": 0.934485,
    }]),
])

In [25]:
# five best configurations, ranked by F1
df_summary.sort_values("f1", ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,linkage,threshold,total_tp,total_fn,total_fp,precision,recall,f1
4,Facenet512,DBSCAN+AHC,cosine,average,0.35,920,116,13,0.986066,0.888031,0.934485
3,Facenet512,DBSCAN+AHC,cosine,average,0.34,910,126,10,0.98913,0.878378,0.93047
2,Facenet512,DBSCAN+AHC,cosine,average,0.33,900,136,9,0.990099,0.868726,0.92545
1,Facenet512,DBSCAN+AHC,cosine,average,0.32,884,152,6,0.993258,0.853282,0.917965
0,Facenet512,DBSCAN+AHC,cosine,average,0.31,871,165,6,0.993158,0.840734,0.910612


# Strategy 1: DBSCAN overlap

In [26]:
# load the DBSCANx2 summary and display it
df_summary = pd.read_csv(
    filepath_or_buffer="/media/bao/t7/la_lib_dataset/results_dbscan/dbscanx2/summary_DBSCANx2.csv"
)

df_summary

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
0,Facenet512 + Facenet128,DBSCANx2,cosine,"[5, 2]","[0.24, 0.22]",799,237,0,1.0,0.771236,0.870845


In [27]:
# LaTeX table of (at most) the five best rows by F1
print(
    df_summary
    .sort_values("f1", ascending=False)
    .head(5)
    .to_latex(
        columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{llrrrrrr}
\toprule
min\_samples &    threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
     [5, 2] & [0.24, 0.22] &       799 &       237 &         0 &     1.0000 &  0.7712 & 0.8708 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [28]:
# record the strategy 1 result
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "1",
        "total_tp": 799,
        "total_fn": 237,
        "total_fp": 0,
        "precision": 1.0,
        "recall": 0.771236,
        "f1": 0.870845,
    }]),
])

# Strategy 2: DBSCAN + threshold

In [29]:
# load the DBSCANx2 + threshold summary and display it
df_summary = pd.read_csv(
    filepath_or_buffer="/media/bao/t7/la_lib_dataset/results_dbscan/dbscanx2/summary_DBSCANx2_threshold.csv"
)

df_summary

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
0,Facenet512 + Facenet128,DBSCAN + d_th 0.30,cosine,"[5, 2]","[0.24, 0.22]",888,148,5,0.994401,0.857143,0.920684
1,Facenet512 + Facenet128,DBSCAN + d_th 0.20,cosine,"[5, 2]","[0.24, 0.22]",888,148,5,0.994401,0.857143,0.920684
2,Facenet512 + Facenet128,DBSCAN + d_th 0.15,cosine,"[5, 2]","[0.24, 0.22]",888,148,5,0.994401,0.857143,0.920684


In [30]:
# LaTeX table of the single best row by F1
print(
    df_summary
    .sort_values("f1", ascending=False)
    .head(1)
    .to_latex(
        columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{llrrrrrr}
\toprule
min\_samples &    threshold &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
     [5, 2] & [0.24, 0.22] &       888 &       148 &         5 &     0.9944 &  0.8571 & 0.9207 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["f1"], ascending=False)[:1].to_latex(index=False, float_format="%.4f", columns=["min_samples", "threshold", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [31]:
# record the strategy 2 result
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "2",
        "total_tp": 888,
        "total_fn": 148,
        "total_fp": 5,
        "precision": 0.994401,
        "recall": 0.857143,
        "f1": 0.920684,
    }]),
])

## Strategy 2b (with a minimum threshold)

In [32]:
# load the DBSCANx2 + l_th summary and display it
df_summary = pd.read_csv(
    filepath_or_buffer="/media/bao/t7/la_lib_dataset/results_dbscan/dbscanx2/summary_DBSCANx2_lthreshold.csv"
)

df_summary

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
0,Facenet512 + Facenet128,DBSCAN + l_th 0.15,cosine,"[5, 2]","[0.24, 0.22]",910,126,5,0.994536,0.878378,0.932855
1,Facenet512 + Facenet128,DBSCAN + l_th 0.16,cosine,"[5, 2]","[0.24, 0.22]",912,124,6,0.993464,0.880309,0.93347
2,Facenet512 + Facenet128,DBSCAN + l_th 0.17,cosine,"[5, 2]","[0.24, 0.22]",914,122,6,0.993478,0.882239,0.93456
3,Facenet512 + Facenet128,DBSCAN + l_th 0.18,cosine,"[5, 2]","[0.24, 0.22]",917,119,8,0.991351,0.885135,0.935237
4,Facenet512 + Facenet128,DBSCAN + l_th 0.19,cosine,"[5, 2]","[0.24, 0.22]",923,113,9,0.990343,0.890927,0.938008
5,Facenet512 + Facenet128,DBSCAN + l_th 0.2,cosine,"[5, 2]","[0.24, 0.22]",925,111,11,0.988248,0.892857,0.938134
6,Facenet512 + Facenet128,DBSCAN + l_th 0.21,cosine,"[5, 2]","[0.24, 0.22]",929,107,12,0.987248,0.896718,0.939808
7,Facenet512 + Facenet128,DBSCAN + l_th 0.22,cosine,"[5, 2]","[0.24, 0.22]",938,98,16,0.983229,0.905405,0.942714
8,Facenet512 + Facenet128,DBSCAN + l_th 0.23,cosine,"[5, 2]","[0.24, 0.22]",942,94,18,0.98125,0.909266,0.943888
9,Facenet512 + Facenet128,DBSCAN + l_th 0.24,cosine,"[5, 2]","[0.24, 0.22]",943,93,19,0.980249,0.910232,0.943944


In [33]:
# five best configurations, ranked by precision and then by F1
df_summary.sort_values(["precision", "f1"], ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
0,Facenet512 + Facenet128,DBSCAN + l_th 0.15,cosine,"[5, 2]","[0.24, 0.22]",910,126,5,0.994536,0.878378,0.932855
11,Facenet512 + Facenet128,DBSCAN + l_th 0.14,cosine,"[5, 2]","[0.24, 0.22]",906,130,5,0.994512,0.874517,0.930663
12,Facenet512 + Facenet128,DBSCAN + l_th 0.13,cosine,"[5, 2]","[0.24, 0.22]",896,140,5,0.994451,0.864865,0.925142
13,Facenet512 + Facenet128,DBSCAN + l_th 0.12,cosine,"[5, 2]","[0.24, 0.22]",891,145,5,0.99442,0.860039,0.92236
14,Facenet512 + Facenet128,DBSCAN + l_th 0.11,cosine,"[5, 2]","[0.24, 0.22]",891,145,5,0.99442,0.860039,0.92236


In [34]:
# record the best-precision strategy 2v configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "2v (pre)",
        "total_tp": 910,
        "total_fn": 126,
        "total_fp": 5,
        "precision": 0.994536,
        "recall": 0.878378,
        "f1": 0.932855,
    }]),
])

In [35]:
# record the best-F1 strategy 2v configuration
best = pd.concat([
    best,
    pd.DataFrame([{
        "strategy": "2v (f1)",
        "total_tp": 946,
        "total_fn": 90,
        "total_fp": 19,
        "precision": 0.980311,
        "recall": 0.913127,
        "f1": 0.945527,
    }]),
])

In [36]:
# five best configurations, ranked by F1
df_summary.sort_values("f1", ascending=False).head(5)

Unnamed: 0,model_name,clustering_algo,distance_metric,min_samples,threshold,total_tp,total_fn,total_fp,precision,recall,f1
10,Facenet512 + Facenet128,DBSCAN + l_th 0.25,cosine,"[5, 2]","[0.24, 0.22]",946,90,19,0.980311,0.913127,0.945527
9,Facenet512 + Facenet128,DBSCAN + l_th 0.24,cosine,"[5, 2]","[0.24, 0.22]",943,93,19,0.980249,0.910232,0.943944
8,Facenet512 + Facenet128,DBSCAN + l_th 0.23,cosine,"[5, 2]","[0.24, 0.22]",942,94,18,0.98125,0.909266,0.943888
7,Facenet512 + Facenet128,DBSCAN + l_th 0.22,cosine,"[5, 2]","[0.24, 0.22]",938,98,16,0.983229,0.905405,0.942714
6,Facenet512 + Facenet128,DBSCAN + l_th 0.21,cosine,"[5, 2]","[0.24, 0.22]",929,107,12,0.987248,0.896718,0.939808


In [37]:
# LaTeX table of the five best configurations (precision, then F1)
print(
    df_summary
    .sort_values(["precision", "f1"], ascending=False)
    .head(5)
    .to_latex(
        columns=["clustering_algo", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{lrrrrrr}
\toprule
   clustering\_algo &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
DBSCAN + l\_th 0.15 &       910 &       126 &         5 &     0.9945 &  0.8784 & 0.9329 \\
DBSCAN + l\_th 0.14 &       906 &       130 &         5 &     0.9945 &  0.8745 & 0.9307 \\
DBSCAN + l\_th 0.13 &       896 &       140 &         5 &     0.9945 &  0.8649 & 0.9251 \\
DBSCAN + l\_th 0.12 &       891 &       145 &         5 &     0.9944 &  0.8600 & 0.9224 \\
DBSCAN + l\_th 0.11 &       891 &       145 &         5 &     0.9944 &  0.8600 & 0.9224 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["precision", "f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["clustering_algo", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


In [38]:
# LaTeX table of the five best configurations by F1
print(
    df_summary
    .sort_values("f1", ascending=False)
    .head(5)
    .to_latex(
        columns=["clustering_algo", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
        index=False,
        float_format="%.4f",
    )
)

\begin{tabular}{lrrrrrr}
\toprule
   clustering\_algo &  total\_tp &  total\_fn &  total\_fp &  precision &  recall &     f1 \\
\midrule
DBSCAN + l\_th 0.25 &       946 &        90 &        19 &     0.9803 &  0.9131 & 0.9455 \\
DBSCAN + l\_th 0.24 &       943 &        93 &        19 &     0.9802 &  0.9102 & 0.9439 \\
DBSCAN + l\_th 0.23 &       942 &        94 &        18 &     0.9812 &  0.9093 & 0.9439 \\
DBSCAN + l\_th 0.22 &       938 &        98 &        16 &     0.9832 &  0.9054 & 0.9427 \\
DBSCAN + l\_th 0.21 &       929 &       107 &        12 &     0.9872 &  0.8967 & 0.9398 \\
\bottomrule
\end{tabular}



  print(df_summary.sort_values(by=["f1"], ascending=False)[:5].to_latex(index=False, float_format="%.4f", columns=["clustering_algo", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"]))


# Strategy 4

In [55]:
def format_res(df: pd.DataFrame, n_splits: int, cluster_outliers: str):
    """Aggregate the per-run metrics in `df` into one flat result row.

    Args:
        df: frame with the columns 'precision', 'recall', 'f1' and 'time'.
        n_splits: passed through unchanged as the first element of the row.
        cluster_outliers: passed through unchanged as the second element.

    Returns:
        A 10-tuple: (n_splits, cluster_outliers) followed by the mean and the
        standard error of the mean (`Series.sem`) of precision, recall, f1
        and time, in that order.
    """
    aggregates = []
    for metric in ('precision', 'recall', 'f1', 'time'):
        values = df[metric]
        aggregates.extend((values.mean(), values.sem()))

    return (n_splits, cluster_outliers, *aggregates)

def get_res(n_splits, cluster_outliers):
    """Load the result CSV for one (n_splits, cluster_outliers) setting and
    return its aggregated row as produced by `format_res`."""
    res_path = f'../iterative_clustering/res/res_dm_{n_splits}_{cluster_outliers}.csv'
    return format_res(pd.read_csv(res_path), n_splits, cluster_outliers)

# One row of aggregated metrics per (n_splits, cluster_outliers) setting.
df_res = pd.DataFrame(columns=[
    'n_splits', 'cluster_outliers',
    'precision_mean', 'precision_se',
    'recall_mean', 'recall_se',
    'f1_mean', 'f1_se',
    'time_mean', 'time_se',
])

for n_splits in (2, 3, 4):
    for cluster_outliers in ('all', 'skip'):
        df_res.loc[len(df_res)] = get_res(n_splits, cluster_outliers)

# Add "mean +/- standard error" text columns: the three quality metrics are
# rounded to 4 decimals, the timing to 2.
for metric, digits in (("precision", 4), ("recall", 4), ("f1", 4), ("time", 2)):
    df_res[metric] = (
        df_res[f"{metric}_mean"].round(digits).astype(str)
        + " +/- "
        + df_res[f"{metric}_se"].round(digits).astype(str)
    )

In [57]:
# display the per-strategy rows accumulated in `best` so far
best

Unnamed: 0,strategy,total_tp,total_fn,total_fp,precision,recall,f1
0,0.512 (pre),775,261,0,1.0,0.748069,0.855881
0,0.512 (f1),955,81,32,0.967579,0.921815,0.944142
0,0.128 (pre),610,426,1,0.998363,0.588803,0.740741
0,0.128 (f1),804,232,7,0.991369,0.776062,0.870601
0,3 (pre),677,359,0,1.0,0.653475,0.790426
0,3 (f1),920,116,13,0.986066,0.888031,0.934485
0,1,799,237,0,1.0,0.771236,0.870845
0,2,888,148,5,0.994401,0.857143,0.920684
0,2v (pre),910,126,5,0.994536,0.878378,0.932855
0,2v (f1),946,90,19,0.980311,0.913127,0.945527


In [62]:
# Add strategy 4 to `best`: take the df_res row with the highest mean
# precision (ties broken by mean F1). df_res holds no TP/FN/FP counts, so
# those columns are left as NaN.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0, and
# the ranking is computed once instead of three times.
top_strategy4 = df_res.sort_values(by=["precision_mean", "f1_mean"], ascending=False).iloc[0]
best = pd.concat([
    best,
    pd.DataFrame(
        [[
            "4",
            np.nan, np.nan, np.nan,
            top_strategy4["precision_mean"],
            top_strategy4["recall_mean"],
            top_strategy4["f1_mean"],
        ]],
        columns=["strategy", "total_tp", "total_fn", "total_fp", "precision", "recall", "f1"],
    ),
])

# Summary

In [63]:
# all strategies, ranked by precision and then by F1
best.sort_values(["precision", "f1"], ascending=[False, False])

Unnamed: 0,strategy,total_tp,total_fn,total_fp,precision,recall,f1
0,1,799.0,237.0,0.0,1.0,0.771236,0.870845
0,0.512 (pre),775.0,261.0,0.0,1.0,0.748069,0.855881
0,3 (pre),677.0,359.0,0.0,1.0,0.653475,0.790426
0,0.128 (pre),610.0,426.0,1.0,0.998363,0.588803,0.740741
0,2v (pre),910.0,126.0,5.0,0.994536,0.878378,0.932855
0,2,888.0,148.0,5.0,0.994401,0.857143,0.920684
0,0.128 (f1),804.0,232.0,7.0,0.991369,0.776062,0.870601
0,3 (f1),920.0,116.0,13.0,0.986066,0.888031,0.934485
0,2v (f1),946.0,90.0,19.0,0.980311,0.913127,0.945527
0,4,,,,0.978697,0.753861,0.85169


In [68]:
# all strategies, ranked by F1
best.sort_values("f1", ascending=False)

Unnamed: 0,strategy,total_tp,total_fn,total_fp,precision,recall,f1
0,2v (f1),946.0,90.0,19.0,0.980311,0.913127,0.945527
0,0.512 (f1),955.0,81.0,32.0,0.967579,0.921815,0.944142
0,3 (f1),920.0,116.0,13.0,0.986066,0.888031,0.934485
0,2v (pre),910.0,126.0,5.0,0.994536,0.878378,0.932855
0,2,888.0,148.0,5.0,0.994401,0.857143,0.920684
0,1,799.0,237.0,0.0,1.0,0.771236,0.870845
0,0.128 (f1),804.0,232.0,7.0,0.991369,0.776062,0.870601
0,0.512 (pre),775.0,261.0,0.0,1.0,0.748069,0.855881
0,4,,,,0.978697,0.753861,0.85169
0,3 (pre),677.0,359.0,0.0,1.0,0.653475,0.790426


In [66]:
# LaTeX table of every strategy, ordered by strategy label
print(
    best
    .sort_values("strategy")
    .to_latex(
        columns=['strategy', 'precision', 'recall', 'f1'],
        index=False,
    )
)

\begin{tabular}{lrrr}
\toprule
   strategy &  precision &   recall &       f1 \\
\midrule
 0.128 (f1) &   0.991369 & 0.776062 & 0.870601 \\
0.128 (pre) &   0.998363 & 0.588803 & 0.740741 \\
 0.512 (f1) &   0.967579 & 0.921815 & 0.944142 \\
0.512 (pre) &   1.000000 & 0.748069 & 0.855881 \\
          1 &   1.000000 & 0.771236 & 0.870845 \\
          2 &   0.994401 & 0.857143 & 0.920684 \\
    2v (f1) &   0.980311 & 0.913127 & 0.945527 \\
   2v (pre) &   0.994536 & 0.878378 & 0.932855 \\
     3 (f1) &   0.986066 & 0.888031 & 0.934485 \\
    3 (pre) &   1.000000 & 0.653475 & 0.790426 \\
          4 &   0.978697 & 0.753861 & 0.851690 \\
\bottomrule
\end{tabular}



  print(best.sort_values(by=["strategy"], ascending=True).to_latex(index=False, columns=['strategy', 'precision', 'recall', 'f1']))


# Figures

In [49]:
# df_stats["n_images"].sum()

In [50]:
# df_stats["cluster_ref_id"].nunique()

In [51]:
# df_stats.sort_values("fp", ascending=False)[0:10]

In [52]:
# n = 54

# for idx, row in df_stats.sort_values("fp", ascending=False)[n:n+1].iterrows():
#     cluster_ref_id = row["cluster_ref_id"]
#     dominant_cluster = row["dominant_cluster"]
#     fp = row["fp"]
#     print(cluster_ref_id, dominant_cluster, fp)
#     with open(f"../reference_clusters/cluster_{cluster_ref_id}.json") as f:
#         ref_cluster = json.load(f)

#     id_to_mark = [faceId_to_ogId(x) for x in ref_cluster]

#     _ , _ = show_cluster(df=df, cluster_id=dominant_cluster, faces_folder=faces_folder, originals_folder=src_folder, limit=50, ncol=5, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = None, marked = id_to_mark)

#     ids, _ = show_cluster(df=df, cluster_id=dominant_cluster, faces_folder=faces_folder, originals_folder=src_folder, limit=50, ncol=5, show_original=False, plot=True, save_folder = None, hide_axis=False, title_col = None, marked = id_to_mark)

In [53]:
# add ids[10] to the reference cluster
# ref_cluster.append(ids[4])

# # save the new reference cluster
# with open(f"../reference_clusters/cluster_{cluster_ref_id}.json", "w") as f:
#     json.dump(ref_cluster, f)

In [54]:
# fig, ax = plt.subplots(figsize=(15, 5))
# df_stats.sort_values(by="n_images", ascending=False).plot(x="cluster_ref_id", y="n_images", kind="bar", ax=ax)