In [7]:
import pathlib
import pandas as pd
import json
import cv2
from time import perf_counter
import numpy as np
from tqdm import tqdm

import sys
sys.path.append("../")

from face_clustering_pipeline import FaceClusteringPipeline
from helper.compare_w_reference import compare_w_ref

# 40k baseline

The baseline used will be strategy 0.512 (f1). The goal is to see if the
iterative method has similar performance to the baseline.

In [2]:
# Root of the dataset; every other location in this cell is derived from it.
base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

# Dataset sub-folders.
src_folder = base_path / "img"
faces_folder = base_path / "faces"

# Result folders for this experiment.
save_folder = base_path / "results_iterative_clustering"
df_folder = save_folder / "df"
log_folder = save_folder / "log"

# Create the result folders (and any missing parents); existing folders are left alone.
for output_dir in (df_folder, log_folder):
    output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Pipeline object used for every clustering call in this notebook.
faceClusteringPipeline = FaceClusteringPipeline(
    src_folder,
    faces_folder,
    df_folder,
    log_folder,
)

In [6]:
# Loading the (pre-computed) embeddings
model_name = "Facenet512"

# Build the location from base_path and model_name so the file that is read
# always matches the model whose representation column is decoded below.
embeddings_path = base_path / "results_dbscan" / "df" / f"keep_representation_{model_name}.csv"

# Each cell of the representation column is parsed with json.loads while reading.
df = pd.read_csv(
    embeddings_path,
    index_col=0,
    converters={f"{model_name}_representation": json.loads},
)

In [8]:
# Seed NumPy's global RNG so the shuffle seeds drawn in the loop are the same on every run.
np.random.seed(42)

# Clustering using the best f1 parameters
min_samples = 5
eps = 0.24 # threshold

# Number of shuffled repetitions of the baseline clustering.
n_runs = 10

# Loop-invariant locations, resolved once up front.
reference_clusters_path = pathlib.Path("../reference_clusters")
results_path = pathlib.Path("res") / "res_baseline.csv"
# Create the output folder before the long loop so the final save cannot fail on a missing directory.
results_path.parent.mkdir(parents=True, exist_ok=True)

precisions = []
recalls = []
f1s = []
times = []

for _ in tqdm(range(n_runs)):
    # Shuffle the rows with a freshly drawn seed; sample() returns a new frame,
    # so df itself is left as loaded.
    random_state = np.random.randint(0, 1000)
    df_input = df.sample(frac=1, random_state=random_state)

    # Only the clustering call is timed.
    t0 = perf_counter()
    df_res = faceClusteringPipeline.p_cluster_faces(
        df_input,
        df_folder,
        model_name=model_name,
        clustering_algo="DBSCAN",
        distance_metric="cosine",
        min_samples=min_samples,
        threshold=eps,
        save=False,
    )
    total_t = perf_counter() - t0

    # This column is needed to compare with the reference clusters
    df_res["face_id"] = df_res["image"].map(lambda image_path: pathlib.Path(image_path).stem)

    total_tp, total_fn, total_fp, df_stats = compare_w_ref(reference_clusters_path, df_res, faces_folder=faces_folder, src_folder=src_folder)

    total_precision = total_tp / (total_tp + total_fp)
    total_recall = total_tp / (total_tp + total_fn)
    total_f1 = 2 * (total_precision * total_recall) / (total_precision + total_recall)

    precisions.append(total_precision)
    recalls.append(total_recall)
    f1s.append(total_f1)
    times.append(total_t)

# One row per run: scores against the reference clusters plus the clustering wall time.
df_results = pd.DataFrame({
    "precision": precisions,
    "recall": recalls,
    "f1": f1s,
    "time": times
})

df_results.to_csv(results_path, index=False)

100%|██████████| 10/10 [02:58<00:00, 17.86s/it]


In [7]:
# Re-clustering only the outliers (cluster_label == -1) should not produce any new cluster

# Rows of df_res that were labelled as outliers.
df_outliers = df_res[df_res["cluster_label"] == -1].copy()

print("All clusters in the outliers:", df_outliers["cluster_label"].unique())

# re-clustering the outliers, reusing the baseline parameters (min_samples, eps)
# so this check always runs with the same settings as the baseline clustering
df_res_outliers = faceClusteringPipeline.p_cluster_faces(
    df_outliers,
    df_folder,
    model_name=model_name,
    clustering_algo="DBSCAN",
    distance_metric="cosine",
    min_samples=min_samples,
    threshold=eps,
    save=False,
)

# There should be no new cluster
print("All clusters in the re-clustering of the outliers:", df_res_outliers["cluster_label"].unique())

All clusters in the outliers: [-1]
All clusters in the re-clustering of the outliers: [-1]


In [8]:
# Number of faces per cluster label in df_res, largest first.
cluster_labels = df_res["cluster_label"]
cluster_labels.value_counts()

-1      29094
 0       2233
 21       125
 88        41
 55        39
        ...  
 174        4
 150        4
 229        4
 252        4
 144        2
Name: cluster_label, Length: 277, dtype: int64

In [9]:
# Score the clustering held in df_res against the reference clusters.
reference_clusters_path = pathlib.Path("../reference_clusters")

total_tp, total_fn, total_fp, df_stats = compare_w_ref(
    reference_clusters_path,
    df_res,
    faces_folder=faces_folder,
    src_folder=src_folder,
)

# precision = tp / (tp + fp), recall = tp / (tp + fn)
predicted_positives = total_tp + total_fp
actual_positives = total_tp + total_fn
total_precision = total_tp / predicted_positives
total_recall = total_tp / actual_positives

# f1 is the harmonic mean of precision and recall.
f1_numerator = 2 * (total_precision * total_recall)
f1_denominator = total_precision + total_recall
total_f1 = f1_numerator / f1_denominator

for label, value in (
    ("Total precision:", total_precision),
    ("Total recall:", total_recall),
    ("Total f1:", total_f1),
):
    print(label, value)

Total precision: 0.9675785207700102
Total recall: 0.9218146718146718
Total f1: 0.944142362827484
