In [1]:
import json
import os
import pathlib
import sys
from time import perf_counter

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from tqdm import tqdm

# Make the parent directory importable so the project-local modules below resolve.
sys.path.append("../")

from iterative_clustering.iterative_clustering import split_clustering, merge_clustering
from face_clustering_pipeline import FaceClusteringPipeline
from helper.compare_w_reference import compare_w_ref

* Divide the full dataset into `n` subsets
* Cluster each subset using DBSCAN (with the parameters that gave the best F1 score)
* Compute the average embedding for each predicted cluster
* Merge the clusters from each subset into a single set of clusters (using the
  average embeddings). TODO: try both DBSCAN and AHC (average linkage) for this step
* Compute the performance metrics (precision, recall, F1)

In [2]:
# Root folder of the dataset. Set the LA_LIB_DATASET_DIR environment variable to
# run the notebook against a copy stored elsewhere; when it is unset the original
# local mount point is used, so existing runs are unaffected.
base_path = pathlib.Path(os.environ.get("LA_LIB_DATASET_DIR", "/media/bao/t7/la_lib_dataset"))

# Input folders handed to the clustering pipeline and to the reference comparison.
src_folder = base_path / "img"
faces_folder = base_path / "faces"

# Output folders for this experiment (dataframes and logs of the divide/merge runs).
save_folder = base_path / "results_iterative_clustering"
df_folder = save_folder / "df_divide_merge"
log_folder = save_folder / "log_divide_merge"

# Create the output folders up front; exist_ok keeps the cell safe to re-run.
for output_folder in (df_folder, log_folder):
    output_folder.mkdir(exist_ok=True, parents=True)

In [3]:
# Build the pipeline object from the source, faces, dataframe and log folders
# configured above. It is passed to split_clustering / merge_clustering below.
faceClusteringPipeline = FaceClusteringPipeline(src_folder, faces_folder, df_folder, log_folder)

In [4]:
# Read the pre-computed embeddings from the CSV written by an earlier DBSCAN run.
model_name = "Facenet512"
embedding_column = f"{model_name}_representation"
df = pd.read_csv(
    base_path / 'results_dbscan/df/keep_representation_Facenet512.csv',
    # Only the image path and the embedding column are needed here.
    usecols=["image", embedding_column],
    # Each cell of the embedding column is parsed with json.loads while reading.
    converters={embedding_column: json.loads},
)

In [10]:
# Evaluate the divide-and-merge clustering for several fold counts and for both
# outlier strategies, writing one CSV of per-fold metrics per configuration.

SEED = 42       # random_state is set for reproducibility of the KFold shuffles
MAX_FOLDS = 10  # at most this many folds are evaluated per configuration

# Loop-invariant paths. The results folder is created here so that to_csv does
# not fail on a missing directory after the (long) computation has finished.
results_folder = pathlib.Path("res")
results_folder.mkdir(parents=True, exist_ok=True)
reference_clusters_path = pathlib.Path("../reference_clusters")
representation_col = f"{model_name}_representation"


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


for n_splits in tqdm([10, 50, 100]):
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    for cluster_outlier in ["all", "skip"]:
        # One dict per evaluated fold: precision, recall, f1 and elapsed time.
        fold_records = []

        for fold_idx, (index_baseline, index_1by1) in enumerate(kf.split(df)):
            if fold_idx >= MAX_FOLDS:
                break

            # The two index arrays yielded by KFold become the two subsets to cluster.
            df_baseline: pd.DataFrame = df.iloc[index_baseline].copy()
            df_1by1: pd.DataFrame = df.iloc[index_1by1].copy()

            df_res, t_res = split_clustering(
                [df_baseline, df_1by1],
                faceClusteringPipeline=faceClusteringPipeline,
                df_folder=df_folder,
                model_name=model_name,
            )
            # exclude the first time (which is the original clustering
            # considered already pre-computed)
            total_t = sum(t_res[1:])

            if cluster_outlier == "all":
                t0 = perf_counter()
                # Rows labelled -1 are treated as outliers and re-clustered on their own.
                df_inliers = df_res[df_res["cluster_label"] != -1]
                df_outliers = df_res[df_res["cluster_label"] == -1][["image", representation_col]]
                # cluster the outliers without splitting
                # NOTE(review): unlike the call above, this one relies on
                # split_clustering's defaults for faceClusteringPipeline, df_folder
                # and model_name - confirm that this is intended.
                df_outliers_res, _ = split_clustering([df_outliers])
                df_res = merge_clustering(
                    [df_inliers, df_outliers_res],
                    df_folder=df_folder,
                    model_name=model_name,
                    faceClusteringPipeline=faceClusteringPipeline,
                )
                # The outlier pass is counted in the reported time.
                total_t += perf_counter() - t0

            # face_id is the image file name without its extension.
            df_res["face_id"] = df_res["image"].apply(pathlib.Path).apply(lambda x: x.stem)

            total_tp, total_fn, total_fp, df_stats = compare_w_ref(
                reference_clusters_path, df_res, faces_folder=faces_folder, src_folder=src_folder
            )

            # Guard every ratio so a degenerate fold (no positives at all) yields
            # 0.0 instead of a division-by-zero failure.
            total_precision = _safe_ratio(total_tp, total_tp + total_fp)
            total_recall = _safe_ratio(total_tp, total_tp + total_fn)
            total_f1 = _safe_ratio(2 * total_precision * total_recall, total_precision + total_recall)

            fold_records.append({
                "precision": total_precision,
                "recall": total_recall,
                "f1": total_f1,
                "time": total_t,
            })

        # Explicit columns keep the CSV header stable even if no fold was evaluated.
        df_results = pd.DataFrame(fold_records, columns=["precision", "recall", "f1", "time"])
        df_results.to_csv(results_folder / f"res_cm_{n_splits}_{cluster_outlier}.csv", index=False)

100%|██████████| 3/3 [23:13<00:00, 464.34s/it]
