In [1]:
import pathlib
import pandas as pd
import json
from tqdm import tqdm
from sklearn.model_selection import KFold 

import sys
sys.path.append("../")

from face_clustering_pipeline import FaceClusteringPipeline
from iterative_clustering.iterative_clustering import new_and_outliers_clustering
from helper.compare_w_reference import compare_w_ref

In [2]:
# Root folder of the dataset; every other path in this cell is derived from it.
base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

# Folders handed to FaceClusteringPipeline as its source / faces arguments.
src_folder, faces_folder = base_path / "img", base_path / "faces"

# Output location for this experiment, with one sub-folder for dataframes
# and one for logs.
save_folder = base_path / "results_iterative_clustering"
df_folder, log_folder = (
    save_folder / sub_folder
    for sub_folder in ("df_new+outliers", "log_new+outliers")
)

# Create the output folders up front (no error if they already exist).
for output_dir in (df_folder, log_folder):
    output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Single pipeline instance, built from the folders configured above and
# passed to every clustering run further down in this notebook.
faceClusteringPipeline = FaceClusteringPipeline(
    src_folder,
    faces_folder,
    df_folder,
    log_folder,
)

In [4]:
# Load the pre-computed embeddings for the selected model.
# Only two columns are read: the image column and the model's representation
# column, whose cells are decoded with json.loads while reading.
model_name = "Facenet512"
representation_col = f"{model_name}_representation"

df = pd.read_csv(
    base_path / 'results_dbscan/df/keep_representation_Facenet512.csv',
    usecols=["image", representation_col],
    converters={representation_col: json.loads},
)

In [7]:
# Evaluate the "new + outliers" iterative clustering against the reference
# clusters for several KFold granularities. For each n_splits value, up to
# MAX_FOLDS folds are clustered and scored, and the per-fold precision /
# recall / F1 / time are written to one CSV under res/.

MAX_FOLDS = 10  # upper bound on the number of folds evaluated per n_splits value

# Loop-invariant: location of the reference clusters used for scoring.
reference_clusters_path = pathlib.Path("../reference_clusters")

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists *before* spending minutes on clustering.
results_folder = pathlib.Path("res")
results_folder.mkdir(parents=True, exist_ok=True)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Used so that a fold without any true positives yields metrics of 0.0
    instead of aborting the whole run on a division by zero.
    """
    return numerator / denominator if denominator else 0.0


for n_splits in tqdm([10, 50, 100]):
    # The data is shuffled to avoid bias; the fixed random_state keeps the
    # folds reproducible between runs.
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    # NOTE(review): cluster_outlier is only used in the output file name here;
    # it is not forwarded to new_and_outliers_clustering - confirm this is intended.
    for cluster_outlier in ["all"]:
        precisions = []
        recalls = []
        f1s = []
        times = []

        for idx, (index_baseline, index_1by1) in enumerate(kf.split(df)):
            # Only the first MAX_FOLDS folds are evaluated, whatever n_splits is.
            if idx >= MAX_FOLDS:
                break

            # KFold yields (train, test) positions: the larger part is the
            # baseline split, the held-out fold is the second split.
            df_baseline: pd.DataFrame = df.iloc[index_baseline].copy()
            df_1by1: pd.DataFrame = df.iloc[index_1by1].copy()
            df_splits = [df_baseline, df_1by1]

            df_res, t_res = new_and_outliers_clustering(
                df_splits,
                faceClusteringPipeline=faceClusteringPipeline,
                df_folder=df_folder,
                model_name=model_name,
            )
            # presumably the timing of the second split - TODO confirm against
            # new_and_outliers_clustering's return value
            total_t = t_res[1]

            # The face id is the image file name without its extension.
            df_res["face_id"] = df_res["image"].apply(pathlib.Path).apply(lambda x: x.stem)

            total_tp, total_fn, total_fp, df_stats = compare_w_ref(
                reference_clusters_path,
                df_res,
                faces_folder=faces_folder,
                src_folder=src_folder,
            )

            total_precision = _safe_ratio(total_tp, total_tp + total_fp)
            total_recall = _safe_ratio(total_tp, total_tp + total_fn)
            total_f1 = _safe_ratio(
                2 * (total_precision * total_recall),
                total_precision + total_recall,
            )

            precisions.append(total_precision)
            recalls.append(total_recall)
            f1s.append(total_f1)
            times.append(total_t)

        # One row per evaluated fold.
        df_results = pd.DataFrame({
            "precision": precisions,
            "recall": recalls,
            "f1": f1s,
            "time": times
        })

        df_results.to_csv(results_folder / f"res_newout_{n_splits}_{cluster_outlier}.csv", index=False)

100%|██████████| 3/3 [13:30<00:00, 270.22s/it]
