In [1]:
import pathlib
import pandas as pd
from tqdm import tqdm
import json

import sys
sys.path.append("../")

from iterative_clustering.iterative_clustering import clustering_1by1
from face_clustering_pipeline import FaceClusteringPipeline
from helper.compare_w_reference import compare_w_ref

In this strategy, we split the data into k folds (k = 10, 50 and 100 in the
experiment below). For each split, k-1 folds are used to create the "baseline"
clusters, and the samples of the remaining fold are added 1 by 1 to the baseline
clusters.

We will then cluster all the outliers. If we consistently don't create any
new clusters, this step could be skipped. 

Finally, we will compute the performance of this strategy (and compare it to our
baseline strategy).

In [2]:
# Dataset root; every other location in this cell is derived from it.
base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

# Image and face folders, handed to the pipeline and to the reference comparison.
src_folder, faces_folder = base_path / "img", base_path / "faces"

# Everything this notebook writes through the pipeline lives under save_folder.
save_folder = base_path.joinpath("results_iterative_clustering")
df_folder = save_folder.joinpath("df_1by1")
log_folder = save_folder.joinpath("log_1by1")

# Create the output folders up front; existing folders are left untouched.
for output_dir in (df_folder, log_folder):
    output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the (pre-computed) embeddings stored under the results_dbscan folder.
# The file location and the column name are both derived from base_path and
# model_name, so switching dataset root or model only requires one change.
model_name = "Facenet512"
representation_col = f"{model_name}_representation"
embeddings_csv = base_path / "results_dbscan" / "df" / f"keep_representation_{model_name}.csv"

df = pd.read_csv(
    embeddings_csv,
    # Only the image identifier and its embedding are needed downstream.
    usecols=["image", representation_col],
    # The embedding is stored as JSON text in the CSV; parse each cell with json.loads.
    converters={representation_col: json.loads},
)

In [4]:
from sklearn.model_selection import KFold 

In [5]:
# Single pipeline instance, reused by every clustering_1by1 call in the evaluation loop.
faceClusteringPipeline = FaceClusteringPipeline(
    src_folder,
    faces_folder,
    df_folder,
    log_folder,
)

In [6]:
# Location of the reference clusters given to compare_w_ref in the evaluation loop.
# The path is relative, so it is resolved against the notebook's working directory.
reference_clusters_path = pathlib.Path(
    "../reference_clusters"
)

In [9]:
# Evaluate the 1-by-1 strategy for several fold counts and outlier-handling modes.
# Each (n_splits, cluster_outlier) combination yields one CSV in RESULTS_DIR holding
# one row per evaluated fold: precision, recall, f1 and time.

# Upper bound on the folds evaluated per combination (previously an inline 10);
# presumably there to keep the 50- and 100-split runs affordable.
MAX_FOLDS = 10

# DataFrame.to_csv does not create missing parent folders, so make sure it exists.
RESULTS_DIR = pathlib.Path("res")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Precision, recall and F1 are undefined when their denominator is zero
    (e.g. a fold with no true or false positives); reporting 0.0 keeps the
    run going instead of raising or writing NaN into the summary.
    """
    return numerator / denominator if denominator else 0.0


for n_splits in tqdm([10, 50, 100]):
    # Shuffle to avoid ordering bias; the fixed integer random_state makes the
    # folds reproducible, so every cluster_outlier mode is scored on the same splits.
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    for cluster_outlier in ["all", "new", "skip"]:
        precisions = []
        recalls = []
        f1s = []
        times = []

        for idx, (index_baseline, index_1by1) in enumerate(kf.split(df)):
            if idx >= MAX_FOLDS:
                break

            # Baseline = all folds but one; the held-out fold is added sample by sample.
            df_baseline: pd.DataFrame = df.iloc[index_baseline].copy()
            df_1by1: pd.DataFrame = df.iloc[index_1by1].copy()

            # t is the time for the 1by1 clustering (comparison with the base clusters only):
            # it assumes the base clusters are already computed and does not include
            # the outlier clustering time.
            res = clustering_1by1(
                df_baseline,
                df_1by1,
                cluster_outlier=cluster_outlier,
                faceClusteringPipeline=faceClusteringPipeline,
                df_folder=df_folder,
                model_name=model_name,
            )
            df_res = res[0]
            t = res[1]

            times.append(t)

            total_tp, total_fn, total_fp, df_stats = compare_w_ref(
                reference_clusters_path,
                df_res,
                faces_folder=faces_folder,
                src_folder=src_folder,
            )

            total_precision = _safe_ratio(total_tp, total_tp + total_fp)
            precisions.append(total_precision)

            total_recall = _safe_ratio(total_tp, total_tp + total_fn)
            recalls.append(total_recall)

            # Harmonic mean of precision and recall; 0.0 when both are zero.
            total_f1 = _safe_ratio(
                2 * (total_precision * total_recall),
                total_precision + total_recall,
            )
            f1s.append(total_f1)

        summary = pd.DataFrame({"precision": precisions, "recall": recalls, "f1": f1s, "time": times})

        # save the summary
        summary.to_csv(RESULTS_DIR / f"res_1by1_{n_splits}_{cluster_outlier}.csv", index=False)

100%|██████████| 3/3 [1:11:06<00:00, 1422.30s/it]
