In [1]:
import pathlib
import pandas as pd
import json
import cv2
import numpy as np
from tqdm import tqdm
from time import perf_counter
from sklearn.model_selection import KFold 
from ast import literal_eval

import sys
sys.path.append("../")

from face_clustering_pipeline import FaceClusteringPipeline
from iterative_clustering.iterative_clustering import split_clustering, merge_clustering
from helper.compare_w_reference import compare_w_ref
from helper.display_cluster import faceId_to_ogId

* Sort the full dataset by `assetCreated` and divide it into `n` subsets
* Cluster each subset using DBSCAN (with the parameters that gave the best F1 score)
* Compute the average embedding for each predicted cluster
* Merge the clusters from each subset into a single set of clusters (using the
  average embedding); we should try both DBSCAN and AHC (average linkage) for this step
* Compute the performance metrics (precision, recall, F1)

In [2]:
# Root directory of the dataset; every other path below is derived from it.
base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

# Locations handed to the clustering pipeline and the evaluation helper.
src_folder = base_path.joinpath("img")
faces_folder = base_path.joinpath("faces")
save_folder = base_path.joinpath("results_iterative_clustering")

# Output locations for this divide-and-merge experiment.
df_folder = save_folder.joinpath("df_divide_merge")
log_folder = save_folder.joinpath("log_divide_merge")

# Create both output folders (and any missing parents); a no-op when they already exist.
for out_dir in (df_folder, log_folder):
    out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Single pipeline instance, later passed to split_clustering / merge_clustering.
faceClusteringPipeline = FaceClusteringPipeline(
    src_folder,
    faces_folder,
    df_folder,
    log_folder,
)

In [4]:
# Load the pre-computed face embeddings.
model_name = "Facenet512"
embedding_col = f"{model_name}_representation"

# Only the image path and the embedding column are read; the embedding is
# stored as JSON text in the CSV and decoded with json.loads.
# NOTE(review): the file name hard-codes "Facenet512" while the column names
# follow model_name -- changing model_name alone will not switch files.
df = pd.read_csv(
    base_path / 'results_dbscan/df/keep_representation_Facenet512.csv',
    usecols=["image", embedding_col],
    converters={embedding_col: json.loads},
)

# face_id is the image file name without its extension; id is derived from it
# via faceId_to_ogId and is used later as the join key with the metadata.
df["face_id"] = df["image"].apply(lambda img: pathlib.Path(img).stem)
df["id"] = df["face_id"].apply(faceId_to_ogId)

In [5]:
# Loading the metadata
#
# The metadata is stored in four CSV shards (df1.csv .. df4.csv) under
# <base_path>/df_w_metadata. The "metadata" column of each shard is parsed
# with literal_eval into a Python object (used below as a dict).
metadata_folder = base_path / "df_w_metadata"

# Read every shard first and concatenate once, instead of growing a DataFrame
# inside the loop (which re-copies all previously loaded rows on every pass).
df_metadata = pd.concat(
    [
        pd.read_csv(metadata_folder / f"df{idx}.csv", converters={"metadata": literal_eval})
        for idx in range(1, 5)
    ],
    ignore_index=True,
)

# Extract metadata["assetCreated"]["value"] for every row.
df_metadata['assetCreated'] = df_metadata['metadata'].apply(lambda x: x.get('assetCreated').get('value'))

# left merge df and df_metadata: every face row is kept and gains the
# assetCreated value of the entry sharing its "id".
# NOTE(review): if "id" is not unique across the shards this merge duplicates
# face rows -- confirm uniqueness.
df = df.merge(df_metadata[['id', 'assetCreated']], on="id", how="left")

In [6]:
# Sanity check: list the distinct "is missing" flags of assetCreated after the
# left merge (only False means every face row received a value).
df["assetCreated"].isna().unique()

array([False])

In [8]:
# sort df by assetCreated, so that every split below is a contiguous block of
# rows in assetCreated order
df = df.sort_values(by="assetCreated")

# Loop-invariant locations, resolved once.
reference_clusters_path = pathlib.Path("../reference_clusters")
res_folder = pathlib.Path("res")
# to_csv does not create missing directories, so make sure the target exists.
res_folder.mkdir(exist_ok=True, parents=True)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Benchmark the divide-and-merge clustering for 2..10 splits, in two modes:
#   "skip": keep the result of split_clustering as is;
#   "all" : additionally re-cluster the rows labelled -1 and merge them back.
# One single-row CSV (precision, recall, f1, time) is written per
# (n_splits, mode) combination.
for n_splits in tqdm(range(2, 11)):
    for cluster_outlier in ["all", "skip"]:
        # Partition the rows into n_splits nearly equal, order-preserving
        # chunks. The split points are computed on the row positions and
        # applied with iloc; calling np.array_split directly on a DataFrame
        # goes through the deprecated DataFrame.swapaxes.
        df_splits = [
            df.iloc[rows] for rows in np.array_split(np.arange(len(df)), n_splits)
        ]

        # t_res is summed into the total time (presumably one timing per
        # split -- confirm against split_clustering).
        df_res, t_res = split_clustering(
            df_splits,
            faceClusteringPipeline=faceClusteringPipeline,
            df_folder=df_folder,
            model_name=model_name,
        )
        total_t = sum(t_res)

        if cluster_outlier == "all":
            # Second pass over the rows labelled -1 (presumably DBSCAN's noise
            # label -- confirm): they are clustered again as a single split and
            # merged with the already clustered rows. The wall-clock time of
            # this extra step is added to the total.
            t0 = perf_counter()

            df_inliers = df_res[df_res["cluster_label"] != -1]
            df_outliers = df_res[df_res["cluster_label"] == -1][["image", f"{model_name}_representation"]]

            df_outliers_res, _ = split_clustering(
                [df_outliers],
                faceClusteringPipeline=faceClusteringPipeline,
                df_folder=df_folder,
                model_name=model_name,
            )
            clustering_res = [df_inliers, df_outliers_res]
            df_res = merge_clustering(
                clustering_res,
                faceClusteringPipeline=faceClusteringPipeline,
                df_folder=df_folder,
                model_name=model_name,
            )
            t1 = perf_counter()

            total_t += t1 - t0

        # face_id (file name without extension) is recomputed on the result
        # before it is handed to compare_w_ref.
        df_res["face_id"] = df_res["image"].apply(pathlib.Path).apply(lambda x: x.stem)

        total_tp, total_fn, total_fp, df_stats = compare_w_ref(
            reference_clusters_path,
            df_res,
            faces_folder=faces_folder,
            src_folder=src_folder,
        )

        # Metrics from the aggregated counts; a zero denominator yields 0.0
        # instead of aborting the whole sweep.
        total_precision = _safe_ratio(total_tp, total_tp + total_fp)
        total_recall = _safe_ratio(total_tp, total_tp + total_fn)
        total_f1 = _safe_ratio(
            2 * (total_precision * total_recall),
            total_precision + total_recall,
        )

        # One row per run; same columns as before.
        df_results = pd.DataFrame({
            "precision": [total_precision],
            "recall": [total_recall],
            "f1": [total_f1],
            "time": [total_t],
        })

        df_results.to_csv(res_folder / f"res_dmdate_{n_splits}_{cluster_outlier}.csv", index=False)

100%|██████████| 9/9 [08:56<00:00, 59.63s/it]
