In [1]:
import pathlib
import pandas as pd
import json
import cv2
import numpy as np
from tqdm import tqdm
from time import perf_counter
from sklearn.model_selection import KFold 

import sys
sys.path.append("../")

from face_clustering_pipeline import FaceClusteringPipeline
from iterative_clustering.iterative_clustering import split_clustering, merge_clustering
from helper.compare_w_reference import compare_w_ref
from helper.display_cluster import faceId_to_ogId

* Divide the full dataset into `n` subsets
* Cluster each subset using DBSCAN (with the parameters that gave the best F1 score)
* Compute the average embedding of each predicted cluster
* Merge the clusters from all subsets into a single set of clusters (using the
  average embeddings) => should try using either DBSCAN or AHC (average linkage)
* Compute the performance metrics

In [2]:
# Root folder of the dataset; every other path in this notebook is derived from it.
base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

# Source-image folder, faces folder, and the output root for this experiment.
src_folder, faces_folder, save_folder = (
    base_path / name for name in ("img", "faces", "results_iterative_clustering")
)

# Output sub-folders for the divide-and-merge run: result dataframes and logs.
df_folder, log_folder = save_folder / "df_divide_merge", save_folder / "log_divide_merge"

# Create both output folders (with any missing parents); no-op when they already exist.
for out_dir in (df_folder, log_folder):
    out_dir.mkdir(exist_ok=True, parents=True)

In [3]:
# Build the pipeline object that is passed to split_clustering / merge_clustering below.
# It receives the source-image folder, the faces folder, and this experiment's
# dataframe and log output folders, in that positional order.
# NOTE(review): FaceClusteringPipeline is a project class not shown here — confirm the
# expected argument order against its constructor.
faceClusteringPipeline = FaceClusteringPipeline(src_folder, faces_folder, df_folder, log_folder)

In [4]:
# Name of the embedding model. It selects both the CSV file and the embedding
# column read below, and is forwarded to the clustering helpers later on.
model_name = "Facenet512"

# Column holding the embedding of each face; named once so the read options agree.
embedding_col = f"{model_name}_representation"

# Load the embeddings. The file name is derived from `model_name` (it used to be
# hard-coded to "Facenet512"), so switching models cannot silently pair the new
# column name with the old file.
# NOTE(review): the dataset root is still an absolute local path — consider deriving
# it from the notebook's path configuration.
df = pd.read_csv(
    f"/media/bao/t7/la_lib_dataset/160k/df/keep_representation_{model_name}.csv",
    usecols=["image", embedding_col],
    # The embedding column is stored as text; json.loads decodes each cell.
    converters={embedding_col: json.loads},
)

# Shuffle the rows once with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
# Timing benchmark for divide-and-merge clustering.
# For every number of splits (2..10) and every dataset size (10k steps, capped at the
# number of available faces) the sample is split, clustered per split, and — when
# `cluster_outlier == "all"` — the rows labelled -1 are re-clustered and merged back.
# One CSV per split count is written to time_measurements/dm_<n_splits>.csv with the
# columns "n" (number of faces) and "time" (seconds).

# Outlier handling mode; loop-invariant, so it is set once.
cluster_outlier = "all"

# Create the output folder up front: a missing folder would otherwise only surface
# as a to_csv error after a full (very long) sweep has already been computed.
timing_dir = pathlib.Path("time_measurements")
timing_dir.mkdir(parents=True, exist_ok=True)

for n_splits in tqdm(range(2, 11)):
    times = []
    ns = []

    for n in range(10_000, 160_001, 10_000):
        # Cap the requested size at the number of faces actually available.
        n = min(n, len(df))

        print(f"Number of faces: {n} with split {n_splits}: ", end=" ")
        df_input = df.sample(n=n, random_state=42)

        # Split by row position instead of calling np.array_split on the DataFrame
        # itself, which relies on the deprecated DataFrame.swapaxes. Chunk sizes and
        # row order are the same as np.array_split would produce.
        index_chunks = np.array_split(np.arange(len(df_input)), n_splits)
        df_splits = [df_input.iloc[chunk] for chunk in index_chunks]

        # t_res is summed as the time spent in this stage.
        # NOTE(review): presumably one duration in seconds per split — confirm in split_clustering.
        df_res, t_res = split_clustering(df_splits, faceClusteringPipeline=faceClusteringPipeline, df_folder=df_folder, model_name=model_name)
        total_t = sum(t_res)

        if cluster_outlier == "all":
            t0 = perf_counter()

            # Rows labelled -1 are treated as outliers: strip their labels and
            # cluster them again on their own, as a single split.
            df_inliers = df_res[df_res["cluster_label"] != -1]
            df_outliers = df_res[df_res["cluster_label"] == -1][["image", f"{model_name}_representation"]]

            df_outliers_res, _ = split_clustering([df_outliers], faceClusteringPipeline=faceClusteringPipeline, df_folder=df_folder, model_name=model_name)
            clustering_res = [df_inliers, df_outliers_res]
            df_res = merge_clustering(clustering_res, faceClusteringPipeline=faceClusteringPipeline, df_folder=df_folder, model_name=model_name)
            t1 = perf_counter()

            # Wall-clock time of the outlier re-clustering and the merge.
            total_t += t1 - t0

        print(f"{total_t:.2f} seconds")

        times.append(total_t)
        ns.append(n)

        # Once the whole dataset has been measured, larger requested sizes would
        # only repeat the same measurement — stop here.
        if n == len(df):
            break

    df_results = pd.DataFrame({"n": ns, "time": times})
    df_results.to_csv(timing_dir / f"dm_{n_splits}.csv", index=False)

  0%|          | 0/9 [00:00<?, ?it/s]

Number of faces: 10000 with split 2:  2.29 seconds
Number of faces: 20000 with split 2:  7.51 seconds
Number of faces: 30000 with split 2:  15.82 seconds
Number of faces: 40000 with split 2:  27.38 seconds
Number of faces: 50000 with split 2:  42.38 seconds
Number of faces: 60000 with split 2:  60.51 seconds
Number of faces: 70000 with split 2:  81.49 seconds
Number of faces: 80000 with split 2:  105.57 seconds
Number of faces: 90000 with split 2:  130.46 seconds
Number of faces: 100000 with split 2:  159.81 seconds
Number of faces: 110000 with split 2:  193.18 seconds
Number of faces: 120000 with split 2:  224.30 seconds
Number of faces: 130000 with split 2:  261.22 seconds
Number of faces: 140000 with split 2:  302.21 seconds
Number of faces: 150000 with split 2:  342.97 seconds
Number of faces: 153019 with split 2:  

 11%|█         | 1/9 [38:40<5:09:23, 2320.45s/it]

363.06 seconds
Number of faces: 10000 with split 3:  1.76 seconds
Number of faces: 20000 with split 3:  6.61 seconds
Number of faces: 30000 with split 3:  14.21 seconds
Number of faces: 40000 with split 3:  25.55 seconds
Number of faces: 50000 with split 3:  38.91 seconds
Number of faces: 60000 with split 3:  54.81 seconds
Number of faces: 70000 with split 3:  73.71 seconds
Number of faces: 80000 with split 3:  96.27 seconds
Number of faces: 90000 with split 3:  120.77 seconds
Number of faces: 100000 with split 3:  147.84 seconds
Number of faces: 110000 with split 3:  176.01 seconds
Number of faces: 120000 with split 3:  209.56 seconds
Number of faces: 130000 with split 3:  242.96 seconds
Number of faces: 140000 with split 3:  281.12 seconds
Number of faces: 150000 with split 3:  317.59 seconds
Number of faces: 153019 with split 3:  

 22%|██▏       | 2/9 [1:14:23<4:18:32, 2216.09s/it]

335.09 seconds
Number of faces: 10000 with split 4:  1.71 seconds
Number of faces: 20000 with split 4:  6.38 seconds
Number of faces: 30000 with split 4:  13.49 seconds
Number of faces: 40000 with split 4:  23.28 seconds
Number of faces: 50000 with split 4:  37.04 seconds
Number of faces: 60000 with split 4:  52.48 seconds
Number of faces: 70000 with split 4:  70.52 seconds
Number of faces: 80000 with split 4:  91.20 seconds
Number of faces: 90000 with split 4:  114.60 seconds
Number of faces: 100000 with split 4:  141.25 seconds
Number of faces: 110000 with split 4:  171.48 seconds
Number of faces: 120000 with split 4:  199.48 seconds
Number of faces: 130000 with split 4:  233.24 seconds
Number of faces: 140000 with split 4:  272.84 seconds
Number of faces: 150000 with split 4:  320.65 seconds
Number of faces: 153019 with split 4:  

 33%|███▎      | 3/9 [1:49:10<3:35:42, 2157.02s/it]

336.81 seconds
Number of faces: 10000 with split 5:  1.67 seconds
Number of faces: 20000 with split 5:  6.44 seconds
Number of faces: 30000 with split 5:  13.95 seconds
Number of faces: 40000 with split 5:  23.53 seconds
Number of faces: 50000 with split 5:  35.92 seconds
Number of faces: 60000 with split 5:  53.87 seconds
Number of faces: 70000 with split 5:  71.58 seconds
Number of faces: 80000 with split 5:  92.72 seconds
Number of faces: 90000 with split 5:  117.41 seconds
Number of faces: 100000 with split 5:  142.91 seconds
Number of faces: 110000 with split 5:  172.58 seconds
Number of faces: 120000 with split 5:  206.36 seconds
Number of faces: 130000 with split 5:  236.05 seconds
Number of faces: 140000 with split 5:  269.42 seconds
Number of faces: 150000 with split 5:  307.74 seconds
Number of faces: 153019 with split 5:  

 44%|████▍     | 4/9 [2:23:41<2:56:56, 2123.30s/it]

319.14 seconds
Number of faces: 10000 with split 6:  1.64 seconds
Number of faces: 20000 with split 6:  6.05 seconds
Number of faces: 30000 with split 6:  13.00 seconds
Number of faces: 40000 with split 6:  22.39 seconds
Number of faces: 50000 with split 6:  34.20 seconds
Number of faces: 60000 with split 6:  48.79 seconds
Number of faces: 70000 with split 6:  67.54 seconds
Number of faces: 80000 with split 6:  88.51 seconds
Number of faces: 90000 with split 6:  110.83 seconds
Number of faces: 100000 with split 6:  136.54 seconds
Number of faces: 110000 with split 6:  171.85 seconds
Number of faces: 120000 with split 6:  201.51 seconds
Number of faces: 130000 with split 6:  240.35 seconds
Number of faces: 140000 with split 6:  271.20 seconds
Number of faces: 150000 with split 6:  315.23 seconds
Number of faces: 153019 with split 6:  

 56%|█████▌    | 5/9 [2:57:56<2:19:53, 2098.38s/it]

324.26 seconds
Number of faces: 10000 with split 7:  1.69 seconds
Number of faces: 20000 with split 7:  6.24 seconds
Number of faces: 30000 with split 7:  13.12 seconds
Number of faces: 40000 with split 7:  22.72 seconds
Number of faces: 50000 with split 7:  35.16 seconds
Number of faces: 60000 with split 7:  49.81 seconds
Number of faces: 70000 with split 7:  66.95 seconds
Number of faces: 80000 with split 7:  87.10 seconds
Number of faces: 90000 with split 7:  112.87 seconds
Number of faces: 100000 with split 7:  138.40 seconds
Number of faces: 110000 with split 7:  168.89 seconds
Number of faces: 120000 with split 7:  198.40 seconds
Number of faces: 130000 with split 7:  230.80 seconds
Number of faces: 140000 with split 7:  270.16 seconds
Number of faces: 150000 with split 7:  304.20 seconds
Number of faces: 153019 with split 7:  

 67%|██████▋   | 6/9 [3:31:36<1:43:35, 2071.86s/it]

313.58 seconds
Number of faces: 10000 with split 8:  1.62 seconds
Number of faces: 20000 with split 8:  5.97 seconds
Number of faces: 30000 with split 8:  12.66 seconds
Number of faces: 40000 with split 8:  21.98 seconds
Number of faces: 50000 with split 8:  33.52 seconds
Number of faces: 60000 with split 8:  47.91 seconds
Number of faces: 70000 with split 8:  64.36 seconds
Number of faces: 80000 with split 8:  84.29 seconds
Number of faces: 90000 with split 8:  106.50 seconds
Number of faces: 100000 with split 8:  134.75 seconds
Number of faces: 110000 with split 8:  161.55 seconds
Number of faces: 120000 with split 8:  193.24 seconds
Number of faces: 130000 with split 8:  235.63 seconds
Number of faces: 140000 with split 8:  276.84 seconds
Number of faces: 150000 with split 8:  312.92 seconds
Number of faces: 153019 with split 8:  

 78%|███████▊  | 7/9 [4:05:11<1:08:26, 2053.36s/it]

321.22 seconds
Number of faces: 10000 with split 9:  1.61 seconds
Number of faces: 20000 with split 9:  5.96 seconds
Number of faces: 30000 with split 9:  12.50 seconds
Number of faces: 40000 with split 9:  21.60 seconds
Number of faces: 50000 with split 9:  33.53 seconds
Number of faces: 60000 with split 9:  47.98 seconds
Number of faces: 70000 with split 9:  63.53 seconds
Number of faces: 80000 with split 9:  83.80 seconds
Number of faces: 90000 with split 9:  106.28 seconds
Number of faces: 100000 with split 9:  132.53 seconds
Number of faces: 110000 with split 9:  162.20 seconds
Number of faces: 120000 with split 9:  201.01 seconds
Number of faces: 130000 with split 9:  231.93 seconds
Number of faces: 140000 with split 9:  279.84 seconds
Number of faces: 150000 with split 9:  319.95 seconds
Number of faces: 153019 with split 9:  

 89%|████████▉ | 8/9 [4:39:03<34:06, 2046.62s/it]  

327.66 seconds
Number of faces: 10000 with split 10:  1.67 seconds
Number of faces: 20000 with split 10:  6.04 seconds
Number of faces: 30000 with split 10:  12.89 seconds
Number of faces: 40000 with split 10:  22.27 seconds
Number of faces: 50000 with split 10:  33.84 seconds
Number of faces: 60000 with split 10:  49.59 seconds
Number of faces: 70000 with split 10:  65.50 seconds
Number of faces: 80000 with split 10:  86.50 seconds
Number of faces: 90000 with split 10:  109.15 seconds
Number of faces: 100000 with split 10:  137.48 seconds
Number of faces: 110000 with split 10:  162.71 seconds
Number of faces: 120000 with split 10:  198.08 seconds
Number of faces: 130000 with split 10:  230.84 seconds
Number of faces: 140000 with split 10:  270.70 seconds
Number of faces: 150000 with split 10:  299.43 seconds
Number of faces: 153019 with split 10:  

100%|██████████| 9/9 [5:12:27<00:00, 2083.06s/it]

316.67 seconds





Baseline time for the "40k" dataset full clustering: 15s

* 2022-11-05 17:23:31,971 Clustering faces
* 2022-11-05 17:23:31,972   **33747** faces to cluster
* 2022-11-05 17:23:33,161 algorithm: DBSCAN
* 2022-11-05 17:23:33,161 min_samples: 2
* 2022-11-05 17:23:33,161 distance_metric: euclidean
* 2022-11-05 17:23:33,161 threshold: 23.06
* 2022-11-05 17:23:48,844 Clustering faces took **15.682254252002167** seconds
* 2022-11-05 17:23:48,846 saving under /media/bao/t7/la_lib_dataset/results_dbscan/df/cluster_Facenet512_DBSCAN_euclidean_2_23.06.csv
* 2022-11-05 17:23:49,017  Found 2 clusters

A very low precision indicates that some reference clusters were not detected;
the number of false positives is therefore high, which drives the precision down.