In [None]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm
from helper.la_lib_archive import LaLibAPI
from helper.display_cluster import show_cluster, split_ids, locate_and_plot_image, plot_overview_cluster
from deepface.commons import distance as dst
from helper.avg_embed import get_avg_embed
# from ast import literal_eval
import json

# Load cluster results

In [None]:
# Root of the dataset on the local drive; every other folder below is derived from it.
base_path = pathlib.Path("/media/bao/t7/la_lib_dataset")

src_folder = base_path.joinpath("img")
faces_folder = base_path.joinpath("faces")
df_folder = base_path.joinpath("160k", "res")

model_name = "Facenet512"

# ==================== DBSCAN ====================
clustering_algo = "DBSCAN"
min_samples = 3
threshold = 0.1925
cluster_filename = f"cluster_{model_name}_{clustering_algo}_cosine_{min_samples}_{threshold}.csv"
cluster_path = df_folder / cluster_filename

# ==================== AHC ====================
# clustering_algo = "AHC"
# linkage = "average"
# threshold = 0.3
# cluster_path = df_folder / f"cluster_{model_name}_{clustering_algo}_cosine_{linkage}_{threshold}.csv"

# ==================== CUSTOM ====================
# cluster_path = pathlib.Path("/media/bao/t7/la_lib_dataset/results_dbscan_ahc/df/cluster_Facenet512_AHC_cosine_average_0.31.csv")

# Only the image path and its assigned cluster label are needed from the result file.
df = pd.read_csv(cluster_path, usecols=["image", "cluster_label"])

# The face id is the file name of the image path without its extension.
df["face_id"] = df["image"].map(lambda image_path: pathlib.Path(image_path).stem)

# Displayed as the cell output: number of rows per cluster label.
df["cluster_label"].value_counts()

In [None]:
model_name = "Facenet512"

# NOTE: this cell reassigns df. Re-running it without first re-running the
# loading cell merges the embedding columns onto df a second time.

# Load the embeddings; the representation column is parsed from text with json.loads.
embeddings_csv = base_path / "160k" / "df" / "keep_representation_Facenet512.csv"
df_embeddings = pd.read_csv(
    embeddings_csv,
    index_col=0,
    converters={f"{model_name}_representation": json.loads},
)

# Left merge on image: every row of df is kept and gains its embedding column.
df = df.merge(
    df_embeddings[["image", "Facenet512_representation"]],
    on="image",
    how="left",
)

# Presumably one averaged embedding per cluster label (helper not shown here — TODO confirm).
df_avg_embed_512 = get_avg_embed(df, "Facenet512", "cluster_label")

# Attach the per-cluster result back onto every row of that cluster.
df = df.merge(
    df_avg_embed_512,
    left_on="cluster_label",
    right_on="cluster_label",
    how='left',
)

def calc_dst_to_mean(row):
    """
    Cosine distance between one face embedding and its cluster's average embedding.

    Args:
        row: a row of df (anything indexable by column name) providing
            "cluster_label", "Facenet512_representation" and
            "avg_Facenet512_representation".

    Returns:
        NaN for rows labelled -1 (outliers), otherwise the value returned by
        dst.findCosineDistance for the two embeddings.
    """
    # Rows labelled -1 are outliers: there is no cluster mean to compare against.
    if row["cluster_label"] == -1:
        # np.nan (lower case) is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return dst.findCosineDistance(row["avg_Facenet512_representation"], row["Facenet512_representation"])

# One distance per row, computed row-wise.
df["dst_to_mean"] = df.apply(calc_dst_to_mean, axis=1)

# Order rows from closest to farthest from their cluster mean.
df = df.sort_values("dst_to_mean")

# Plot title: face id on the first line, distance with 4 decimals on the second.
formatted_distance = df["dst_to_mean"].map(lambda distance: f"{distance:.4f}")
df["title"] = df["face_id"] + " \n " + formatted_distance

# Distribution

In [None]:
# Number of distinct cluster labels (the outlier label -1, when present, counts as one of them).
df["cluster_label"].nunique()

In [None]:
# Bar chart of rows per cluster label, y axis on a log scale.
fig, ax = plt.subplots(figsize=(25, 5))
cluster_sizes = df["cluster_label"].value_counts()
cluster_sizes.plot(kind="bar", logy=True, ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))

# Bar colours by position: the first 50 bars use the default blue, the next 2 are red.
color = ["#1F77B4"] * 50 + ["red"] * 2

# Histogram of cluster sizes: x = rows per cluster label, y = how many labels have that size.
size_histogram = df["cluster_label"].value_counts().value_counts().sort_index(ascending=True)
size_histogram.plot(kind="bar", logy=True, ax=ax, color=color)

ax.set_xlabel("Number of faces")
ax.set_ylabel("Number of identities (log scale)")

In [None]:
# Number of rows labelled -1 (outliers) and number of rows in cluster 2.
label_counts = df["cluster_label"].value_counts()
label_counts.loc[-1], label_counts.loc[2]

# Overview

In [None]:
# Overview plot of the clusters; offset/nrow/ncol are passed straight to the helper
# (presumably a 25 x 31 grid starting after the first 2 clusters — TODO confirm in helper.display_cluster).
plot_overview_cluster(df, faces_folder, offset=2, nrow=25, ncol=31)

In [None]:
# etchebest
# ['1LDNdHxXKwOBaSrUcwBLF5',
#  '8B2JIIct4sOABC1Qyc_ov4',
#  '10vEpFSAKiv8JZImfyga_p',
#  '0rP6rTXqqks9_UDLZIIrlS',
#  '6uNZ74P2qNe8T5atZ_aR9l']
id = "1LDNdHxXKwOBaSrUcwBLF5"
ids, _ = locate_and_plot_image(
    df=df,
    image_id=id,
    faces_folder=faces_folder,
    originals_folder=src_folder,
    limit=100,
    ncol=4,
    show_original=True,
    plot=True,
    save_folder=None,
    hide_axis=False,
    title_col=None,
    marked=[],
)

In [None]:
# raph
# ['7E6yi9brajIBdv-EUFajQ4',
#  '1hTWU2no4l7BjaUR2HHMGS',
#  '1kE6DJ0KqZ89E9CzI5XOFC',
#  '7mxK31KZaQ2BwfIdC7VFUz',
#  'E2VNmfrE4519blrhQrH20J']
# id = "7E6yi9brajIBdv-EUFajQ4"
# ids, _ = locate_and_plot_image(df=df, image_id=id, faces_folder=faces_folder, originals_folder=src_folder, limit=100, ncol=4, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = None, marked = [])

In [None]:
# Mathilde Gremaud
# 'C45OEyke4AHAXbg7-foqSK',
#  '7_1iQDuWKit9FVl79WgarE',
#  '0A5OSVj4aosB2W77fVoO7k',
#  '7dV3qjs54ct8j_Vd2N2bsC',
#  'AzPPBqFhq9Y9KnSTdZdwWB'
id = "0A5OSVj4aosB2W77fVoO7k"
ids, _ = locate_and_plot_image(
    df=df,
    image_id=id,
    faces_folder=faces_folder,
    originals_folder=src_folder,
    limit=100,
    ncol=5,
    show_original=True,
    plot=True,
    save_folder=None,
    hide_axis=False,
    title_col=None,
    marked=[],
)

In [None]:
# The Queen 1wBlltV8K9m8vFaKMe2wF6
id = "1wBlltV8K9m8vFaKMe2wF6"
ids, _ = locate_and_plot_image(
    df=df,
    image_id=id,
    faces_folder=faces_folder,
    originals_folder=src_folder,
    limit=100,
    ncol=10,
    show_original=True,
    plot=True,
    save_folder=None,
    hide_axis=False,
    title_col=None,
    marked=[],
)

In [None]:
id = "1Dhs2RWTK5eAjjUBHiw_m9"
# locate_and_plot_image is unpacked as a pair everywhere else in this notebook;
# unpack here too so `ids` holds the id list rather than the whole return value.
ids, _ = locate_and_plot_image(df=df, image_id=id, faces_folder=faces_folder, originals_folder=src_folder, limit=100, ncol=10, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = "title", marked = [])

In [None]:
id = "5Eegciggq6vAXmszqJFueP"
# locate_and_plot_image is unpacked as a pair everywhere else in this notebook;
# unpack here too so `ids` holds the id list rather than the whole return value.
ids, _ = locate_and_plot_image(df=df, image_id=id, faces_folder=faces_folder, originals_folder=src_folder, limit=100, ncol=5, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = "title", marked = [])

In [None]:
# The Queen 1wBlltV8K9m8vFaKMe2wF6 (same face as above, this time with the "title" column shown)
id = "1wBlltV8K9m8vFaKMe2wF6"
# locate_and_plot_image is unpacked as a pair everywhere else in this notebook;
# unpack here too so `ids` holds the id list rather than the whole return value.
ids, _ = locate_and_plot_image(df=df, image_id=id, faces_folder=faces_folder, originals_folder=src_folder, limit=100, ncol=5, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = "title", marked = [])

# Performance

In [None]:
from helper.compare_w_reference import compare_w_ref

# Folder holding the reference clusters, relative to the notebook's working directory.
reference_clusters_path = pathlib.Path("reference_clusters")

In [None]:
# Compare the clustering in df with the reference clusters. Judging by the unpacked names the helper
# returns overall TP / FN / FP counts plus a per-cluster stats frame — TODO confirm in helper.compare_w_reference.
total_tp, total_fn, total_fp, df_stats = compare_w_ref(reference_clusters_path, df, faces_folder=faces_folder, src_folder=src_folder)

In [None]:
# Precision = TP / everything predicted positive; recall = TP / everything actually positive.
predicted_positive = total_tp + total_fp
actual_positive = total_tp + total_fn
total_precision = total_tp / predicted_positive
total_recall = total_tp / actual_positive

# F1 is the harmonic mean of precision and recall.
precision_recall_sum = total_precision + total_recall
total_f1 = 2 * (total_precision * total_recall) / precision_recall_sum

print("Total precision:", total_precision)
print("Total recall:", total_recall)
print("Total f1:", total_f1)

# Specific person

* 188 Megan
* 253 Biden

In [None]:
def get_cluster_ids(df, face_id):
    """
    Return the cluster label of a face.

    Args:
        df: dataframe with "face_id" and "cluster_label" columns
        face_id: face id to look up

    Returns:
        The "cluster_label" of the first row whose "face_id" equals face_id.

    Raises:
        IndexError: if no row has this face_id.
    """
    is_requested_face = df["face_id"] == face_id
    matching_labels = df.loc[is_requested_face, "cluster_label"]
    return matching_labels.values[0]

def get_TP_FN_FP(ids: list[str], df: pd.DataFrame, clusterColName: str = "cluster_label", plot=False):
    """
    Score one reference identity (a list of face ids) against the clustering in df.

    The cluster label that holds the most of the given ids is the "dominant cluster".
    Ids found in it are true positives, ids found under any other label are false
    negatives, and ids that show_cluster returns for the dominant cluster but that
    are not in `ids` are false positives.

    Args:
        ids (list[str]): face ids of the reference identity. Repeated ids are counted once
            per occurrence.
        df (pd.DataFrame): dataframe containing the face ids and their cluster labels
        clusterColName (str): kept for backward compatibility. The lookup goes through
            get_cluster_ids, which reads the "cluster_label" column.
        plot (bool): when True, print the dominant cluster and display it with the
            false positives marked.

    Returns:
        dominant_cluster: label of the cluster holding most of the ids
        TP (int): number of true positive (how many did we get right)
        FN (int): number of false negative (how many did we miss)
        FP (int): number of false positive (faces in the dominant cluster that are not in ids)

    Raises:
        ValueError: if ids is empty.

    Note:
        Reads the notebook-level faces_folder and src_folder.
        NOTE(review): show_cluster is called with limit=50; if that caps the returned ids,
        FP is under-counted for clusters larger than 50 — confirm in helper.display_cluster.
    """
    if len(ids) == 0:
        raise ValueError("ids must contain at least one face id")

    cluster_of_id = pd.Series([get_cluster_ids(df, face_id) for face_id in ids])

    # value_counts() is sorted by descending frequency, so position 0 is the dominant cluster.
    # The Series is read directly: the column names produced by value_counts().reset_index()
    # differ between pandas 1.x and 2.x, which broke the previous rename-based lookup.
    cluster_counts = cluster_of_id.value_counts()
    dominant_cluster = cluster_counts.index[0]
    tp = int(cluster_counts.iloc[0])
    fn = int(cluster_counts.iloc[1:].sum())

    ids_dom_clust, _ = show_cluster(df=df, cluster_id=dominant_cluster, faces_folder=faces_folder, originals_folder=src_folder, limit=50, ncol=10, show_original=False, plot=False, save_folder = None, hide_axis=False, title_col = None, marked = [])

    # Faces clustered with this identity that the reference list does not contain.
    diff = list(set(ids_dom_clust) - set(ids))
    fp = len(diff)

    if plot:
        print("cluster", dominant_cluster)
        _ = show_cluster(df=df, cluster_id=dominant_cluster, faces_folder=faces_folder, originals_folder=src_folder, limit=50, ncol=10, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = None, marked = diff)

    return dominant_cluster, tp, fn, fp

In [None]:
# The Queen
ids = ['8UaM_ACpKHAB03xo4HLtAz','0T5O55a8aT29CWG_vLgdWD','58rS6m4PKRO9-7H_W66Kai','1Yed50SQ4Cd88gnL2kfWIk','5J9TDNEPa5jBJ4y2AKuxXE','4DbUxAs-qLG8gI7tISDUUm','2wrB3MShaynBKKFuMS9-6D','1ySM0jfsKFyBiaIpNEpMVs','67yu8PB3aXO8-MqjFhZDX5','BsI241yT4BJAvbec3l63AI','BSJaXCUFqs49Jl0GbUV_OX','6xsHXGHRqja8bwaufOkw3K','0v3yazdT4neAZOvif2D_8V','1wBlltV8K9m8vFaKMe2wF6','F1HZ21twaIu9vx7INUAbOd','DEh7VoZsKt18gsrBsFp7OO','8nWtpS_N4rYBMCPXM0gSXO','9hyY69Wmq5s8HBzcmbiL_P','2GwMqYpkaUp98fMjgnZRz6','2o-vhuOLqlMBzkz-gpytio','DVCMSPVK4I_8wggsHRU1xW','DVdQ8FNh4sh9CHDNAiRbNu','8MF_M3d3KmRAaVFTcoIFF2','B_2LZZFMaQc9RmFuwy9NLl','BJWQFoHtaWo8hD8RFf7FbS','AzHDkoKw4B0BReWZDk6wzW','5-2Aq0v8q_18ap5sGFGkm8','8-d9ptMh4VSAQLr7fzX4OG','0hp2cOaCK8MBUJcIe8gK5l','3tlZ9p0iaQH9tl19Q6mywh','AxXoIkMkqtXAJl-x7CMBAR','F2Rb2yV8qMb9MnJ0l4KeMy','C1GZBMIgKrq8WFHbJVyKA-','3NsKchI14qw9DaEtmCRBxo','6e1A7iCOqGF8LCOV2QoLzz']

# get_TP_FN_FP returns four values (dominant cluster first); unpacking into three raised ValueError.
dominant_cluster, tp, fn, fp = get_TP_FN_FP(ids, df, plot=True)
print(f"TP: {tp}, FN: {fn}, FP: {fp}")

In [None]:
# Charles
ids = ['1Dhs2RWTK5eAjjUBHiw_m9','DADtOLGE4g9ATIQtb4VEcc','E9znDWxcqYW8O3CsmEHfUm','67fXS5CB4Gz82ry_GfH9fn','AGSn8sbqaaw8DsvIxNIYy6','79UF_l7MqvUA-fQrjhTqpY','DjPANPL1qhPBaO142kUwQ2']

# get_TP_FN_FP returns four values (dominant cluster first); unpacking into three raised ValueError.
dominant_cluster, tp, fn, fp = get_TP_FN_FP(ids, df, plot=True)
print(f"TP: {tp}, FN: {fn}, FP: {fp}")

In [None]:
ids = ['EQpX3n-GKwu9mJdnQKcMAg','5Eegciggq6vAXmszqJFueP','C6K2L2ZRq4o9QqZmW5VZLa','2tGZUpFQ4Nn9a7GqBw9ZJy','8EIfV-zcqyo9dTL80TAL1u','4T0MgkOJKrj8BDC1sJca9G','52rLDZ9K4c9BYk16qcNLZ0','DDnefS81aatAuxcK6Edavj','F0IiobKYKa9A4NEpck4yLN']

# get_TP_FN_FP returns four values (dominant cluster first); unpacking into three raised ValueError.
dominant_cluster, tp, fn, fp = get_TP_FN_FP(ids, df, plot=True)
print(f"TP: {tp}, FN: {fn}, FP: {fp}")

In [None]:
# Putin
ids = ['ESz70tHEqTB9jzmv8yCCTB','AnEfZF9WqDWBW5KXSQARay','BQLjZfluaIL93R7lI5eum7','BEYGfNoXK0CBlfMDoXIL9w','4dC9y5vXK0K9o-2OB5PtA7','Cj-HOLo_aiM868dILXfmOP','6qbvODdNKAj880Y1JVQHbg','6qbvODdNKAj880Y1JVQHbg','AIYQyvhhaQ7ACS0lEeh0hq']

# The list above contains '6qbvODdNKAj880Y1JVQHbg' twice; drop the repeat (order preserved)
# so the same face is not counted as two true positives.
ids = list(dict.fromkeys(ids))

# get_TP_FN_FP returns four values (dominant cluster first); unpacking into three raised ValueError.
dominant_cluster, tp, fn, fp = get_TP_FN_FP(ids, df, plot=True)
print(f"TP: {tp}, FN: {fn}, FP: {fp}")

# Performance on the reference clusters

## Description of the reference set

In [None]:
# Folder with one JSON file per reference cluster (same value as assigned in the Performance section).
reference_clusters_path = pathlib.Path("reference_clusters")

In [None]:
# Per-reference-cluster statistics.
# Note: one original image may contain several persons, so faces != images.
stats_columns = ["cluster_ref_id", "n_images", "dominant_cluster", "tp", "fn", "fp", "precision", "recall", "f1"]

# Materialise the file list once (sorted for a reproducible row order) so the loop and
# the identity count below are guaranteed to talk about the same files.
reference_files = sorted(reference_clusters_path.glob("*.json"))

# Collect one record per reference cluster and build the frame once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic and left the
# numeric columns with object dtype because of the empty seed frame.
records = []
for file in reference_files:
    with open(file, 'r') as f:
        ids = json.load(f)

    # performance stats
    dominant_cluster, tp, fn, fp = get_TP_FN_FP(ids, df, plot=False)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)

    # general stats: the reference id is the part of the file name after the last underscore
    cluster_id = file.stem.split("_")[-1]
    length = len(ids)
    records.append({
        "cluster_ref_id": cluster_id,
        "n_images": length,
        "dominant_cluster": dominant_cluster,
        "tp": tp,
        "fn": fn,
        "fp": fp,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

df_stats = pd.DataFrame(records, columns=stats_columns)

print("Number of identities:", len(reference_files))
print("Number of faces:", df_stats["n_images"].sum())

In [None]:
# Sum the per-cluster counts, then derive the overall (micro-averaged) metrics from them.
total_tp, total_fn, total_fp = (df_stats[count_col].sum() for count_col in ("tp", "fn", "fp"))

total_precision = total_tp / (total_tp + total_fp)
total_recall = total_tp / (total_tp + total_fn)
precision_recall_product = total_precision * total_recall
total_f1 = 2 * precision_recall_product / (total_precision + total_recall)

print(f"Total TP: {total_tp}, Total FN: {total_fn}, Total FP: {total_fp}")
print(f"Total precision: {total_precision}, Total recall: {total_recall}, Total F1: {total_f1}")

In [None]:
# Reference clusters ordered from largest to smallest, one bar per reference id.
fig, ax = plt.subplots(figsize=(15, 5))
stats_by_size = df_stats.sort_values(by="n_images", ascending=False)
stats_by_size.plot(x="cluster_ref_id", y="n_images", kind="bar", ax=ax)

# Mark false positives

In [None]:
id = 253

# show_cluster is unpacked as a pair everywhere else in this notebook. Keeping only the first
# element ensures split_ids (and the JSON file written from `unmarked`) receives the id list,
# not the whole return value.
ids, _ = show_cluster(df=df, cluster_id=id, faces_folder=faces_folder, originals_folder=src_folder, limit=50, ncol=5, show_original=True, plot=False, save_folder = None, hide_axis=False, title_col = None, marked = [])

# Fill in the positions (within ids) of the faces that do not belong to this identity.
index_to_mark = []
marked, unmarked = split_ids(ids, index_to_mark)

# Plot the cluster again, this time with the marked faces highlighted.
ids, _ = show_cluster(df=df, cluster_id=id, faces_folder=faces_folder, originals_folder=src_folder, limit=50, ncol=5, show_original=True, plot=True, save_folder = None, hide_axis=False, title_col = None, marked = marked)

# api = LaLibAPI(df_folder="") 
# api.set_metadatas(unmarked, "otherConditions", "Queen Elizabeth II")

In [None]:
# Persist the unmarked ids of the current cluster as its reference file
# (an existing file for the same cluster id is overwritten).
import json

reference_file = f"reference_clusters/cluster_{id}.json"
with open(reference_file, "w") as f:
    json.dump(unmarked, f)

In [None]:
# read cluster_1.json
# with open(f"reference_clusters/cluster_{id}.json", "r") as f:
#     unmarked = json.load(f)

# unmarked

# Searching for the person identity

In [None]:
from ast import literal_eval

# The metadata is split over df1.csv ... df16.csv; the "metadata" column holds a Python
# literal that is parsed with literal_eval while reading.
dfs = [
    pd.read_csv(f"/media/bao/t7/la_lib_dataset/df/df{idx}.csv", converters={"metadata": literal_eval})
    for idx in range(1, 17)
]

# Stack all parts into one frame with a fresh 0..n-1 index.
df_metadata = pd.concat(dfs, ignore_index=True)

In [None]:
# Show one cluster with the original file name of each image as the plot title,
# to help identify who the person is.
cluster_id = 774
# First call only collects the ids of the cluster (plot=False).
ids, _ = show_cluster(df=df, cluster_id=cluster_id, faces_folder=faces_folder, originals_folder=src_folder, limit=100, ncol=5, show_original=True, plot=False, save_folder = None, hide_axis=False, title_col = None, marked = [])
# For every id, take the "filename" entry of the first metadata row with that id.
# Raises IndexError when an id has no row in df_metadata.
original_filenames = [df_metadata[df_metadata["id"] == id]["metadata"].values[0].get("filename") for id in ids]
df_current_cluster = df[df["cluster_label"] == cluster_id].copy()
# NOTE(review): the two assignments below assume `ids` has exactly one entry per row of
# df_current_cluster and in the same row order. If limit=100 truncates the list, the
# lengths differ and pandas raises — confirm against show_cluster.
df_current_cluster["image_id"] = ids
df_current_cluster["original_filename"] = original_filenames
# Title: original file name on the first line, distance to the cluster mean (4 decimals) on the second.
df_current_cluster["title"] = df_current_cluster["original_filename"] + "\n" + df_current_cluster["dst_to_mean"].apply(lambda x: f"{x:.4f}")
# Fill in the positions (within ids) of the faces to mark before re-running the cell.
index_to_mark = []
marked, unmarked = split_ids(ids, index_to_mark)
ids, _ = show_cluster(df=df_current_cluster, cluster_id=cluster_id, faces_folder=faces_folder, originals_folder=src_folder, limit=100, ncol=10, show_original=False, plot=True, save_folder = None, hide_axis=False, title_col = "title", marked = marked)

In [None]:
# Recompute the marked / unmarked split from the ids returned by the plotting call above.
marked, unmarked = split_ids(ids, index_to_mark)

In [None]:
# api = LaLibAPI(df_folder="") 
# api.set_metadatas(unmarked, "otherConditions", "Overney Mireille")