# FRGC Clustering with RetinaFace + ArcFace

## 0. Setup

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import cv2
from insightface.app import FaceAnalysis
from tqdm import tqdm
import pandas as pd
from joblib import Parallel, delayed

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' #use both GPU

## 1. Load FRGC dataset

In [4]:
FRGC_root = '/d1/jferna27/FRGC-nd1/all'
csv_path = '../frgc cluster/frgc_face_embeddings.csv'

if os.path.exists(csv_path): # CSV found, load from it
    print(f"Found {csv_path}, person ids loaded from csv")
    df = pd.read_csv(csv_path) #df.columns: 512d, label,image_file,, person_id
    image_files = df['image_file'].tolist()
    print(f"Total images: {len(image_files)}")
    person_ids = [p[:5] for p in image_files] #person id (1st 5 characters)
    unique_persons = sorted(list(set(person_ids)))
    print(f"Identities: {len(unique_persons)}")
    labels = df['label'].values

else: # CSV not found, use FRGC_root directory scan
    image_files = [f for f in os.listdir(FRGC_root) if f.endswith('.JPG') or f.endswith('.jpg')]
    print(f"Total images: {len(image_files)}")
    person_ids = [img[:5] for img in image_files] #person id (1st 5 characters)
    unique_persons = sorted(list(set(person_ids)))
    print(f"Identities: {len(unique_persons)}")
    person_to_label = {person: idx for idx, person in enumerate(unique_persons)} #mapping from person_id to label index
    labels = np.array([person_to_label[pid] for pid in person_ids])

Found ../frgc cluster/frgc_face_embeddings.csv, person ids loaded from csv
Total images: 39327
Identities: 568


## 2. Initialize Face Detection + Embeddings

## 3. Extract embeddings (parallel processing on both GPUs)

In [5]:
if os.path.exists(csv_path): #use csv if exists
    print("Found frgc_face_embeddings.csv")
    df = pd.read_csv(csv_path)
    drop_cols = ['label', 'image_file', 'person_id']
    emb_cols = [c for c in df.columns if c not in drop_cols]
    embeddings = df[emb_cols].values
    valid_labels = df['label'].values
    valid_image_files = df['image_file'].tolist()

else: #csv does not exist, process images to extract embeddings
    print(f"Processing with GPU 0 and 1")
    def process_batch_worker(gpu_id, batch_data): #function of detect and embed, but for a specific GPU
        app = FaceAnalysis(name="buffalo_l") #arcface
        app.prepare(ctx_id=gpu_id, det_size=(160, 160)) #160,160 the same as LFW, then it could be 620,620 because FRGC has high-res images
        embs, lbls, files = [], [], []
        for img_file, lbl in tqdm(batch_data, desc=f"GPU {gpu_id}"):
            #detect and extract emb
            img = cv2.imread(os.path.join(FRGC_root, img_file))
            if img is not None:
                faces = app.get(img)
                if len(faces) > 0:
                    embs.append(faces[0].embedding)
                    lbls.append(lbl)
                    files.append(img_file)
        return {'embs': embs, 'lbls': lbls, 'files': files}
    mid = len(image_files) // 2 #split data for 2 GPUs and process in parallel
    results = Parallel(n_jobs=2, backend='loky')(
        delayed(process_batch_worker)(gpu_id, list(zip(image_files[start:end], labels[start:end])))
        for gpu_id, (start, end) in [(0, (0, mid)), (1, (mid, len(image_files)))]
    )
    embeddings = np.vstack([np.array(r['embs']) for r in results])
    valid_labels = np.concatenate([np.array(r['lbls']) for r in results])
    valid_image_files = sum([r['files'] for r in results], [])

print(f"Embeddings: {embeddings.shape}")
print(f"Valid images: {len(valid_image_files)}")
print(f"Unique identities: {len(np.unique(valid_labels))}")

Found frgc_face_embeddings.csv
Embeddings: (39327, 512)
Valid images: 39327
Unique identities: 568


In [6]:
if os.path.exists(csv_path):
    print(f"{csv_path} already exists.")
else: #save csv
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df['label'] = valid_labels
    embeddings_df['image_file'] = valid_image_files
    embeddings_df['person_id'] = [img[:5] for img in valid_image_files]
    embeddings_df.to_csv(csv_path, index=False)
    print(f"Embeddings and labels saved to '{csv_path}'")

../frgc cluster/frgc_face_embeddings.csv already exists.


## 4. Cluster by identity

In [7]:
label_values = np.unique(valid_labels)
counts = {lbl: int((valid_labels == lbl).sum()) for lbl in label_values}

#filter identities >=100 images
min_images_threshold = 100
filtered_labels = [lbl for lbl, count in counts.items() if count >= min_images_threshold]
print(f"Total identities before filter: {len(label_values)}")
print(f"Identities with >= {min_images_threshold} images: {len(filtered_labels)}")
counts_filtered = {lbl: counts[lbl] for lbl in filtered_labels}
min_count = min(counts_filtered.values())
print(f"Per-label counts (examples): {list(counts_filtered.items())[:5]} ...")
print(f"Minimum images per identity: {min_count}")

rng = np.random.RandomState(42)
identity_clusters = {}
balanced_indices = []
for lbl in filtered_labels:
    inds = np.where(valid_labels == lbl)[0]
    if len(inds) > min_count:
        sel = rng.choice(inds, size=min_count, replace=False)
    else:
        sel = inds
    sel = np.sort(sel)  # keep deterministic ordering for inspectability
    balanced_indices.extend(list(sel))
    identity_clusters[lbl] = embeddings[sel]

# Balanced arrays (optional -- downstream code can use these if desired)
balanced_indices = np.array(balanced_indices)
balanced_embeddings = embeddings[balanced_indices]
balanced_labels = valid_labels[balanced_indices]
balanced_image_files = [valid_image_files[i] for i in balanced_indices]
print(f"Balanced embeddings shape: {balanced_embeddings.shape}")

Total identities before filter: 568
Identities with >= 100 images: 142
Per-label counts (examples): [(0, 218), (2, 136), (3, 220), (4, 202), (8, 104)] ...
Minimum images per identity: 100
Balanced embeddings shape: (14200, 512)


In [8]:
# If balanced arrays were created above, replace original variables so downstream cells use balanced data
if 'balanced_embeddings' in globals():
    print('Using balanced data for downstream analysis (same #imgs per identity)')
    embeddings = balanced_embeddings
    valid_labels = balanced_labels
    valid_image_files = balanced_image_files
    # build mapping from numeric label -> person_id using valid_image_files
    label_values = np.unique(valid_labels)
    label_to_person = {}
    for lbl in label_values:
        idxs = np.where(valid_labels == lbl)[0]
        if len(idxs) > 0:
            label_to_person[lbl] = valid_image_files[idxs[0]][:5]
        else:
            label_to_person[lbl] = str(lbl)
    max_label = int(label_values.max())
    label_names = np.array([label_to_person.get(i, str(i)) for i in range(max_label + 1)])
    unique_persons = sorted(list(set(label_names.tolist())))
    print(f"Balanced embeddings: {embeddings.shape}, labels: {valid_labels.shape}")
else:
    print('Balanced data not found â€” using original embeddings')

Using balanced data for downstream analysis (same #imgs per identity)
Balanced embeddings: (14200, 512), labels: (14200,)


In [9]:
label_names = np.array([unique_persons[i] for i in range(len(unique_persons))]) #person ids

cluster_stats = {}
for lbl, cluster in identity_clusters.items():
    centroid = cluster.mean(axis=0)
    distances = np.linalg.norm(cluster - centroid, axis=1)
    cluster_stats[label_names[lbl]] = {
        "centroid": centroid,
        "radius": distances.max(),
        "dispersion": distances.std(),
        "num_images": len(cluster)
    }

In [10]:
print(cluster_stats)

{'02463': {'centroid': array([ 1.37978860e+00, -5.95679604e-01,  8.09980945e-01, -5.66886527e-01,
        7.04767323e-01,  6.46459469e-01,  8.37756903e-03,  7.58724426e-01,
        1.28491995e+00,  6.90437371e-01, -4.21765037e-01,  4.33929647e-01,
       -9.17646021e-01, -1.44498833e-01,  2.59218292e-01,  1.09480198e-02,
        5.44697221e-02, -2.50309256e-02, -2.42074807e-01,  6.14640432e-01,
       -4.81918144e-02, -1.49136385e+00,  1.47779141e+00, -2.06664315e-01,
        3.59670578e-01, -4.59875649e-01,  8.95212871e-01,  1.07054381e+00,
       -1.49649040e+00, -1.19731304e+00, -4.89928899e-01, -2.62395911e-02,
        1.08136169e-01,  6.11988417e-01, -9.80609462e-02, -9.15977119e-01,
       -1.78193648e+00, -2.07809257e-01,  5.37498402e-01,  1.51064684e+00,
       -1.19109142e+00,  1.06034666e+00,  8.03653983e-03, -8.49178867e-01,
        2.43856784e-01,  2.33877214e-01, -1.28907220e-01,  7.23171901e-01,
        6.84966381e-01, -1.40910680e+00, -6.79139522e-01, -3.89318694e-01,
  

## 5. .