In [1]:
import os
import shutil
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
from facenet_pytorch import InceptionResnetV1, MTCNN
from sklearn.cluster import DBSCAN, HDBSCAN

# Path to dataset
dataset_dir = './group_outliers5/unknown/'


# Load models: MTCNN detects and crops the face, InceptionResnetV1 embeds it.
# NOTE(review): facenet-pytorch's documented crop size for the pretrained
# models is 160 px; 170 is kept here because the clustering thresholds used
# later were tuned against these embeddings -- confirm before changing.
mtcnn = MTCNN(image_size=170, margin=0)
model = InceptionResnetV1(pretrained='vggface2').eval()

# Collect image paths. os.listdir returns entries in arbitrary order, so sort
# to keep embedding order (and therefore cluster numbering) reproducible.
image_paths = sorted(
    os.path.join(dataset_dir, f)
    for f in os.listdir(dataset_dir)
    if f.lower().endswith(('.jpg', '.png', '.jpeg'))
)

# Extract embeddings. valid_image_paths[i] is the file that produced
# embeddings[i]; images with no detected face are skipped.
embeddings = []
valid_image_paths = []

for path in tqdm(image_paths, desc="Extracting embeddings"):
    try:
        # The context manager closes the file handle; convert('RGB') makes
        # RGBA / palette / greyscale images usable by the face detector
        # instead of failing on an unexpected channel count.
        with Image.open(path) as img:
            face = mtcnn(img.convert('RGB'))
        if face is None:
            continue
        with torch.no_grad():
            emb = model(face.unsqueeze(0)).numpy().flatten()
        embeddings.append(emb)
        valid_image_paths.append(path)
    except Exception as e:
        # Best-effort: report the unreadable image and carry on with the rest.
        print(f"Error with {path}: {e}")

embeddings = np.array(embeddings)


Extracting embeddings: 100%|██████████| 829/829 [04:52<00:00,  2.84it/s]


In [17]:
# Scale every embedding to unit length (sklearn's normalize uses the L2 norm
# by default) before measuring euclidean distances between them.
from sklearn.preprocessing import normalize

embeddings_norm = normalize(embeddings)

# Settings currently in use. Earlier experiments tried eps=0.6 with
# min_samples=4 and min_samples=2, and eps=0.65 with min_samples=2
# (the latter was marked "best" at the time).
clusterer = DBSCAN(eps=0.68, min_samples=2, metric='euclidean')

labels = clusterer.fit_predict(embeddings_norm)

# Label -1 is DBSCAN's noise marker, so it does not count as a cluster.
distinct_labels = set(labels)
distinct_labels.discard(-1)
n_clusters = len(distinct_labels)
print(f"Found {n_clusters} clusters.")

Found 22 clusters.


In [18]:
# Root folder under which one sub-folder per cluster is written.
output_dir = './group_outliers5/groups'
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(output_dir, exist_ok=True)

In [19]:
from collections import defaultdict

# Bucket the image paths by the cluster label assigned to them.
cluster_map = defaultdict(list)
for path, label in zip(valid_image_paths, labels):
    cluster_map[label].append(path)

# Remove cluster folders written by a previous run of this cell. Without this,
# re-clustering with different parameters leaves old copies behind and the
# same image can end up in several person_* folders at once. Only folders
# that this cell itself generates ("person_*" and "unknown") are touched.
os.makedirs(output_dir, exist_ok=True)
for entry in os.listdir(output_dir):
    stale_folder = os.path.join(output_dir, entry)
    is_generated = entry == "unknown" or entry.startswith("person_")
    if is_generated and os.path.isdir(stale_folder):
        shutil.rmtree(stale_folder)

# Save grouped images into folders: one "person_<label>" folder per cluster,
# and "unknown" for images labelled -1.
for label, paths in cluster_map.items():
    folder_name = f"person_{label}" if label != -1 else "unknown"
    cluster_folder = os.path.join(output_dir, folder_name)
    os.makedirs(cluster_folder, exist_ok=True)

    for src_path in paths:
        filename = os.path.basename(src_path)
        dst_path = os.path.join(cluster_folder, filename)
        # copy2 leaves the source image in place and also copies file metadata.
        shutil.copy2(src_path, dst_path)