In [1]:
!pip install opencv-python
!pip install numpy
!pip install Pillow



In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Parameters
dataset_root_folder = 'celeb'     # Main folder containing 10 subfolders
image_size = (50, 50)             # Size each detected face crop is resized to
max_threads = 16                  # Number of worker threads in the pool

# Load Haar cascade
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# The constructor does not raise if the XML cannot be read; the classifier just stays
# empty and every later detection call fails. Fail fast with a clear message instead.
if face_cascade.empty():
    raise RuntimeError("Failed to load Haar cascade 'haarcascade_frontalface_default.xml'")

# Thread-safe shared lists (appends are guarded by `lock`)
embeddings = []
labels = []
lock = threading.Lock()

# os.walk silently yields nothing for a missing directory, which would otherwise end
# in "Saved 0 face embeddings" with no hint about the real problem.
if not os.path.isdir(dataset_root_folder):
    raise FileNotFoundError(f"Dataset folder not found: {dataset_root_folder!r}")

# Collect all image paths and their corresponding subfolder (label)
image_paths = []
for root, dirs, files in os.walk(dataset_root_folder):
    dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
    label = os.path.basename(root)  # use folder name as label
    for file in sorted(files):
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            full_path = os.path.join(root, file)
            image_paths.append((full_path, label))

# Per-thread storage: every worker thread lazily builds its own CascadeClassifier.
# detectMultiScale is not documented as safe to call concurrently on one shared
# instance, so the classifier is not shared between threads.
_thread_state = threading.local()


def _get_face_cascade():
    """Return the calling thread's CascadeClassifier, creating it on first use."""
    cascade = getattr(_thread_state, 'face_cascade', None)
    if cascade is None:
        cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        _thread_state.face_cascade = cascade
    return cascade


# Function to process one image
def process_image(path_label):
    """Detect the first face in one image and record its flattened pixels and label.

    Parameters
    ----------
    path_label : tuple
        ``(filepath, label)`` pair: path of the image file and the label to store
        alongside its embedding.

    Returns
    -------
    None
        On success the face crop (RGB, resized to ``image_size``, flattened to
        float32 and scaled by 1/255) is appended to the module-level ``embeddings``
        list and ``label`` to ``labels``, both under ``lock``. Images without a
        detected face add nothing. Any exception is caught and printed, so one bad
        file does not abort the whole run.
    """
    filepath, label = path_label
    try:
        # Context manager releases the underlying file handle deterministically.
        with Image.open(filepath) as image:
            img_np = np.array(image.convert('RGB'))

        gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
        faces = _get_face_cascade().detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
        if len(faces) == 0:
            return  # no face found in this image

        # Only the first detection is used.
        x, y, w, h = faces[0]
        face = img_np[y:y+h, x:x+w]
        face_resized = cv2.resize(face, image_size)
        face_flat = face_resized.flatten().astype(np.float32) / 255.0

        # Append both entries under one lock so embedding i always matches label i.
        with lock:
            embeddings.append(face_flat)
            labels.append(label)
    except Exception as e:
        print(f"❌ Error processing {filepath}: {e}")

# Fan the per-image work out over a thread pool and time the whole run
start_time = time.time()

with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = []
    for path_label in image_paths:
        futures.append(executor.submit(process_image, path_label))
    # No explicit wait loop: leaving the `with` block calls
    # executor.shutdown(wait=True), which blocks until every task has finished.

# Stack the collected per-face vectors and labels into NumPy arrays
embeddings_np = np.asarray(embeddings, dtype=np.float32)
labels_np = np.asarray(labels)

# Persist both arrays next to the notebook
np.save('face_embeddings.npy', embeddings_np)
np.save('face_labels.npy', labels_np)

end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"\n✅ Saved {len(embeddings_np)} face embeddings.")
print(f"⏱️ Embedding time: {elapsed_seconds:.2f} seconds")


# Embedding a single flat folder of images with multi-threading

In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Parameters
dataset_folder = 'ima'            # One folder containing all images
image_size = (50, 50)             # Resize dimension
max_threads = 16                  # Tune based on your CPU (8–32 is typical)

# Load Haar cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# The constructor does not raise if the XML cannot be read; the classifier just stays
# empty and every later detection call fails. Fail fast with a clear message instead.
if face_cascade.empty():
    raise RuntimeError("Failed to load Haar cascade 'haarcascade_frontalface_default.xml'")

# Thread-safe storage (appends are guarded by `lock`)
embeddings = []
labels = []
lock = threading.Lock()

# Per-thread storage: every worker thread lazily builds its own CascadeClassifier.
# detectMultiScale is not documented as safe to call concurrently on one shared
# instance, so the classifier is not shared between threads.
_thread_state = threading.local()


def _get_face_cascade():
    """Return the calling thread's CascadeClassifier, creating it on first use."""
    cascade = getattr(_thread_state, 'face_cascade', None)
    if cascade is None:
        cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        _thread_state.face_cascade = cascade
    return cascade


# Function to process a single image file
def process_image(filename):
    """Detect the first face in ``dataset_folder/filename`` and record it.

    Parameters
    ----------
    filename : str
        Name of a file inside ``dataset_folder``. Names that do not end in
        ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are skipped.

    Returns
    -------
    None
        On success the face crop (RGB, resized to ``image_size``, flattened to
        float32 and scaled by 1/255) is appended to the module-level ``embeddings``
        list and ``filename`` to ``labels``, both under ``lock``. Images without a
        detected face add nothing. Any exception is caught and printed, so one bad
        file does not abort the whole run.
    """
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        return

    filepath = os.path.join(dataset_folder, filename)
    try:
        # Load and preprocess image; the context manager releases the file handle.
        with Image.open(filepath) as image:
            img_np = np.array(image.convert('RGB'))
        gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)

        # Detect faces
        faces = _get_face_cascade().detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
        if len(faces) == 0:
            return  # no face found in this image

        # Only use the first face
        x, y, w, h = faces[0]
        face = img_np[y:y+h, x:x+w]
        face_resized = cv2.resize(face, image_size)
        face_flat = face_resized.flatten().astype(np.float32) / 255.0

        # Append both entries under one lock so embedding i always matches label i.
        with lock:
            embeddings.append(face_flat)
            labels.append(filename)
    except Exception as e:
        print(f"❌ Error processing {filename}: {e}")

# Start processing
start_time = time.time()
# os.listdir returns entries in arbitrary order; sort for a stable submission order.
image_files = sorted(os.listdir(dataset_folder))

# Run multithreading
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = [executor.submit(process_image, f) for f in image_files]
    for _ in as_completed(futures):
        pass  # Wait for all

# Convert the collected results to arrays
embeddings_np = np.array(embeddings, dtype=np.float32)
labels_np = np.array(labels)

# Worker threads append in completion order, which changes from run to run. The labels
# here are filenames from a single directory (hence unique), so ordering rows by label
# makes the saved arrays reproducible while keeping each embedding paired with its label.
order = np.argsort(labels_np, kind='stable')
embeddings_np = embeddings_np[order]
labels_np = labels_np[order]

if len(embeddings_np) == 0:
    print("⚠️ No faces were detected; the saved arrays are empty.")

# Save outputs
# NOTE(review): these are the same filenames the earlier labeled-subfolder cell writes,
# so running both cells leaves only this cell's results on disk.
np.save('face_embeddings.npy', embeddings_np)
np.save('face_labels.npy', labels_np)

end_time = time.time()

print(f"\n✅ Saved {len(embeddings_np)} face embeddings.")
print(f"⏱️ Embedding time: {end_time - start_time:.2f} seconds")
