In [4]:
import os
import random
import shutil
from collections import defaultdict

# Source folder holding the UTKFace images
dataset_path = r"D:\project\Dataset\UTK"
# Destination folder for the chosen subset
output_path = r"D:\project\Dataset\Selected_UTK"

# Make sure the destination folder exists before anything else runs
os.makedirs(output_path, exist_ok=True)

# Maps (age, gender, race) -> list of file names carrying those labels
image_dict = defaultdict(list)

# Walk the dataset folder and group every parsable .jpg by its labels.
# A usable file name has at least four '_'-separated fields, the first
# three of which must be integers: age, gender, race.
for filename in os.listdir(dataset_path):
    if not filename.endswith('.jpg'):
        continue

    fields = filename.split('_')
    if len(fields) < 4:
        print(f"Skipping file due to unexpected format: {filename}")
        continue

    try:
        # gender: 0 for male, 1 for female; race: 0-4 for different races
        age, gender, race = map(int, fields[:3])
    except ValueError as e:
        # A non-numeric (or empty) label field; report it and move on
        print(f"Skipping file due to error: {filename} - {e}")
    else:
        image_dict[(age, gender, race)].append(filename)

# Maximum number of images to keep in the subset
total_images = 5000

# Every (age, gender, race) group contributes all of its images to the
# candidate pool, so the pool is simply the flattened dictionary.
all_available_images = [img for images in image_dict.values() for img in images]

# Draw without replacement. random.sample already returns its picks in random
# order, so no separate shuffle or slice is needed afterwards.
# When the pool holds fewer than total_images files, every file is kept exactly
# once instead of padding the list with repeats: a repeated name would be
# copied onto the same destination file and inflate the reported count.
selected_images = random.sample(
    all_available_images, min(total_images, len(all_available_images))
)

# Duplicate each chosen file from the dataset folder into the output folder,
# keeping its original file name.
for image_name in selected_images:
    shutil.copy(
        os.path.join(dataset_path, image_name),
        os.path.join(output_path, image_name),
    )

# Report how many names were processed and where the copies went
print(f"Selected {len(selected_images)} images and saved them to {output_path}.")


Skipping file due to unexpected format: 39_1_20170116174525125.jpg
Skipping file due to error: 53__0_20170116184028385.jpg - invalid literal for int() with base 10: ''
Skipping file due to unexpected format: 61_1_20170109142408075.jpg
Skipping file due to unexpected format: 61_3_20170109150557335.jpg
Selected 5000 images and saved them to D:\project\Dataset\Selected_UTK.
