In [1]:
import os
from PIL import Image, UnidentifiedImageError
import hashlib

def dhash(image, hash_size=8):
    """
    Return the difference hash ('dhash') of an image as a bit string.

    The image is converted to greyscale and resized to
    (hash_size + 1) x hash_size pixels. Each pixel is then compared with its
    right-hand neighbour in the same row; a '1' is emitted when the left pixel
    is brighter, otherwise a '0'. The result has hash_size * hash_size
    characters, in row-major order.
    """
    # One extra column so that every row yields hash_size neighbour pairs.
    stride = hash_size + 1
    grey = image.convert('L').resize((stride, hash_size))
    values = list(grey.getdata())

    return ''.join(
        '1' if values[r * stride + c] > values[r * stride + c + 1] else '0'
        for r in range(hash_size)
        for c in range(hash_size)
    )

def find_duplicates_in_directory(directory_path):
    """
    Find and remove 50% of duplicate images in a directory.

    Images are grouped by their dhash. The first image seen with a given hash
    is always kept; among the images that share a hash, the 2nd, 4th, 6th, ...
    occurrence is deleted from disk and the odd-numbered ones are kept.

    Files are visited in sorted name order so repeated runs over the same
    directory contents select the same files. Entries that are not regular
    files, or that cannot be opened/decoded as images, are reported and
    skipped instead of aborting the whole run.

    Args:
        directory_path: Directory whose image files (.png, .jpg, .jpeg,
            matched case-insensitively) are scanned. Not recursive.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    occurrences = {}  # hash -> number of images seen so far with that hash

    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(image_extensions):
            continue
        filepath = os.path.join(directory_path, filename)
        # A sub-directory may carry an image-like name; never try to open it.
        if not os.path.isfile(filepath):
            continue

        try:
            with Image.open(filepath) as img:
                image_hash = dhash(img)
        except (UnidentifiedImageError, OSError) as err:
            print(f"Skipping unreadable image: {filename} ({err})")
            continue

        seen = occurrences.get(image_hash, 0) + 1
        occurrences[image_hash] = seen

        # Every even occurrence is dropped, thus keeping only 50% of duplicates.
        # The file is removed only after the image handle above is closed,
        # because an open file cannot be deleted on Windows.
        if seen % 2 == 0:
            print(f"Duplicate found: {filename} - Removing!")
            os.remove(filepath)
    print("Processing complete for:", directory_path)

def balance_classes(base_directory_path):
    """
    Balance number of images in each class to match the class with the least number.

    Every immediate sub-directory of base_directory_path is treated as one
    class. Image files (.png, .jpg, .jpeg, matched case-insensitively) are
    listed in sorted name order; in each class the first N are kept and the
    rest are deleted from disk, where N is the smallest image count across all
    classes. Non-image files are never touched.

    Note that a class folder containing no images makes N zero, which removes
    every image from all other classes.

    Args:
        base_directory_path: Directory containing one sub-directory per class.
            If it contains no sub-directories, nothing is done.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # List each class once so the counts and the files chosen for removal
    # come from the same snapshot of the directory.
    images_by_class = {}
    for class_folder in sorted(os.listdir(base_directory_path)):
        class_folder_path = os.path.join(base_directory_path, class_folder)
        if os.path.isdir(class_folder_path):
            images_by_class[class_folder] = sorted(
                f for f in os.listdir(class_folder_path)
                if f.lower().endswith(image_extensions)
            )

    # min() of an empty sequence raises ValueError; with no classes there is
    # simply nothing to balance.
    if not images_by_class:
        print("No class folders found in:", base_directory_path)
        return

    # Getting the minimum class count
    min_class_count = min(len(files) for files in images_by_class.values())

    # Balancing classes
    for class_folder, files in images_by_class.items():
        class_folder_path = os.path.join(base_directory_path, class_folder)
        for ef in files[min_class_count:]:
            print(f"Removing {ef} from {class_folder} for balancing.")
            os.remove(os.path.join(class_folder_path, ef))

# Starting directory path: one sub-folder per class is expected inside it.
# Built with os.path.join so the separator is right on every platform
# (on Windows the result is unchanged: sugi\Frame sugi).
base_directory_path = os.path.join("sugi", "Frame sugi")

# Iterating over each class/folder in the base directory for removing duplicates.
# This must run before balancing so class sizes are compared after de-duplication.
for class_folder in os.listdir(base_directory_path):
    class_folder_path = os.path.join(base_directory_path, class_folder)
    if os.path.isdir(class_folder_path):
        find_duplicates_in_directory(class_folder_path)

# Balancing the classes
balance_classes(base_directory_path)


Duplicate found: sugi_136695.jpg - Removing!
Duplicate found: sugi_136720.jpg - Removing!
Duplicate found: sugi_136734.jpg - Removing!
Duplicate found: sugi_136760.jpg - Removing!
Duplicate found: sugi_136782.jpg - Removing!
Duplicate found: sugi_136811.jpg - Removing!
Duplicate found: sugi_136816.jpg - Removing!
Duplicate found: sugi_136826.jpg - Removing!
Duplicate found: sugi_136832.jpg - Removing!
Duplicate found: sugi_136836.jpg - Removing!
Duplicate found: sugi_136841.jpg - Removing!
Duplicate found: sugi_136845.jpg - Removing!
Duplicate found: sugi_136850.jpg - Removing!
Duplicate found: sugi_136856.jpg - Removing!
Duplicate found: sugi_136860.jpg - Removing!
Duplicate found: sugi_136864.jpg - Removing!
Duplicate found: sugi_136869.jpg - Removing!
Duplicate found: sugi_136873.jpg - Removing!
Duplicate found: sugi_136878.jpg - Removing!
Duplicate found: sugi_136882.jpg - Removing!
Duplicate found: sugi_136886.jpg - Removing!
Duplicate found: sugi_136892.jpg - Removing!
Duplicate 