In [2]:
import os
import hashlib
from PIL import Image

In [4]:


def calculate_image_hash(image):
    """
    Fingerprints an image by hashing its normalised pixel data.

    The image is first converted to RGB and scaled to 224x224 so that the
    digest is computed over pixel buffers with the same mode and dimensions,
    whatever the mode and size of the input image.

    Parameters:
        image (PIL.Image): The image to fingerprint.

    Returns:
        str: Hexadecimal MD5 digest of the normalised pixel bytes.
    """
    # Step 1: bring the image to a common colour mode.
    rgb_image = image.convert("RGB")

    # Step 2: bring the image to a common size.
    normalised = rgb_image.resize((224, 224))

    # Step 3: digest the raw pixel buffer.
    digest = hashlib.md5()
    digest.update(normalised.tobytes())
    return digest.hexdigest()

def clean_images(directory, img_size=(224, 224), valid_formats=("jpeg", "jpg", "png")):
    """
    Cleans images in the specified directory tree, in place.

    WARNING: this is destructive. For every file found under ``directory``:

    * files whose name does not end with one of ``valid_formats`` are deleted;
    * files that cannot be opened/decoded as an image are deleted;
    * files whose image hash was already seen earlier in the walk are deleted
      as duplicates (the first file encountered is the one that is kept);
    * remaining images whose size differs from ``img_size`` are resized and
      written back to the same path.

    Directories and files are visited in sorted order so that the choice of
    which duplicate survives is reproducible between runs.

    Parameters:
        directory (str): The directory containing images to clean.
        img_size (tuple): The desired image size (width, height).
        valid_formats (tuple): Valid file-name endings (extensions),
            matched case-insensitively. A single string or any iterable of
            strings is accepted.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    # str.endswith only accepts a str or a tuple of str, and file names are
    # lower-cased before matching, so normalise the endings up front.
    if isinstance(valid_formats, str):
        suffixes = (valid_formats.lower(),)
    else:
        suffixes = tuple(fmt.lower() for fmt in valid_formats)

    # Image sizes are compared against a tuple; a list would never be equal.
    target_size = tuple(img_size)

    # Track hashes to identify duplicates (hash -> first path seen).
    seen_hashes = {}

    for subdir, dirnames, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        for file in sorted(files):
            file_path = os.path.join(subdir, file)

            # Check if the file is an image (by name only).
            if not file.lower().endswith(suffixes):
                print(f"Removing non-image file: {file_path}")
                _remove_file(file_path)
                continue

            # Only decoding is covered by the broad handler: any failure
            # here is taken to mean the file is not a usable image. Nothing
            # is deleted or written while the file is still open.
            try:
                with Image.open(file_path) as img:
                    img_hash = calculate_image_hash(img)
                    is_duplicate = img_hash in seen_hashes
                    resized = None
                    if not is_duplicate and img.size != target_size:
                        resized = img.resize(target_size)
            except Exception:
                print(f"Removing corrupted or invalid image: {file_path}")
                _remove_file(file_path)
                continue

            if is_duplicate:
                print(f"Removing duplicate image: {file_path}")
                _remove_file(file_path)
                continue

            # Store the hash and resize the image if needed.
            seen_hashes[img_hash] = file_path
            if resized is not None:
                print(f"Resizing image: {file_path}")
                _save_resized(resized, file_path)


def _remove_file(file_path):
    """Deletes file_path; reports a failure instead of raising so the walk continues."""
    try:
        os.remove(file_path)
    except OSError as error:
        print(f"Could not remove file: {file_path} ({error})")
        return False
    return True


def _save_resized(image, file_path):
    """Writes image over file_path via a sibling temp file; keeps the original on failure."""
    # The temp name keeps the original extension so the output format is
    # still chosen from the extension, as when saving to file_path directly.
    root, ext = os.path.splitext(file_path)
    temp_path = f"{root}.resizing{ext}"
    try:
        image.save(temp_path)
        os.replace(temp_path, file_path)
    except Exception as error:
        print(f"Could not resize image (original kept): {file_path} ({error})")
        if os.path.exists(temp_path):
            _remove_file(temp_path)
        return False
    return True




In [5]:
# Example usage: root folder of the image dataset to clean.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept as-is in case other cells refer to it.
dir = r"C:\Mushroom\data"

# Walk everything under `dir`: delete non-image, unreadable and duplicate
# files, and resize the remaining images to 224x224 in place.
clean_images(directory=dir, img_size=(224, 224))
 

Resizing image: C:\Mushroom\data\Agaricus bisporus\1.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\10.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\100.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1000.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1001.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1002.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1003.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1004.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1005.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1007.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1008.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1009.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\101.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1010.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1011.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\1012.png
Resizing image: C:\Mushroom\data\Agaricus bispo



Resizing image: C:\Mushroom\data\Agaricus bisporus\638.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\639.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\64.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\640.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\641.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\642.png
Removing duplicate image: C:\Mushroom\data\Agaricus bisporus\643.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\644.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\645.png
Removing duplicate image: C:\Mushroom\data\Agaricus bisporus\646.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\647.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\648.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\649.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\65.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\650.png
Resizing image: C:\Mushroom\data\Agaricus bisporus\651.png
Resizing image: C:\Mushroom\data\Agari