# Image Dataset Preprocessing

**Librerías necesarias**

In [1]:
import os
from PIL import Image

**Detección de Imágenes Corruptas/No válidas**

Esta sección revisa si existen imágenes dentro del dataset que no son válidas o que estén corruptas.
Las que sean detectadas serán reemplazadas manualmente por otras imágenes.

In [4]:
def is_image_corrupted(image_path):
    """
    Checks if an image file is corrupted by verifying and then decoding it.

    Two passes are made over the file:

    1. ``Image.verify()`` checks the file structure without decoding the
       pixel data, so damage inside the image data itself can go unnoticed.
    2. The file is opened again and ``load()`` forces a full decode. A fresh
       handle is used because an image object should not be reused after
       ``verify()`` has been called on it.

    Parameters:
    image_path (str): The path to the image file.

    Returns:
    bool: True if the image is corrupted, False otherwise.

    Any ``OSError`` (which includes Pillow's "cannot identify image file"),
    ``SyntaxError`` or ``ValueError`` raised while opening, verifying or
    decoding is reported on stdout and treated as corruption instead of
    being propagated, so a single bad file cannot abort a folder scan.
    """
    try:
        # Pass 1: structural check only (cheap, no pixel decoding)
        with Image.open(image_path) as img:
            img.verify()

        # Pass 2: full decode on a fresh handle to catch damaged pixel data
        with Image.open(image_path) as img:
            img.load()

        return False  # Image is not corrupted
    except (OSError, SyntaxError, ValueError) as e:
        print(f"\nCorrupted: {image_path} - {e}")
        return True  # Image is corrupted


def check_images_in_folder(main_folder):
    """
    Report every file under a folder tree that fails the corruption check.

    The tree rooted at ``main_folder`` is walked recursively and each file
    found (whatever its extension) is passed to ``is_image_corrupted``.

    Args:
        main_folder (str): The path to the main folder containing the images.

    Returns:
        None

    Prints either the number of flagged files followed by one path per line,
    or a message saying that nothing was found.
    """
    flagged_paths = [
        candidate
        for current_dir, _, file_names in os.walk(main_folder)
        for candidate in (os.path.join(current_dir, name) for name in file_names)
        if is_image_corrupted(candidate)
    ]

    # Guard clause: nothing flagged, so there is no list to print
    if not flagged_paths:
        print("No corrupted images found.")
        return

    print(f"\nFound {len(flagged_paths)} corrupted images.")
    print("\n".join(flagged_paths))


# Check images in the 'dinosaurs' folder (path is relative to the working
# directory). The whole folder tree is scanned and a report is printed.
main_folder_path = 'dinosaurs'
check_images_in_folder(main_folder_path)

No corrupted images found.


**WEBP Image Conversion**

Esta sección detecta y convierte archivos con extensión WEBP a formato PNG para que sean editables.
Al convertirlas es posible hacer un corte manual de aquellas imágenes que lo requieran.

In [3]:
def convert_and_replace_webp_with_png(main_folder):
    """
    Converts and replaces all WebP files in the specified folder with PNG files.

    The folder tree is walked recursively. Every file whose name ends in
    ``.webp`` (case-insensitive) is converted to RGBA and saved as a PNG next
    to the original; the original is deleted only after the PNG was written.

    An existing PNG is never overwritten: if ``<stem>.png`` already exists,
    the new file is written as ``<stem>_1.png``, ``<stem>_2.png``, ... so a
    different image that happens to share the stem is not lost.

    A failure on one file is printed and the walk continues. Any PNG written
    for that file is removed again, so a failed conversion leaves the folder
    as it was.

    Args:
        main_folder (str): The path to the main folder containing the WebP files.

    Returns:
        None. Progress and the total number of replaced files are printed.
    """
    target_extension = '.webp'
    replaced_count = 0  # Number of WebP files successfully replaced

    for root, _, files in os.walk(main_folder):
        for file in files:
            if not file.lower().endswith(target_extension):
                continue

            file_path = os.path.join(root, file)
            stem = os.path.splitext(file_path)[0]

            # Pick a target name that does not clobber an existing file
            png_path = stem + '.png'
            suffix = 1
            while os.path.exists(png_path):
                png_path = f"{stem}_{suffix}.png"
                suffix += 1

            try:
                with Image.open(file_path) as img:
                    # Convert to a standard color mode before saving
                    img.convert('RGBA').save(png_path, 'PNG')
                # Remove the original only once the PNG is safely on disk
                os.remove(file_path)
            except Exception as e:
                # Best-effort cleanup so no partial or duplicate PNG is left
                if os.path.exists(png_path):
                    try:
                        os.remove(png_path)
                    except OSError:
                        pass
                print(f"Failed to convert {file_path}: {e}")
                continue

            replaced_count += 1
            print(f"Converted and replaced: {file_path} to {png_path}")

    # Print the total count of replaced files
    print(f"\nTotal WebP files replaced: {replaced_count}")

# Convert and replace WebP files with PNG in the 'dinosaurs' folder.
# NOTE: this step modifies the dataset in place - each original .webp file
# is deleted once its PNG has been written.
main_folder_path = 'dinosaurs'
convert_and_replace_webp_with_png(main_folder_path)

Converted and replaced: dinosaurs\Allosaurus\20221202151219_1200x1200.webp to dinosaurs\Allosaurus\20221202151219_1200x1200.png
Converted and replaced: dinosaurs\Allosaurus\3d-rendered-illustration-allosaurus-600nw-2114901794.webp to dinosaurs\Allosaurus\3d-rendered-illustration-allosaurus-600nw-2114901794.png
Converted and replaced: dinosaurs\Allosaurus\779622605-9352e5212292af073a8e39722078e0ddc56ea567b4ab6c6012835ef91a8701bd-d.webp to dinosaurs\Allosaurus\779622605-9352e5212292af073a8e39722078e0ddc56ea567b4ab6c6012835ef91a8701bd-d.png
Converted and replaced: dinosaurs\Allosaurus\AlloNull.webp to dinosaurs\Allosaurus\AlloNull.png
Converted and replaced: dinosaurs\Allosaurus\Allorender.webp to dinosaurs\Allosaurus\Allorender.png
Converted and replaced: dinosaurs\Allosaurus\Allosaurus-jaws-Theropoda-flesh-teeth-meat-eating-dinosaurs.webp to dinosaurs\Allosaurus\Allosaurus-jaws-Theropoda-flesh-teeth-meat-eating-dinosaurs.png
Converted and replaced: dinosaurs\Allosaurus\allosaurus-prehis

**Image Padding and Resizing**

In [2]:
import os
from PIL import Image

def process_images(base_input_folder, base_output_folder,
                   target_size=(100, 100), background_color=(255, 255, 255)):
    """
    Pad every image to a square, resize it and save it as BMP.

    Each direct sub-folder of ``base_input_folder`` is mirrored under
    ``base_output_folder``. The files of a sub-folder are processed one at a
    time in sorted name order and written as ``<folder>_<n>.bmp``, where
    ``n`` starts at 1 and counts the images saved for that folder.

    Images are centred on a square canvas of ``background_color`` whose side
    is the longer image side. Images that carry transparency are composited
    using their alpha channel, so transparent areas take the background
    colour instead of whatever RGB values are stored underneath.

    Args:
        base_input_folder (str): Folder containing one sub-folder per group
            of images.
        base_output_folder (str): Folder that receives the processed images;
            created if missing.
        target_size (tuple[int, int]): Final (width, height) in pixels.
            Defaults to (100, 100).
        background_color (tuple[int, int, int]): RGB padding colour.
            Defaults to white.

    Returns:
        None. A completion message is printed at the end.

    Entries that are not regular files are ignored. Files that cannot be
    opened or decoded as images are reported on stdout and skipped.
    """
    # Sorted listings make the numbering of the output files reproducible
    for folder_name in sorted(os.listdir(base_input_folder)):
        input_folder = os.path.join(base_input_folder, folder_name)
        if not os.path.isdir(input_folder):
            continue

        output_folder = os.path.join(base_output_folder, folder_name)
        os.makedirs(output_folder, exist_ok=True)

        index = 0
        for file_name in sorted(os.listdir(input_folder)):
            image_path = os.path.join(input_folder, file_name)
            if not os.path.isfile(image_path):
                continue  # nested folders are not images

            try:
                # Open one file at a time; convert() decodes the pixels, so
                # the file handle can be released as soon as the block ends.
                with Image.open(image_path) as source:
                    has_alpha = (
                        source.mode in ('RGBA', 'LA', 'PA')
                        or 'transparency' in source.info
                    )
                    image = source.convert('RGBA' if has_alpha else 'RGB')
            except (OSError, SyntaxError, ValueError) as e:
                print(f"Skipped {image_path}: {e}")
                continue

            max_side = max(image.width, image.height)

            # Calculate padding to make the image square
            horizontal_padding = (max_side - image.width) // 2
            vertical_padding = (max_side - image.height) // 2

            # Create a square canvas with the max dimension and paste the
            # image centred on it, using its alpha channel as mask if present
            padded_image = Image.new('RGB', (max_side, max_side), background_color)
            padded_image.paste(
                image,
                (horizontal_padding, vertical_padding),
                image if has_alpha else None,
            )

            # Resize to the desired output size
            resized_image = padded_image.resize(target_size)

            # Save the processed image in BMP format
            index += 1
            image_name = f"{folder_name}_{index}.bmp"
            resized_image.save(os.path.join(output_folder, image_name), 'BMP')

    print("Image processing complete.")

# Update these paths as necessary (both are relative to the working directory).
# Each sub-folder of the input folder gets a sub-folder of the same name in
# the output folder, holding the processed BMP files.
base_input_folder = r'dinosaurs'
base_output_folder = r'dinosaurs_processed'
process_images(base_input_folder, base_output_folder)



Image processing complete.
