# Image Dataset Preprocessing

**Librerías Necesarias**

In [1]:
import os
from PIL import Image, UnidentifiedImageError
import splitfolders
import shutil

**Detección de Imágenes Corruptas/No válidas**

Esta sección revisa si existen imágenes dentro del dataset que no son válidas o que estén corruptas.
Las que sean detectadas serán reemplazadas manualmente por otras imágenes.

In [2]:
def is_image_corrupted(image_path):
    """
    Checks whether a file is a corrupted (or otherwise unreadable) image.

    The check runs in two passes. ``Image.verify()`` inspects the file without
    decoding the pixel data, so a file whose data is cut short can still pass
    it. The second pass therefore reopens the file (an image object must not be
    reused after ``verify()``) and fully decodes it with ``load()``.

    Parameters:
    image_path (str): The path to the image file.

    Returns:
    bool: True if the image is corrupted, False otherwise.
    """
    try:
        # Pass 1: structural check only, no pixel decoding
        with Image.open(image_path) as img:
            img.verify()

        # Pass 2: decode the pixel data to catch truncated/damaged image data
        with Image.open(image_path) as img:
            img.load()

        return False  # Image opened, verified and decoded cleanly
    except (OSError, SyntaxError, ValueError, EOFError) as e:
        # Any of these while opening or decoding means the file cannot be used
        # as an image (IOError is an alias of OSError in Python 3)
        print(f"\nCorrupted: {image_path} - {e}")
        return True  # Image is corrupted


def check_images_in_folder(main_folder):
    """
    Scan a folder tree and report every file that fails the corruption check.

    Every file found under ``main_folder`` (at any depth) is passed to
    ``is_image_corrupted``; the paths it flags are collected and listed.

    Args:
        main_folder (str): The path to the main folder containing the images.

    Returns:
        None

    Prints either the number of corrupted images followed by their paths, or a
    message saying that none were found.
    """
    # Walk the tree once, keeping only the paths flagged as corrupted
    damaged = [
        candidate
        for current_dir, _subdirs, names in os.walk(main_folder)
        for candidate in (os.path.join(current_dir, name) for name in names)
        if is_image_corrupted(candidate)
    ]

    if not damaged:
        print("No corrupted images found.")
        return

    print(f"\nFound {len(damaged)} corrupted images.")
    for damaged_path in damaged:
        print(damaged_path)


In [3]:
# Check images in the 'dinosaurs' folder: every file under it (class subfolders
# included) is run through the corruption check and the result is printed
main_folder_path = 'dataset/dinosaurs'
check_images_in_folder(main_folder_path)

No corrupted images found.


**WEBP Image Conversion**

Esta sección detecta y convierte archivos con extensión WEBP a formato PNG para que sean editables.
Al convertirlas es posible hacer un corte manual de aquellas imágenes que lo requieran.

In [4]:
def convert_and_replace_webp_with_png(main_folder):
    """
    Converts every WebP file under the specified folder to PNG and deletes the
    original WebP file once the PNG has been written.

    Files are matched by a case-insensitive ``.webp`` extension. An existing
    PNG is never overwritten: if ``name.png`` is already present next to
    ``name.webp``, the converted file is written as ``name_1.png`` (then
    ``name_2.png``, ...). A file that fails to convert is reported and left in
    place, and the remaining files are still processed.

    Args:
        main_folder (str): The path to the main folder containing the WebP files.

    Returns:
        None. Progress, failures and the total number of replaced files are printed.
    """
    target_extension = '.webp'
    replaced_count = 0  # Number of WebP files successfully replaced

    # os.walk() yields nothing for a missing folder, which would make a mistyped
    # path look exactly like "nothing to convert" -- say so explicitly.
    if not os.path.isdir(main_folder):
        print(f"Warning: folder not found: {main_folder}")

    for root, _, files in os.walk(main_folder):
        for file in files:
            if not file.lower().endswith(target_extension):
                continue

            file_path = os.path.join(root, file)
            base_path = os.path.splitext(file_path)[0]

            # Pick a PNG name that does not clobber an unrelated existing image
            png_path = base_path + '.png'
            suffix = 1
            while os.path.exists(png_path):
                png_path = f"{base_path}_{suffix}.png"
                suffix += 1

            try:
                with Image.open(file_path) as img:
                    # Convert to a standard color mode before saving
                    img.convert('RGBA').save(png_path, 'PNG')
                os.remove(file_path)  # Remove the original WebP file
                replaced_count += 1
                print(f"Converted and replaced: {file_path} to {png_path}")
            except Exception as e:
                # Best effort: keep going with the other files. If the source
                # WebP is still on disk, drop whatever PNG was written so a
                # partial or duplicate image is not left behind.
                if os.path.exists(file_path) and os.path.exists(png_path):
                    os.remove(png_path)
                print(f"Failed to convert {file_path}: {e}")

    # Print the total count of replaced files
    print(f"\nTotal WebP files replaced: {replaced_count}")

In [5]:
# Convert and replace WebP files with PNG in the 'dinosaurs' folder.
# The dataset lives under 'dataset/', the same location every other cell reads from.
main_folder_path = 'dataset/dinosaurs'
convert_and_replace_webp_with_png(main_folder_path)


Total WebP files replaced: 0


**Class Balance Verification**

In [6]:
def count_images_by_format(folder_path):
    """
    Counts the files in each immediate subfolder of the given folder, grouped by file extension.

    Only regular files directly inside each subfolder are counted; nested
    directories are skipped. Extensions are lower-cased, so ``.JPG`` and
    ``.jpg`` share one entry. Files are grouped purely by extension -- their
    contents are not inspected. Subfolders and files are visited in sorted
    order so the result is the same on every run.

    Args:
        folder_path (str): The path to the folder containing subfolders with images.

    Returns:
        dict: A dictionary containing the counts of files for each subfolder and file extension.
              The dictionary has the following structure:
              {
                  subfolder1: {
                      file_extension1: count1,
                      file_extension2: count2,
                      ...
                  },
                  ...
              }
              A subfolder without files maps to an empty dictionary.
    """
    counts = {}
    for subfolder in sorted(os.listdir(folder_path)):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue  # Loose files next to the class folders are not part of any class

        extension_counts = {}
        for entry in sorted(os.listdir(subfolder_path)):
            # A nested directory is not an image, whatever its name looks like
            if not os.path.isfile(os.path.join(subfolder_path, entry)):
                continue
            file_extension = os.path.splitext(entry)[1].lower()
            extension_counts[file_extension] = extension_counts.get(file_extension, 0) + 1

        counts[subfolder] = extension_counts
    return counts

In [7]:
# Count images by format in the 'dinosaurs' folder and check that every class
# folder holds the expected number of images.
EXPECTED_IMAGES_PER_CLASS = 100

folder_path = 'dataset/dinosaurs'
counts = count_images_by_format(folder_path)
for subfolder, formats in counts.items():
    print(f"\n{subfolder}:")
    # 'extension' rather than 'format', which would shadow the builtin
    for extension, count in formats.items():
        print(f"  {extension}: {count}")
    total = sum(formats.values())
    if total == EXPECTED_IMAGES_PER_CLASS:
        print(f"Succes: All images are present. \nTotal: {total} images")
    elif total < EXPECTED_IMAGES_PER_CLASS:
        print(f"Error: {total} images counted, missing {EXPECTED_IMAGES_PER_CLASS - total} images.")
    else:
        # A surplus must not be reported as a negative number of missing images
        print(f"Error: {total} images counted, {total - EXPECTED_IMAGES_PER_CLASS} more than expected.")


Allosaurus:
  .jpeg: 5
  .jpg: 60
  .png: 35
Succes: All images are present. 
Total: 100 images

Ankylosaurus:
  .jpg: 53
  .png: 45
  .jpeg: 2
Succes: All images are present. 
Total: 100 images

Baryonyx:
  .png: 40
  .jpg: 56
  .jpeg: 4
Succes: All images are present. 
Total: 100 images

Brachiosaurus:
  .jpg: 69
  .png: 29
  .jpeg: 2
Succes: All images are present. 
Total: 100 images

Carnotaurus:
  .jpg: 61
  .png: 36
  .jpeg: 3
Succes: All images are present. 
Total: 100 images

Corythosaurus:
  .jpg: 64
  .jpeg: 2
  .png: 34
Succes: All images are present. 
Total: 100 images

Dilophosaurus:
  .jpg: 62
  .png: 35
  .jpeg: 3
Succes: All images are present. 
Total: 100 images

Pachycephalosaurus:
  .jpg: 66
  .png: 30
  .jpeg: 4
Succes: All images are present. 
Total: 100 images

Parasaurolophus:
  .png: 31
  .jpg: 67
  .jpeg: 2
Succes: All images are present. 
Total: 100 images

Pteranodon:
  .jpeg: 2
  .jpg: 67
  .png: 31
Succes: All images are present. 
Total: 100 images

Spinos

**Image padding, resizing and conversion to BMP**

In [8]:
def _flatten_to_rgb(image):
    """Return an RGB copy of ``image`` with any transparency composited onto white."""
    has_alpha = 'A' in image.getbands() or 'transparency' in image.info
    if not has_alpha:
        return image.convert('RGB')

    # Normalising to RGBA first gives every transparent input (RGBA, LA,
    # palette with a transparency entry, ...) the same alpha channel to use
    # as the paste mask.
    rgba = image.convert('RGBA')
    background = Image.new('RGB', rgba.size, (255, 255, 255))
    background.paste(rgba, mask=rgba.split()[-1])
    return background


def process_images(original_folder_path, output_folder_path, max_subfolders=None):
    """
    Process images within subfolders of the specified folder by renaming, converting, padding to square,
    and resizing to 100x100 pixels. The processed images will be stored in a new folder.

    Every image is first flattened to RGB (transparent areas become white), padded with white to a
    square, resized to 100x100 and saved as ``<subfolder>_<index>.bmp`` in a folder structure that
    mirrors the original. Subfolders and files are visited in sorted order so the generated names
    are the same on every run.

    WARNING: the output folder is deleted and recreated on every call.

    Args:
        original_folder_path (str): Path to the original folder containing subfolders with images.
        output_folder_path (str): Path to the output folder that will contain the processed images.
        max_subfolders (int, optional): Maximum number of subfolders to process. If None, process all subfolders.

    Raises:
        FileNotFoundError: If the original folder does not exist (raised before the output folder is touched).
        ValueError: If the output folder is the original folder or one of its parents, since clearing it
            would delete the source images.
        Exception: If any error occurs during the processing of images.
    """
    # Validate before the destructive reset of the output folder below
    if not os.path.isdir(original_folder_path):
        raise FileNotFoundError(f"Original folder not found: {original_folder_path}")

    source_abs = os.path.normcase(os.path.abspath(original_folder_path))
    output_abs = os.path.normcase(os.path.abspath(output_folder_path))
    if source_abs == output_abs or source_abs.startswith(output_abs.rstrip(os.sep) + os.sep):
        raise ValueError(
            f"Output folder '{output_folder_path}' is or contains the original folder "
            f"'{original_folder_path}'; clearing it would delete the source images."
        )

    try:
        # Always start from an empty output folder
        if os.path.exists(output_folder_path):
            shutil.rmtree(output_folder_path)
        os.makedirs(output_folder_path)

        # Initialize the counter for processed subfolders
        processed_subfolders = 0

        # Process each subfolder in the original folder
        for root, dirs, files in os.walk(original_folder_path):
            if max_subfolders is not None and processed_subfolders >= max_subfolders:
                break

            # Sorting in place makes os.walk descend in a fixed order
            dirs.sort()

            rel_path = os.path.relpath(root, original_folder_path)
            current_output_dir = os.path.join(output_folder_path, rel_path)

            if not os.path.exists(current_output_dir):
                os.makedirs(current_output_dir)

            # Files sitting directly in the top-level folder belong to no class
            if root == original_folder_path:
                continue

            subfolder_name = os.path.basename(root)

            print(f"Processing folder: {subfolder_name}")

            # Process each image file in the subfolder
            for index, file_name in enumerate(sorted(files)):
                file_path = os.path.join(root, file_name)

                if not os.path.isfile(file_path):
                    continue

                try:
                    with Image.open(file_path) as source_image:
                        # Same mode for every output file; transparency becomes white
                        img = _flatten_to_rgb(source_image)

                        # Padding to make the image square
                        width, height = img.size
                        if width != height:
                            max_dim = max(width, height)
                            square = Image.new("RGB", (max_dim, max_dim), (255, 255, 255))
                            square.paste(img, ((max_dim - width) // 2, (max_dim - height) // 2))
                            img = square

                        # Resize to 100x100 pixels using LANCZOS (formerly ANTIALIAS)
                        img = img.resize((100, 100), Image.Resampling.LANCZOS)

                        # Rename the file according to the subfolder name and index
                        new_file_name = f"{subfolder_name}_{index}.bmp"
                        new_file_path = os.path.join(current_output_dir, new_file_name)

                        # Save the converted image as .bmp
                        img.save(new_file_path, format='BMP')

                        print(f"Processed: {new_file_name}")

                except UnidentifiedImageError as e:
                    raise Exception(f"Error processing {file_name}: Unidentified image file. {e}") from e

            print(f"Finished processing folder: {subfolder_name}\n")
            processed_subfolders += 1

    except Exception as e:
        raise Exception(f"An error occurred while processing images: {e}") from e

In [9]:
# Process images in the 'dinosaurs' folder and save the processed images in a new folder.
# NOTE: max_subfolders=3 limits this run to three class subfolders; pass
# max_subfolders=None to process every class.
process_images('dataset/dinosaurs', 'dataset/processed', max_subfolders=3)

Processing folder: Allosaurus
Processed: Allosaurus_0.bmp
Processed: Allosaurus_1.bmp
Processed: Allosaurus_2.bmp
Processed: Allosaurus_3.bmp
Processed: Allosaurus_4.bmp
Processed: Allosaurus_5.bmp
Processed: Allosaurus_6.bmp
Processed: Allosaurus_7.bmp
Processed: Allosaurus_8.bmp
Processed: Allosaurus_9.bmp
Processed: Allosaurus_10.bmp
Processed: Allosaurus_11.bmp
Processed: Allosaurus_12.bmp
Processed: Allosaurus_13.bmp
Processed: Allosaurus_14.bmp
Processed: Allosaurus_15.bmp
Processed: Allosaurus_16.bmp
Processed: Allosaurus_17.bmp
Processed: Allosaurus_18.bmp
Processed: Allosaurus_19.bmp
Processed: Allosaurus_20.bmp
Processed: Allosaurus_21.bmp
Processed: Allosaurus_22.bmp
Processed: Allosaurus_23.bmp
Processed: Allosaurus_24.bmp
Processed: Allosaurus_25.bmp
Processed: Allosaurus_26.bmp
Processed: Allosaurus_27.bmp
Processed: Allosaurus_28.bmp
Processed: Allosaurus_29.bmp
Processed: Allosaurus_30.bmp
Processed: Allosaurus_31.bmp
Processed: Allosaurus_32.bmp
Processed: Allosaurus_3

**Image Processing Validation**

In [10]:
# Re-run the corruption check on the processed output
check_images_in_folder('dataset/processed')

No corrupted images found.


In [11]:
# Confirm that every processed class folder holds the expected number of images
EXPECTED_IMAGES_PER_CLASS = 100

counts = count_images_by_format('dataset/processed')

for subfolder, formats in counts.items():
    print(f"\n{subfolder}:")
    # 'extension' rather than 'format', which would shadow the builtin
    for extension, count in formats.items():
        print(f"  {extension}: {count}")
    total = sum(formats.values())
    if total == EXPECTED_IMAGES_PER_CLASS:
        print(f"Succes: All images are present. \nTotal: {total} images")
    elif total < EXPECTED_IMAGES_PER_CLASS:
        print(f"Error: {total} images found, missing {EXPECTED_IMAGES_PER_CLASS - total} images.")
    else:
        # A surplus must not be reported as a negative number of missing images
        print(f"Error: {total} images found, {total - EXPECTED_IMAGES_PER_CLASS} more than expected.")


Allosaurus:
  .bmp: 100
Succes: All images are present. 
Total: 100 images

Ankylosaurus:
  .bmp: 100
Succes: All images are present. 
Total: 100 images

Baryonyx:
  .bmp: 100
Succes: All images are present. 
Total: 100 images


## Data Split

In [12]:
def clear_directory(directory):
    """
    Resets a directory to an empty state.

    An existing directory is deleted together with everything inside it and
    then recreated; a missing one is simply created (including any missing
    parent folders).

    Args:
        directory (str): The path to the directory to be cleared.

    Returns:
        None
    """
    already_present = os.path.exists(directory)
    if already_present:
        # Remove the whole tree; it is recreated empty right below
        shutil.rmtree(directory)

    os.makedirs(directory)

def split_dataset(input_folder, output_folder):
    """
    Split the images of the input folder into train/val/test sets inside the output folder.

    The output folder is emptied first, so the result of any previous split is
    replaced. The split uses a fixed seed (1337) and a fixed 0.7 / 0.1 / 0.2
    ratio.

    Parameters:
    input_folder (str): The path to the folder containing the input images.
    output_folder (str): The path to the folder where the split images will be saved.

    Returns:
    None
    """
    # Start from an empty target so files from an earlier split cannot linger
    clear_directory(output_folder)

    # Fixed settings passed to splitfolders for every run
    split_options = {
        "output": output_folder,
        "seed": 1337,
        "ratio": (0.7, 0.1, 0.2),
        "group_prefix": None,
    }
    splitfolders.ratio(input_folder, **split_options)
    

In [13]:
# Folder holding the processed images (output of process_images), one subfolder per class
processed_data_folder = "dataset/processed"

# Destination of the split; split_dataset clears this folder before writing to it
output_processed_split = "dataset/split"

# Split processed data and overwrite
split_dataset(processed_data_folder, output_processed_split)

print("Data splitting completed and target directories have been overwritten.")

Data splitting completed and target directories have been overwritten.


In [14]:
# Check the number of images per class in each folder of the split (train / val / test)
def validate_split(input_folder, folder_name):
    """
    Validate the split of images in the specified folder.

    Each immediate subfolder of ``input_folder`` is treated as one split. For
    every class inside it, the per-extension counts are printed and the class
    total is compared with the number expected for that split: 70 for
    ``train``, 20 for ``test`` and 10 for ``val``. The split is identified by
    the exact name of its folder, so the names of parent folders cannot
    influence the check. A folder with any other name is always reported as
    not balanced.

    Args:
        input_folder (str): Path to the folder containing the split images.
        folder_name (str): Label printed in the header line of the report.

    Returns:
        None
    """
    # Expected images per class and the wording used in the success message
    expected_by_split = {
        "train": (70, "training"),
        "test": (20, "testing"),
        "val": (10, "validation"),
    }

    print("-"*10,folder_name,"-"*10)
    for split in sorted(os.listdir(input_folder)):
        subfolder_path = os.path.join(input_folder, split)
        if not os.path.isdir(subfolder_path):
            continue

        print(f"\n{split}:")
        expected_total, split_label = expected_by_split.get(split, (None, None))
        counts = count_images_by_format(subfolder_path)
        for subfolder, formats in counts.items():
            print(f"\n{subfolder}:")
            # 'extension' rather than 'format', which would shadow the builtin
            for extension, count in formats.items():
                print(f"  {extension}: {count}")
            total = sum(formats.values())
            if expected_total is not None and total == expected_total:
                print(f"Succes: All images are present in {split_label} split \nTotal: {total} images")
            else:
                print(f"Error: {total} images found, split not balanced.")

In [15]:
# Validate the per-class image counts in every folder of the split, then run
# the corruption check over the whole split output
validate_split(output_processed_split, "Processed Split")
print("\n\n")
check_images_in_folder(output_processed_split)

---------- Processed Split ----------

test:

Allosaurus:
  .bmp: 20
Succes: All images are present in testing split 
Total: 20 images

Ankylosaurus:
  .bmp: 20
Succes: All images are present in testing split 
Total: 20 images

Baryonyx:
  .bmp: 20
Succes: All images are present in testing split 
Total: 20 images

train:

Allosaurus:
  .bmp: 70
Succes: All images are present in training split 
Total: 70 images

Ankylosaurus:
  .bmp: 70
Succes: All images are present in training split 
Total: 70 images

Baryonyx:
  .bmp: 70
Succes: All images are present in training split 
Total: 70 images

val:

Allosaurus:
  .bmp: 10
Succes: All images are present in validation split 
Total: 10 images

Ankylosaurus:
  .bmp: 10
Succes: All images are present in validation split 
Total: 10 images

Baryonyx:
  .bmp: 10
Succes: All images are present in validation split 
Total: 10 images



No corrupted images found.
