# Generating the Actual Dataset for the Damage Detection Phase

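This script applies one randomly chosen, reproducible damage effect (scratches, water discolouration, or craquelure) to each image in a dataset. The candidate set also includes a `NoDamage` class, so not every image necessarily receives visible damage. The random choice is seeded from a SHA-256 hash of each image's path, so re-running the script on the same paths selects the same damage class for each image. It uses multiprocessing to spread the work across multiple CPU cores and saves both the damaged image and its corresponding damage mask. A JSON metadata file records, for each image, the original filename, the damage type applied, and the output file paths.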

In [1]:
import os
import sys
import json
import random
from pathlib import Path
import shutil
import hashlib
from concurrent.futures import ProcessPoolExecutor, as_completed

# Add src/ to sys.path
project_root = os.path.abspath("../..")
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

import numpy as np
from PIL import Image, ImageDraw, ImageFilter
from tqdm import tqdm

# Local import
from maskerada import (ScratchDamage, WaterDiscolouration, 
                       CraquelureDamage, NoDamage)

In [2]:
def get_deterministic_seed(image_path):
    """
    Generate a deterministic integer seed from an image path using SHA-256 hash.

    The seed depends only on the path text exactly as given: the same path
    always yields the same seed, while a different spelling of the same file
    (for example relative vs. absolute) yields a different one.

    Args:
        image_path (str or os.PathLike): Path to the input image file.

    Returns:
        int: 32-bit seed value in the range [0, 2**32).
    """
    # os.fspath accepts pathlib.Path objects as well as plain strings, so
    # callers do not have to stringify paths themselves.
    path_text = os.fspath(image_path)
    if isinstance(path_text, bytes):
        path_bytes = path_text
    else:
        path_bytes = path_text.encode("utf-8")

    # Reduce the 256-bit digest to 32 bits so the value is a valid seed for
    # RNGs that only accept 32-bit seeds.
    return int(hashlib.sha256(path_bytes).hexdigest(), 16) % (2**32)

In [3]:
def process_image(args):
    """
    Applies a randomly selected damage class to an image and saves the damaged image and mask.

    The random choice is seeded from the image path, so the same path always
    selects the same entry of ``damage_classes``. Outputs are written to two
    flat sub-directories of the base output directory:
    ``generated-damaged-images`` and ``generated-damage-masks``.

    Args:
        args (tuple): (image_path, base_output_directory, damage_classes).
            Packed into one tuple so the function can be submitted to a
            process pool with a single argument.

    Returns:
        dict or None: Metadata dictionary (keys ``filename``, ``damage_type``,
        ``damaged_image``, ``damage_mask``) if successful, else None. Any
        exception is caught, reported on stdout and turned into None so one bad
        image does not abort the whole run.
    """
    image_path, base_output_directory, damage_classes = args
    original_image_filename = os.path.basename(image_path)
    image_identifier = Path(original_image_filename).stem

    try:
        # Deterministic randomness for reproducibility
        seed = get_deterministic_seed(image_path)
        random.seed(seed)
        # Also seed NumPy's global RNG in case the damage classes draw from it
        # (their implementation is not visible here); the seed is already
        # limited to 32 bits, which np.random.seed requires.
        np.random.seed(seed)

        # Close the file handle deterministically; convert() returns an
        # independent in-memory image that stays valid after the file closes.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # Support both classes and factories
        damage_class = random.choice(damage_classes)
        damage_instance = damage_class() if callable(damage_class) else damage_class

        # Apply damage
        damaged_image, damage_mask = damage_instance.apply(image)

        if damaged_image is None or damage_mask is None:
            print(f"[WARNING] {damage_instance.__class__.__name__} returned None for {original_image_filename}")
            return None

        # Use lowercase damage type name in filename
        damage_type_name = damage_instance.__class__.__name__
        damage_suffix = damage_type_name.lower().replace("damage", "")

        # Flat output directories
        damage_img_dir = os.path.join(base_output_directory, "generated-damaged-images")
        damage_mask_dir = os.path.join(base_output_directory, "generated-damage-masks")
        os.makedirs(damage_img_dir, exist_ok=True)
        os.makedirs(damage_mask_dir, exist_ok=True)

        # NOTE(review): the output name is built from the stem only, so two
        # sources that differ only by extension (a.jpg / a.png) and receive the
        # same damage type would overwrite each other.
        base_filename = f"{image_identifier}-{damage_suffix}"
        damaged_image_path = os.path.join(damage_img_dir, base_filename + ".png")
        damage_mask_path = os.path.join(damage_mask_dir, base_filename + "-mask.png")

        damaged_image.convert("RGB").save(damaged_image_path)
        damage_mask.save(damage_mask_path)

        return {
            "filename": original_image_filename,
            "damage_type": damage_type_name,
            "damaged_image": damaged_image_path,
            "damage_mask": damage_mask_path
        }

    except Exception as e:
        # Worker boundary: report and skip rather than crash the pool.
        print(f"[ERROR] {original_image_filename}: {e}")
        return None

In [4]:
def process_all_images(
    image_directory,
    base_output_directory,
    metadata_output_path,
    damage_classes,
    sample_size=None,
    max_workers=None,
    sample_seed=None
):
    """
    Applies a random damage class to all images in a directory using multiprocessing.

    Only regular files directly inside ``image_directory`` with a ``.jpg``,
    ``.jpeg`` or ``.png`` suffix (case-insensitive) are processed. Metadata of
    the successfully processed images is written as a JSON list, sorted by
    original filename so the file is stable across runs.

    Args:
        image_directory (str): Directory with source images.
        base_output_directory (str): Where to save processed images and masks.
        metadata_output_path (str): Path to output JSON metadata. Missing
            parent directories are created.
        damage_classes (list): List of class objects or callable factories.
        sample_size (int or None): Limit number of images processed. A falsy
            value (None or 0) means no limit.
        max_workers (int or None): Number of processes to spawn.
        sample_seed (int or None): Seed for choosing the random subset when
            ``sample_size`` is set. None (the default) uses the global
            ``random`` state, as before.
    """
    image_directory = Path(image_directory)
    allowed_suffixes = {".jpg", ".jpeg", ".png"}
    # is_file() skips directories that merely happen to carry an image suffix.
    image_paths = sorted(
        p for p in image_directory.glob("*")
        if p.is_file() and p.suffix.lower() in allowed_suffixes
    )

    if sample_size:
        # A dedicated generator makes the subset reproducible on request
        # without touching the global random state.
        rng = random if sample_seed is None else random.Random(sample_seed)
        rng.shuffle(image_paths)
        image_paths = image_paths[:sample_size]

    print(f"Found {len(image_paths)} images to process.")

    task_args = [(str(path), base_output_directory, damage_classes) for path in image_paths]
    results = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_image, args): args[0] for args in task_args}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="img"):
            try:
                result = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"[ERROR] Failed to process {futures[future]}: {e}")

    # as_completed yields futures in finish order, which varies between runs;
    # sort so the metadata file is deterministic.
    results.sort(key=lambda entry: entry["filename"])

    metadata_output_path = Path(metadata_output_path)
    metadata_output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(metadata_output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"\nDone. {len(results)} images processed successfully.")

In [5]:
def create_craquelure():
    """
    Build a CraquelureDamage instance that reads its crack masks from the
    project's crack-mask directory.

    Defined as a plain module-level function so that it stays pickleable when
    handed to worker processes.
    """
    crack_mask_directory = "../../data/crack-masks"
    return CraquelureDamage(crack_mask_dir=crack_mask_directory)

In [6]:
# Guard the entry point: on platforms that start worker processes with "spawn"
# the main module is re-imported in every worker, and an unguarded call would
# try to launch the whole pipeline again from inside each one.
if __name__ == "__main__":
    process_all_images(
        image_directory="../../data/img/",
        base_output_directory="../../data/v5-damaged-and-mask-dataset",
        metadata_output_path="../../data/v5-damage_metadata.json",
        # Each entry is called with no arguments to obtain a damage object;
        # create_craquelure is a factory because CraquelureDamage needs a
        # constructor argument.
        damage_classes=[
            ScratchDamage,
            WaterDiscolouration,
            create_craquelure,
            NoDamage
        ],
        sample_size=None,   # process every image found
        max_workers=None    # let ProcessPoolExecutor pick the worker count
    )

Found 34581 images to process.


Processing: 100%|██████████| 34581/34581 [16:08<00:00, 35.69img/s]



Done. 34581 images processed successfully.
