# Generating the Actual Dataset for the Damage Detection Phase

This script applies one randomly chosen damage effect (e.g., scratches, water discoloration, or craquelure) to each image in a dataset. The choice is seeded from the image path, so the same image receives the same damage type on every run. Multiprocessing spreads the work across multiple CPU cores, and both the damaged image and its corresponding damage mask are saved. A JSON metadata file is generated to track the original image, the damage type applied, and the output file paths.

In [1]:
import os
import sys
project_root = os.path.abspath("..")
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

import json
import random
import hashlib
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from PIL import Image, ImageDraw, ImageFilter
from tqdm import tqdm

# Import damage classes
from maskerada import ScratchDamage, WaterDiscolouration, CraquelureDamage

In [2]:
# --- Utility: Generate a reproducible seed per image path ---
def get_deterministic_seed(image_path):
    """
    Derive a stable integer seed from an image path.

    The path is hashed with SHA-256 and reduced to 32 bits, so the same
    path always maps to the same seed, regardless of run or worker process.

    Args:
        image_path (str): Full path to the image.

    Returns:
        int: Seed in the range [0, 2**32) derived from the image path.
    """
    path_digest = hashlib.sha256(image_path.encode()).digest()
    # The digest read as a big-endian integer, taken modulo 2**32, is
    # exactly its four least-significant (i.e. last) bytes.
    return int.from_bytes(path_digest[-4:], "big")

In [3]:
def process_image(args):
    """
    Processes a single image by applying one randomly selected damage type.
    Skips processing if output files already exist. Returns metadata if
    successful.

    The damage type is picked after seeding Python's ``random`` module with
    a value derived from the image path, so the same path always selects the
    same damage class.

    Args:
        args (tuple): Contains:
            - image_path (str): Path to the input image.
            - base_output_directory (str): Where to save results.
            - damage_classes (list): List of available damage classes.

    Returns:
        dict or None: Metadata dictionary if processed, None if skipped or
        errored. Errors are printed, never raised, so one bad image cannot
        take down the worker pool.
    """
    image_path, base_output_directory, damage_classes = args
    # Resolved before the try block so the error handler can always name the
    # image, even when the failure happens on the very first statement.
    original_image_filename = os.path.basename(image_path)

    try:
        # 32-bit seed derived from the path (same derivation as
        # get_deterministic_seed).
        seed = int(hashlib.sha256(image_path.encode())
                   .hexdigest(), 16) % (2**32)
        # Seeds the process-wide `random` module, not a private generator.
        # NOTE(review): only `random` is seeded here; if the damage classes
        # draw from another RNG (e.g. numpy's), their output is not pinned
        # by this seed -- confirm against the maskerada implementation.
        random.seed(seed)

        damage_class = random.choice(damage_classes)
        damage_type_name = damage_class.__name__

        image_identifier = os.path.splitext(original_image_filename)[0]
        # Replace anything non-alphanumeric so the class name is safe to
        # embed in a file name.
        damage_type_safe = "".join(
            c if c.isalnum() else "_" for c in damage_type_name
        )

        base_filename = f"{image_identifier}-{damage_type_safe}"
        damaged_output_dir = os.path.join(base_output_directory,
                                          "generated-damaged-images")
        damage_mask_dir = os.path.join(base_output_directory,
                                       "generated-damage-masks")

        damaged_image_path = os.path.join(damaged_output_dir,
                                          f"{base_filename}.png")
        mask_image_path = os.path.join(damage_mask_dir,
                                       f"{base_filename}-mask.png")

        if (os.path.exists(damaged_image_path)
                and os.path.exists(mask_image_path)):
            return None  # Skip if already processed

        # The context manager closes the underlying file as soon as the
        # pixel data has been converted, instead of leaving the handle open
        # until garbage collection.
        with Image.open(image_path) as source_image:
            original_image = source_image.convert("RGB")

        damage_instance = damage_class()
        damaged_image, damage_mask = damage_instance.apply(
            original_image.copy()
        )

        os.makedirs(damaged_output_dir, exist_ok=True)
        os.makedirs(damage_mask_dir, exist_ok=True)

        damaged_image.save(damaged_image_path)
        damage_mask.save(mask_image_path)

        return {
            "image_identifier": image_identifier,
            "original_image_filename": original_image_filename,
            "damage_type": damage_type_name,
            "original_image_path": image_path,
            "damaged_image_path": damaged_image_path,
            "mask_image_path": mask_image_path
        }

    except Exception as e:
        # Worker boundary: report and carry on with the remaining images.
        print(f"[ERROR] {original_image_filename}: {e}")
        return None


In [4]:
def _load_existing_metadata(metadata_output_path):
    """Return the entry dicts saved by an earlier run, or [] if unusable."""
    try:
        with open(metadata_output_path, encoding="utf-8") as f:
            previous = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or malformed file: start from scratch.
        return []
    if not isinstance(previous, list):
        return []
    return [entry for entry in previous if isinstance(entry, dict)]


def process_all_images(
    image_directory,
    base_output_directory,
    metadata_output_path,
    damage_classes,
    sample_size=None,
    max_workers=None
):
    """
    Applies one random damage type to each image in the directory using
    multiprocessing. Skips already-processed images and writes valid JSON
    array metadata at the end.

    Entries already present in the metadata file are kept and merged with
    the entries produced by this run (keyed by damaged-image path), so a
    rerun that skips every image does not wipe the earlier metadata.

    Args:
        image_directory (str): Folder of input images.
        base_output_directory (str): Where to save damaged images and masks.
        metadata_output_path (str): Path to the output .json metadata file.
        damage_classes (list): List of available damage class types.
        sample_size (int, optional): Limit number of images processed. A
            falsy value (None or 0) processes every image; a value larger
            than the number of images is clamped to that number.
        max_workers (int, optional): Max number of parallel workers.

    Returns:
        None
    """
    valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    # os.listdir returns names in arbitrary order; sorting makes the seeded
    # sample below pick the same images on every machine and run.
    image_paths = sorted(
        os.path.join(image_directory, fname)
        for fname in os.listdir(image_directory)
        if fname.lower().endswith(valid_extensions)
    )

    if sample_size:
        rng = random.Random(42)
        # random.sample raises ValueError when k exceeds the population.
        image_paths = rng.sample(
            image_paths, k=min(sample_size, len(image_paths))
        )

    all_args = [
        (image_path, base_output_directory, damage_classes)
        for image_path in image_paths
    ]

    metadata = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_image, args): args for args in all_args
        }
        for future in tqdm(as_completed(futures), total=len(futures),
                           desc="Processing"):
            try:
                result = future.result()
            except Exception as e:
                # A failing worker must not abort the run before the
                # metadata collected so far has been written.
                failed_path = futures[future][0]
                print(f"[ERROR] {os.path.basename(failed_path)}: {e}")
                continue
            if result is not None:
                metadata.append(result)

    # Merge with entries from earlier runs; entries from this run win.
    merged = {}
    previous_entries = _load_existing_metadata(metadata_output_path)
    for position, entry in enumerate(previous_entries + metadata):
        key = entry.get("damaged_image_path")
        if not isinstance(key, str):
            key = position  # keep entries that lack a usable path
        merged[key] = entry
    # as_completed yields in completion order; sort for a stable file.
    all_entries = sorted(
        merged.values(),
        key=lambda entry: str(entry.get("image_identifier", "")),
    )

    metadata_directory = os.path.dirname(metadata_output_path)
    if metadata_directory:
        os.makedirs(metadata_directory, exist_ok=True)

    with open(metadata_output_path, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, indent=2)

    print(f"\nFinished processing {len(metadata)} paintings.")
    print(f"Saved metadata to '{metadata_output_path}'.")


In [5]:
# Run the pipeline on a fixed-size sample of the full image folder.
process_all_images(
    # What to apply and how much work to do.
    damage_classes=[ScratchDamage, WaterDiscolouration, CraquelureDamage],
    sample_size=50,
    max_workers=None,
    # Where to read images from and where to write results.
    image_directory="../data/img/full",
    base_output_directory="../data/multiprocessing-sample-damaged-and-mask",
    metadata_output_path="../data/damage_metadata.json",
)

Processing: 100%|██████████| 50/50 [00:19<00:00,  2.53it/s]



Finished processing 50 paintings.
Saved metadata to '../data/damage_metadata.json'.
