# Apply Damage and Save Dataset
Run the function below to apply each damage to an image and save the damaged versions and their respective masks in their corresponding directories.

In [38]:
# Ensure src is in the import path
import os
import sys

# Parent of the current working directory is treated as the project root;
# the importable code lives in its `src` subfolder.
project_root = os.path.abspath("..")
src_path = os.path.join(project_root, "src")

# Append only when missing so that re-running the cell adds no duplicates.
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

In [39]:
import json
import random
import numpy as np
from PIL import Image, ImageDraw, ImageFilter

# Import damage classes
from maskerada import ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage

In [40]:
def apply_and_save_damage_set(
    original_image_pil,
    image_identifier,
    damage_applicator_instance,
    damage_type_name,
    base_output_directory
):
    """
    Apply one damage to a copy of an image, save the damaged image and its
    damage mask as PNG files, and return a metadata record for the outputs.

    The damaged image is written to ``<base>/generated-damaged-images`` and
    the mask to ``<base>/generated-damage-masks``; both folders are created
    when missing. Saving is best-effort: a failure is printed rather than
    raised, and is reported through the ``"saved"`` entry of the result.

    Args:
        original_image_pil: Source image. Only ``.copy()`` and the optional
            ``.filename`` attribute are used here (a PIL image, judging by
            the parameter name).
        image_identifier (str or int): Identifier used as the filename stem.
            A trailing known image extension (e.g. ``.jpg``) is stripped.
        damage_applicator_instance: Object whose ``apply(image)`` returns a
            ``(damaged_image, damage_mask)`` pair, each offering ``.save(path)``.
        damage_type_name (str): Damage label; every non-alphanumeric character
            is replaced by ``_`` before it is used in filenames and metadata.
        base_output_directory (str): Directory under which the two output
            subdirectories are created.

    Returns:
        dict: ``image_identifier``, ``damage_type`` (sanitized label),
        ``original_image_path`` (``.filename`` of the source image or None),
        ``damaged_image_path``, ``mask_image_path`` and ``saved`` (True only
        when both files were written without an exception).
    """
    print(f"Applying {damage_type_name} damage to '{image_identifier}'...")

    # Work on a copy so the caller's image is never modified by the applicator
    image_to_damage = original_image_pil.copy()
    damaged_image, damage_mask = damage_applicator_instance.apply(image_to_damage)

    # Define and create output subdirectories
    damaged_output_dir = os.path.join(
        base_output_directory, "generated-damaged-images"
    )
    damage_mask_dir = os.path.join(
        base_output_directory, "generated-damage-masks"
    )
    os.makedirs(damaged_output_dir, exist_ok=True)
    os.makedirs(damage_mask_dir, exist_ok=True)

    # Prepare filename: drop a known image extension from the identifier
    id_str = str(image_identifier)
    id_str_base, id_str_ext = os.path.splitext(id_str)
    if id_str_ext.lower() in (
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'
    ):
        id_str = id_str_base

    # Sanitized label kept in its own variable instead of overwriting the
    # `damage_type_name` parameter
    safe_damage_type_name = "".join(
        c if c.isalnum() else '_' for c in damage_type_name
    )
    base_filename = f"{id_str}-{safe_damage_type_name}"
    damaged_image_path = os.path.join(damaged_output_dir, f"{base_filename}.png")
    mask_image_path = os.path.join(damage_mask_dir, f"{base_filename}-mask.png")

    # Save damaged image and mask; record whether both writes succeeded so the
    # metadata does not silently point at files that were never written
    saved = True
    try:
        damaged_image.save(damaged_image_path)
        damage_mask.save(mask_image_path)
    except Exception as e:
        saved = False
        print(
            f"  Error saving images for {safe_damage_type_name} "
            f"on {image_identifier}: {e}"
        )

    return {
        "image_identifier": image_identifier,
        "damage_type": safe_damage_type_name,
        "original_image_path": getattr(original_image_pil, 'filename', None),
        "damaged_image_path": damaged_image_path,
        "mask_image_path": mask_image_path,
        "saved": saved,
    }


Note that the function above was modified from its original: it now returns a dictionary of metadata (including the output file paths) and writes to differently named subdirectories and files. The damage mask is still saved alongside the damaged image.

**Original Function:**
```
import random
import numpy as np
from PIL import Image, ImageDraw, ImageFilter
import os

def apply_and_save_damage_set(original_image_pil, image_identifier,
                              damage_applicator_instance, damage_type_name,
                              base_output_directory):
    """
    Applies a given damage to an image, saves the damaged image and its mask
    into respective subdirectories.

    Args:
        original_image_pil (PIL.Image): The original clean image.
        image_identifier (str or int): A unique identifier for the image (e.g., "Image_001", 
                                     a filename like "hash.jpg", or an index).
        damage_applicator_instance (object): An instance of a damage class (e.g., TearDamage()).
        damage_type_name (str): A string to identify the damage type in filenames (e.g., "Tears").
        base_output_directory (str): Base directory to save the output images.
                                     Subdirectories 'damaged-images' and 'masks' will be created here.
    """
    print(f"Applying {damage_type_name} damage to '{image_identifier}'...")
    
    image_to_damage = original_image_pil.copy() 
    damaged_image, damage_mask = damage_applicator_instance.apply(image_to_damage)

    # Define output subdirectories
    damaged_output_dir = os.path.join(base_output_directory, "damaged-images")
    mask_output_dir = os.path.join(base_output_directory, "masks")

    # Create subdirectories if they don't exist
    os.makedirs(damaged_output_dir, exist_ok=True)
    os.makedirs(mask_output_dir, exist_ok=True)

    # Prepare image identifier for filename (strip extension if it's a filename)
    id_str = str(image_identifier)
    id_str_base, id_str_ext = os.path.splitext(id_str)
    if id_str_ext.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']:
        id_str = id_str_base # Use only the base name without the extension
    
    # Sanitize damage_type_name for filename
    safe_damage_type_name = "".join(c if c.isalnum() else '_' for c in damage_type_name)
        
    base_filename = f"{id_str} {safe_damage_type_name}" 

    damaged_filename = f"{base_filename}.png"
    mask_filename = f"{base_filename}_mask.png"

    damaged_image_path = os.path.join(damaged_output_dir, damaged_filename)
    mask_image_path = os.path.join(mask_output_dir, mask_filename)

    # Save the images
    try:
        damaged_image.save(damaged_image_path)
        damage_mask.save(mask_image_path)
    except Exception as e:
        print(f"  Error saving images for {damage_type_name} on {image_identifier}: {e}")

```

## Test with a sample image

In [5]:
# Sample painting that every damage type is applied to
original_image = Image.open("../data/sample-image-canvas-oil-Le-Moulin-de-la-Galette-1876.png")

# One applicator instance per damage type
damage_types = [ScratchDamage(), WaterDiscolouration(), CraquelureDamage(), NoDamage()]

# Root folder that receives the damaged images and their masks
base_output_directory = "../data/sample-damaged-and-mask-data"

# Run each damage independently on the same source image and keep the
# metadata record returned for every variant
damage_metadata = []
for applicator in damage_types:
    damage_metadata.append(
        apply_and_save_damage_set(
            original_image_pil=original_image,
            image_identifier="your_image",
            damage_applicator_instance=applicator,
            damage_type_name=applicator.__class__.__name__,
            base_output_directory=base_output_directory,
        )
    )

# Write the collected records as a JSON array
with open("../data/damage_metadata.json", "w") as metadata_file:
    json.dump(damage_metadata, metadata_file, indent=2)


Applying ScratchDamage damage to 'your_image'...
Applying WaterDiscolouration damage to 'your_image'...
Applying CraquelureDamage damage to 'your_image'...
Applying NoDamage damage to 'your_image'...


## Generate 5 random damaged paintings from the dataset

This script applies a randomly chosen type of damage—such as scratches, water discoloration, or craquelure—to a fixed random sample of images from a directory. A set random seed ensures the same images are selected each time, while the damage effects remain varied. The script saves each damaged image to an output folder and logs metadata including the original filename, damage type, and file paths. All metadata is saved to a JSON file for easy tracking or later use in tasks like training or evaluating image restoration models.

In [5]:
# Define source and output directories
image_directory = "../data/img/full"
base_output_directory = "../data/sample-damaged-and-mask-data"
metadata_output_path = "../data/damage_metadata.json"

# Collect all image paths from the directory.
# os.listdir() returns names in arbitrary order, so they are sorted first;
# otherwise the seeded sample below could select different files on another
# machine or filesystem.
valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
image_paths = [
    os.path.join(image_directory, fname)
    for fname in sorted(os.listdir(image_directory))
    if fname.lower().endswith(valid_extensions)
]

# Set a separate seed for sampling (independent of the global `random` state)
sampling_rng = random.Random(42)

# Sample image paths reproducibly; clamp k so a folder with fewer than
# 5 images is processed in full instead of raising ValueError
image_paths = sampling_rng.sample(image_paths, k=min(5, len(image_paths)))

# Define damage classes to choose from
damage_classes = [ScratchDamage, WaterDiscolouration, CraquelureDamage]

# Metadata collector
damage_metadata = []

# Apply random damage to each image
for img_path in image_paths:
    try:
        # The context manager closes the underlying file once the image has
        # been processed
        with Image.open(img_path) as original_image:
            # Global (unseeded) RNG on purpose: the damage type varies per run
            damage_class = random.choice(damage_classes)
            damage_instance = damage_class()

            result = apply_and_save_damage_set(
                original_image_pil=original_image,
                image_identifier=os.path.splitext(os.path.basename(img_path))[0],
                damage_applicator_instance=damage_instance,
                damage_type_name=damage_class.__name__,
                base_output_directory=base_output_directory
            )

        damage_metadata.append(result)

    except Exception as e:
        # Best-effort: report the failing image and continue with the rest
        print(f"Error processing '{img_path}': {e}")

# Save metadata to JSON
with open(metadata_output_path, "w") as f:
    json.dump(damage_metadata, f, indent=2)


Applying CraquelureDamage damage to '2272e22dcc68d4482b697b4483f9b82f79da494b'...
Applying CraquelureDamage damage to '4ab128c6628cc70c24da63f736a14194a058b923'...
Applying ScratchDamage damage to 'd73cb8d6a2be216deb26aa3107defdb9e01620f7'...
Applying CraquelureDamage damage to 'ca7b1c5efc57c3c978be8365c54ab4e54b8e4723'...
Applying ScratchDamage damage to '43ecd8a9fe1d83d96ad40ab59ac44127cffd38f5'...


---

# Generating the Actual Dataset

**PLEASE RUN THIS IN THE SEPARATE NOTEBOOK PROVIDED: notebooks/generate-damage-mask-dataset.ipynb**

This script applies one random, reproducible damage effect (e.g., scratches, water discoloration, or craquelure) to each image in a dataset. It uses multiprocessing to speed up processing across multiple CPU cores, saving both the damaged image and its corresponding damage mask. A metadata file is generated to track the original image, damage type applied, and output file paths.

In [41]:
import os
import sys
# Parent of the current working directory is treated as the project root;
# the importable code lives in its `src` subfolder.
project_root = os.path.abspath("..")
src_path = os.path.join(project_root, "src")
# Append only when missing so that re-running the cell adds no duplicates.
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

import json
import random
import hashlib
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from PIL import Image, ImageDraw, ImageFilter
from tqdm import tqdm

# Import damage classes
from maskerada import ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage

In [42]:
# --- Utility: Generate a reproducible seed per image path ---
def get_deterministic_seed(image_path):
    """
    Derive a stable 32-bit seed from an image path.

    The path string is hashed with SHA-256 and the digest is reduced modulo
    2**32, so the same path yields the same seed in every run and in every
    worker process.

    Args:
        image_path (str): Path to the image; encoded with ``str.encode()``
            before hashing.

    Returns:
        int: Seed in the range ``0 <= seed < 2**32``.
    """
    digest = hashlib.sha256(image_path.encode()).digest()
    # Reading the digest as a big-endian integer equals int(hexdigest, 16);
    # masking the low 32 bits equals taking it modulo 2**32.
    return int.from_bytes(digest, "big") & 0xFFFFFFFF

In [43]:
def process_image(args):
    """
    Processes a single image by applying one randomly selected damage type.
    Skips processing if output files already exist. Returns metadata if successful.

    The global ``random`` module is seeded with a SHA-256-derived value of the
    image path before the damage class is chosen, so the choice is the same
    for a given path on every run. Any exception is caught, reported on
    stdout and turned into a ``None`` result so that one bad image does not
    abort a batch.

    Args:
        args (tuple): Contains:
            - image_path (str): Path to the input image.
            - base_output_directory (str): Where to save results. Outputs go to
              the ``generated-damaged-images-v2`` and
              ``generated-damage-masks-v2`` subdirectories.
            - damage_classes (list): List of available damage classes.

    Returns:
        dict or None: Metadata dictionary if processed, None if skipped or errored.
    """
    image_path, base_output_directory, damage_classes = args

    # Fallback label for the error message: the handler below must never
    # reference a name that is still unbound when an early step fails.
    original_image_filename = str(image_path)

    try:
        original_image_filename = os.path.basename(image_path)
        image_identifier = os.path.splitext(original_image_filename)[0]

        # Path-derived seed: reproducible per image, independent of worker order.
        # NOTE(review): presumably the damage classes draw from the global
        # `random` module as well - confirm in maskerada.
        seed = int(hashlib.sha256(image_path.encode()).hexdigest(), 16) % (2**32)
        random.seed(seed)

        damage_class = random.choice(damage_classes)
        damage_type_name = damage_class.__name__
        damage_type_safe = "".join(c if c.isalnum() else "_" for c in damage_type_name)

        base_filename = f"{image_identifier}-{damage_type_safe}"
        damaged_output_dir = os.path.join(base_output_directory, "generated-damaged-images-v2")
        damage_mask_dir = os.path.join(base_output_directory, "generated-damage-masks-v2")

        damaged_image_path = os.path.join(damaged_output_dir, f"{base_filename}.png")
        mask_image_path = os.path.join(damage_mask_dir, f"{base_filename}-mask.png")

        if os.path.exists(damaged_image_path) and os.path.exists(mask_image_path):
            return None  # Skip if already processed

        # Close the source file as soon as the RGB copy has been made
        with Image.open(image_path) as source_image:
            original_image = source_image.convert("RGB")

        damage_instance = damage_class()
        damaged_image, damage_mask = damage_instance.apply(original_image.copy())

        os.makedirs(damaged_output_dir, exist_ok=True)
        os.makedirs(damage_mask_dir, exist_ok=True)

        damaged_image.save(damaged_image_path)
        damage_mask.save(mask_image_path)

        return {
            "image_identifier": image_identifier,
            "original_image_filename": original_image_filename,
            "damage_type": damage_type_name,
            "original_image_path": image_path,
            "damaged_image_path": damaged_image_path,
            "mask_image_path": mask_image_path
        }

    except Exception as e:
        print(f"[ERROR] {original_image_filename}: {e}")
        return None


In [32]:
# def process_all_images(
#     image_directory,
#     base_output_directory,
#     metadata_output_path,
#     damage_classes,
#     sample_size=None,
#     max_workers=None
# ):
#     """
#     Applies one random damage type to each image in the directory using multiprocessing.
#     Skips already-processed images and writes valid JSON array metadata at the end.

#     Args:
#         image_directory (str): Folder of input images.
#         base_output_directory (str): Where to save damaged images and masks.
#         metadata_output_path (str): Path to the output .json metadata file.
#         damage_classes (list): List of available damage class types.
#         sample_size (int, optional): Limit number of images processed.
#         max_workers (int, optional): Max number of parallel workers.

#     Returns:
#         None
#     """
#     valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
#     image_paths = [
#         os.path.join(image_directory, fname)
#         for fname in os.listdir(image_directory)
#         if fname.lower().endswith(valid_extensions)
#     ]

#     if sample_size:
#         rng = random.Random(42)
#         image_paths = rng.sample(image_paths, k=sample_size)

#     all_args = [
#         (image_path, base_output_directory, damage_classes)
#         for image_path in image_paths
#     ]

#     metadata = []

#     with ProcessPoolExecutor(max_workers=max_workers) as executor:
#         futures = {executor.submit(process_image, args): args for args in all_args}
#         for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
#             result = future.result()
#             if result is not None:
#                 metadata.append(result)

#     with open(metadata_output_path, "w") as f:
#         json.dump(metadata, f, indent=2)

#     print(f"\nSaved {len(metadata)} entries to '{metadata_output_path}'.")


In [24]:
# process_all_images(
#     image_directory="../../data/img/full",
#     base_output_directory="../../data/sample-damaged-and-mask-multiprocessing-v2",
#     metadata_output_path="../../data/damage_metadata_v2.json",
#     damage_classes=[ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage],
#     sample_size=None,
#     max_workers=4
# )

### Revised Function
The `process_all_images()` function below is revised so that each image receives 1 to 2 randomly chosen damage types, each saved as a separate variant.

In [44]:
def process_all_images(
    image_directory,
    base_output_directory,
    metadata_output_path,
    damage_classes,
    sample_size=None,
    max_workers=None
):
    """
    Applies 1 to 2 randomly chosen damage types (including NoDamage) per image using multiprocessing.
    Each image may generate multiple variants. Metadata is saved in a JSON file.

    Image names are sorted before any random selection, so the seeded sampling
    and the per-image damage assignment do not depend on the order in which
    the filesystem lists the directory. Jobs whose result is ``None`` are left
    out of the metadata, and the metadata file is rewritten from scratch on
    every call.

    Args:
        image_directory (str): Folder of input images.
        base_output_directory (str): Where to save damaged images and masks.
        metadata_output_path (str): Path to the output .json metadata file.
        damage_classes (list): List of available damage class types (e.g., [Scratch, Water, NoDamage]).
        sample_size (int, optional): Number of images to process. If None, use all.
            Values larger than the number of images found are clamped.
        max_workers (int, optional): Max number of parallel workers.

    Returns:
        None
    """
    from concurrent.futures import ProcessPoolExecutor, as_completed
    import json
    from tqdm import tqdm

    valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # seeded selections below identical across machines and runs.
    image_paths = [
        os.path.join(image_directory, fname)
        for fname in sorted(os.listdir(image_directory))
        if fname.lower().endswith(valid_extensions)
    ]

    if sample_size:
        rng = random.Random(42)
        # Clamp so that asking for more images than exist processes them all
        # instead of raising ValueError
        image_paths = rng.sample(image_paths, k=min(sample_size, len(image_paths)))

    all_args = []
    rng = random.Random(42)

    for image_path in image_paths:
        # One or two variants per image, never more than the classes available
        num_variants = min(rng.choice([1, 2]), len(damage_classes))
        selected_damages = rng.sample(damage_classes, k=num_variants)
        for damage_cls in selected_damages:
            # A single-element class list pins the worker's choice to this damage
            all_args.append((image_path, base_output_directory, [damage_cls]))

    metadata = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_image, args): args for args in all_args}
        # Results are collected in completion order, not submission order
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            result = future.result()
            if result is not None:
                metadata.append(result)

    with open(metadata_output_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print(f"\nSaved {len(metadata)} image variants to '{metadata_output_path}'.")

In [None]:
# Generate the dataset with the revised pipeline.
# sample_size=None processes every image found in the input folder, and
# max_workers=None is forwarded unchanged to ProcessPoolExecutor, which then
# picks its own default worker count.
process_all_images(
    image_directory="../../data/img/full",
    base_output_directory="../../data/sample-damaged-and-mask-multiprocessing-v2",
    metadata_output_path="../../data/damage_metadata_v2.json",
    damage_classes=[ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage],
    sample_size=None,
    max_workers=None
)