## **Environment Configuration**
Imports all required libraries. The paths (`base_dir`, `input_dir`, `out_base`) and the random seeds are set in the final cell; adjust them there to match your project.

In [6]:
import os
import io
import glob
import random
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

## **Image corruption helpers (noise, low-res, grayscale, inpainting)**

Defines functions to:

* Add Gaussian noise (denoising task)

* Make a low-resolution version (super-resolution task)

* Convert to grayscale (colorization task) — note: no grayscale helper is defined in the cell below

* Apply random white rectangles + generate a binary mask (inpainting)

In [None]:
# ============== helpers for image ops ==============
def save_image(img: Image.Image, path: Path):
    """Write ``img`` to ``path``, creating any missing parent directories."""
    destination_dir = path.parent
    # exist_ok keeps repeated saves into the same folder from raising.
    destination_dir.mkdir(exist_ok=True, parents=True)
    img.save(path)


def add_gaussian_noise(img: Image.Image, sigma: float = 0.1) -> Image.Image:
    """Return a copy of ``img`` corrupted with additive Gaussian noise.

    Pixel values are scaled to [0, 1], zero-mean noise with standard
    deviation ``sigma`` (expressed on that [0, 1] scale) is added, and the
    sum is clipped back to [0, 1] before conversion to 8-bit.

    The noise is drawn from NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        img: Source image; it is not modified.
        sigma: Standard deviation of the noise on the [0, 1] scale.

    Returns:
        A new 8-bit image with the same array shape as ``img``.
    """
    arr = np.array(img).astype(np.float32) / 255.0
    noise = np.random.normal(0.0, sigma, arr.shape).astype(np.float32)
    noisy = np.clip(arr + noise, 0.0, 1.0)
    # Round to the nearest level before casting: a bare astype(uint8)
    # truncates, which biases every pixel downward and turns float32
    # round-off such as 254.99998 into 254.
    noisy_uint8 = np.rint(noisy * 255.0).astype(np.uint8)
    return Image.fromarray(noisy_uint8)


def make_low_resolution(img: Image.Image, scale: int = 4) -> Image.Image:
    """Simulate a low-resolution version of ``img`` at its original size.

    Each side is floor-divided by ``scale`` (never going below one pixel),
    then the shrunken image is stretched back to the original dimensions.
    Both resizes use bicubic resampling.
    """
    full_size = img.size
    reduced_size = tuple(max(1, side // scale) for side in full_size)
    shrunk = img.resize(reduced_size, Image.BICUBIC)
    return shrunk.resize(full_size, Image.BICUBIC)


def create_random_box_mask(
    img: Image.Image,
    num_boxes: int = 3,
    box_size_range=(0.1, 0.4),
) -> Image.Image:
    """
    Build an 8-bit single-channel mask the same size as ``img``.

    ``num_boxes`` rectangles are painted white (255, "inpaint here") on a
    black (0) background. Each rectangle's width and height are a uniform
    random fraction, taken from ``box_size_range``, of the image's width
    and height. Rectangles may overlap. Randomness comes from the global
    ``random`` module.
    """
    width, height = img.size
    canvas = np.zeros((height, width), dtype=np.uint8)

    for _box in range(num_boxes):
        # Draw order (width, height, x, y) is kept fixed so that seeded
        # runs reproduce the same masks.
        rect_w = int(random.uniform(*box_size_range) * width)
        rect_h = int(random.uniform(*box_size_range) * height)

        left = random.randint(0, max(0, width - rect_w))
        top = random.randint(0, max(0, height - rect_h))

        # Clamp the far edges in case the rectangle is larger than the image.
        right = min(width, left + rect_w)
        bottom = min(height, top + rect_h)

        canvas[top:bottom, left:right] = 255

    return Image.fromarray(canvas, mode="L")


def apply_mask_to_image(
    img: Image.Image,
    mask: Image.Image,
    fill_value=(255, 255, 255),
) -> Image.Image:
    """Return a 'damaged' copy of ``img`` with masked pixels overwritten.

    Args:
        img: Source image; it is not modified.
        mask: Mask image. Pixels equal to 255 mark the region to overwrite.
            If the mask has several channels, only the first one is used.
        fill_value: Colour written into the masked region (cast to uint8).

    Returns:
        A new image built from the modified pixel array.
    """
    # np.array() already returns a fresh array, so no further .copy()
    # is needed to keep ``img`` untouched.
    damaged = np.array(img)
    # The mask is only read, so a view is sufficient.
    mask_arr = np.asarray(mask)

    if mask_arr.ndim == 3:
        mask_arr = mask_arr[..., 0]

    damaged[mask_arr == 255] = np.array(fill_value, dtype=np.uint8)
    return Image.fromarray(damaged)


# Convert common parquet schema format to png

Defines load_image_from_row(row) to match your parquet schema.

Supported Schemas:

* image_bytes column (raw bytes)

* image_path column (string path)

* image column holding a PIL image, raw bytes, a string path, or a Hugging Face-style dict with `bytes` / `path` keys

In [8]:
def _image_from_bytes(data) -> Image.Image:
    """Decode an in-memory encoded image and return it converted to RGB."""
    if not isinstance(data, (bytes, bytearray)):
        # Normalise any other value through bytes(...) before decoding.
        data = bytes(data)
    with Image.open(io.BytesIO(data)) as im:
        return im.convert("RGB")


def _image_from_path(path) -> Image.Image:
    """Open the image file at ``path`` and return it converted to RGB."""
    # The context manager closes the underlying file as soon as the RGB
    # copy exists, instead of leaving that to garbage collection.
    with Image.open(path) as im:
        return im.convert("RGB")


def load_image_from_row(row) -> Image.Image:
    """
    Try to reconstruct an RGB image from a parquet row.

    Sources are tried in this order; the first one that matches wins:

    - Hugging Face style 'image' column holding one of:
        * PIL.Image.Image
        * dict with non-None 'bytes', else a truthy 'path'
        * raw bytes / bytearray
        * str path
    - Explicit 'image_bytes' column (non-None)
    - Explicit 'image_path' column (truthy)

    An 'image' value of any other kind is ignored and the explicit
    columns are tried instead.

    Args:
        row: Mapping-like row that supports ``in`` and ``[]`` lookups by
            column name.

    Returns:
        The image converted to RGB mode.

    Raises:
        ValueError: If none of the recognised fields yields an image.
    """
    # --- 1) Hugging Face-style 'image' column ---
    if "image" in row:
        val = row["image"]

        # Already a PIL image
        if isinstance(val, Image.Image):
            return val.convert("RGB")

        # Dict with 'bytes' or 'path'; 'bytes' takes precedence
        if isinstance(val, dict):
            if val.get("bytes") is not None:
                return _image_from_bytes(val["bytes"])
            if val.get("path"):
                return _image_from_path(val["path"])

        # Raw bytes
        if isinstance(val, (bytes, bytearray)):
            return _image_from_bytes(val)

        # String path
        if isinstance(val, str):
            return _image_from_path(val)

    # --- 2) Explicit 'image_bytes' column ---
    if "image_bytes" in row and row["image_bytes"] is not None:
        return _image_from_bytes(row["image_bytes"])

    # --- 3) Explicit 'image_path' column ---
    if "image_path" in row and row["image_path"]:
        return _image_from_path(row["image_path"])

    # If we got here, schema doesn't match any of the above
    raise ValueError(
        "Row does not contain a recognizable image field "
        "('image', 'image_bytes', or 'image_path')."
    )


In [9]:
def process_parquet_file(
    parquet_path: Path,
    out_base: Path,
):
    """Expand one parquet shard into original and corrupted PNG files.

    For every row whose image can be loaded, five files sharing one name
    (``<shard stem>_<row index, zero-padded to 6>.png``) are written under
    ``out_base``:

    - ``original/``               the RGB source image
    - ``denoising/input/``        Gaussian-noise version (sigma=0.1)
    - ``super_resolution/input/`` down/up-sampled version (scale=4)
    - ``inpainting/input/``       image with white boxes painted in
    - ``inpainting/mask/``        the matching box mask

    Rows that fail to load are reported with a warning and skipped.
    """
    print(f"Processing {parquet_path}...")
    frame = pd.read_parquet(parquet_path)

    # Output folders, created up front in a fixed order.
    folders = {
        "original": out_base / "original",
        "noisy": out_base / "denoising" / "input",
        "low_res": out_base / "super_resolution" / "input",
        "damaged": out_base / "inpainting" / "input",
        "mask": out_base / "inpainting" / "mask",
    }
    for folder in folders.values():
        folder.mkdir(parents=True, exist_ok=True)

    shard = parquet_path.stem  # e.g. "test-00000-of-00028"

    for idx, row in frame.iterrows():
        # Shard name + row index gives the file name for all five outputs.
        fname = f"{shard}_{idx:06d}.png"

        try:
            source = load_image_from_row(row)
        except Exception as e:
            print(f"  [WARN] Skipping row {idx}: {e}")
            continue

        # Untouched reference image
        save_image(source, folders["original"] / fname)

        # Denoising input
        save_image(add_gaussian_noise(source, sigma=0.1), folders["noisy"] / fname)

        # Super-resolution input
        save_image(make_low_resolution(source, scale=4), folders["low_res"] / fname)

        # Inpainting input and its mask
        box_mask = create_random_box_mask(source, num_boxes=3)
        holed = apply_mask_to_image(source, box_mask, fill_value=(255, 255, 255))
        save_image(holed, folders["damaged"] / fname)
        save_image(box_mask, folders["mask"] / fname)

    print(f"Finished {parquet_path}.")

In [None]:
# Seed both random sources used by the helpers so repeated runs produce
# the same noise and the same mask boxes.
random.seed(42)
np.random.seed(42)

# Project layout: shards are read from <base>/input, results go to <base>/prepared.
base_dir = Path(".")          # change if needed
input_dir, out_base = base_dir / "input", base_dir / "prepared"

# Sorted so shards are processed in a stable order.
parquet_files = sorted(glob.glob(str(input_dir.joinpath("*.parquet"))))
if parquet_files:
    for pf in parquet_files:
        process_parquet_file(Path(pf), out_base)

    print("All parquet files processed.")
else:
    print(f"No parquet files found in {input_dir}")

Processing input\test-00000-of-00028.parquet...
Finished input\test-00000-of-00028.parquet.
Processing input\test-00001-of-00028.parquet...
