# Converting the image data to tensors (saved as NumPy `.npz` arrays)

## Imports and configurations

In [1]:
import os
import cv2
import numpy as np
from tqdm import tqdm
from pathlib import Path
import multiprocessing as mp

## Utility functions (multiprocessing and conversion)

In [2]:
def _process_single_pair(args):
    img_file, image_dir, mask_dir, save_dir, image_size = args
    img_id = Path(img_file).stem
    mask_file = f"{img_id}-mask.png"

    img_path = os.path.join(image_dir, img_file)
    msk_path = os.path.join(mask_dir, mask_file)

    if not os.path.exists(msk_path):
        return False

    image = cv2.imread(img_path)
    mask = cv2.imread(msk_path, cv2.IMREAD_GRAYSCALE)

    if image is None or mask is None or np.max(mask) == 0:
        return False

    try:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, image_size).astype("float32") / 255.0

        mask = cv2.resize(mask, image_size)
        mask = np.where(mask > 0, 1, 0).astype("float32")
        mask = np.expand_dims(mask, axis=-1)

        npz_path = os.path.join(save_dir, f"{img_id}.npz")
        np.savez(npz_path, image=image, mask=mask)
    except Exception:
        return False

    return True

def preprocess_and_save_npz(image_dir, mask_dir, save_dir, image_size=(320, 320)):
    """Convert every entry of ``image_dir`` to an ``.npz`` file in parallel.

    Each directory entry (in sorted name order) is handed to
    ``_process_single_pair`` through a process pool with one worker per CPU
    reported by ``mp.cpu_count()``. A progress bar tracks the results and a
    two-line summary is printed at the end.

    Args:
        image_dir: Directory whose entries are treated as image files.
        mask_dir: Directory searched for the matching mask files.
        save_dir: Output directory; created if it does not exist.
        image_size: Target size forwarded to the worker for resizing.

    Returns:
        None. The outcome is reported via ``print``.
    """
    os.makedirs(save_dir, exist_ok=True)

    # One work item per directory entry, in sorted name order.
    tasks = []
    for entry in sorted(os.listdir(image_dir)):
        tasks.append((entry, image_dir, mask_dir, save_dir, image_size))

    worker_count = mp.cpu_count()
    with mp.Pool(processes=worker_count) as pool:
        # imap is lazy, so the results must be drained while the pool is open.
        outcomes = pool.imap(_process_single_pair, tasks)
        results = list(tqdm(outcomes, total=len(tasks)))

    # The worker returns True for a saved pair and False for a skipped one.
    valid_pairs = sum(1 for saved in results if saved)
    skipped_pairs = len(results) - valid_pairs

    print(f"Saved {valid_pairs} valid image-mask pairs to {save_dir}")
    print(f"Skipped {skipped_pairs} invalid or missing pairs")


## Set directories

In [3]:
# Root folder of the dataset, given relative to the current working directory.
DATA_DIR = "../../data/damaged-and-mask-dataset/"

In [4]:
# Image (x_*) and mask (y_*) folders for each split, all located under DATA_DIR.

# Training split
x_train_dir = os.path.join(DATA_DIR, "img-with-val/train/")
y_train_dir = os.path.join(DATA_DIR, "mask-with-val/train/")

# Validation split
x_valid_dir = os.path.join(DATA_DIR, "img-with-val/val/")
y_valid_dir = os.path.join(DATA_DIR, "mask-with-val/val/")

# Test split
x_test_dir = os.path.join(DATA_DIR, "img-with-val/test/")
y_test_dir = os.path.join(DATA_DIR, "mask-with-val/test/")

## Preprocessing and save loop

In [5]:
# Train set: convert the training image/mask pairs to .npz files, using the
# function's default image_size of (320, 320).
preprocess_and_save_npz(
    image_dir=x_train_dir,
    mask_dir=y_train_dir,
    save_dir="../../data/preprocessed-npz/train"
)

100%|██████████| 24205/24205 [05:01<00:00, 80.18it/s] 

Saved 24205 valid image-mask pairs to ../../data/preprocessed-npz/train
Skipped 0 invalid or missing pairs





In [6]:
# Validation set: convert the validation image/mask pairs to .npz files, using
# the function's default image_size of (320, 320).
preprocess_and_save_npz(
    image_dir=x_valid_dir,
    mask_dir=y_valid_dir,
    save_dir="../../data/preprocessed-npz/valid"
)

100%|██████████| 5186/5186 [00:25<00:00, 201.32it/s]


Saved 5186 valid image-mask pairs to ../../data/preprocessed-npz/valid
Skipped 0 invalid or missing pairs


In [7]:
# Test set: convert the test image/mask pairs to .npz files, using the
# function's default image_size of (320, 320).
preprocess_and_save_npz(
    image_dir=x_test_dir,
    mask_dir=y_test_dir,
    save_dir="../../data/preprocessed-npz/test"
)

100%|██████████| 5188/5188 [00:26<00:00, 194.57it/s]

Saved 5188 valid image-mask pairs to ../../data/preprocessed-npz/test
Skipped 0 invalid or missing pairs



