# Converting the image data to NumPy arrays

## Imports and configurations

In [1]:
from pathlib import Path
import os
import glob
import cv2
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
import json

## Utility functions (multiprocessing and conversion)

In [2]:
def _process_single_pair(args):
    """Convert one image/mask pair into a single ``.npz`` file.

    The arguments arrive packed in one tuple so the function can be mapped
    over a ``multiprocessing`` pool.

    Args:
        args: Tuple ``(img_file, image_dir, mask_dir, save_dir, image_size)``.
            ``img_file`` is a file name inside ``image_dir``; its mask is
            looked up in ``mask_dir`` as ``"<stem>-mask.png"``.
            ``image_size`` is forwarded to ``cv2.resize`` as ``dsize``,
            which OpenCV interprets as ``(width, height)``.

    Returns:
        ``True`` if ``<save_dir>/<stem>.npz`` was written. ``False`` if the
        mask file is missing, either file could not be decoded, or an
        exception was raised while converting/saving (a message is printed
        in each of these cases).

    The saved archive holds ``image`` (RGB, float32, scaled by 1/255) and
    ``mask`` (float32 with values 0.0/1.0 and a trailing channel axis).
    """
    img_file, image_dir, mask_dir, save_dir, image_size = args
    img_id = Path(img_file).stem
    mask_file = f"{img_id}-mask.png"

    img_path = os.path.join(image_dir, img_file)
    msk_path = os.path.join(mask_dir, mask_file)

    if not os.path.exists(msk_path):
        print(f"Missing mask for image: {img_file}")
        return False

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(img_path)
    mask = cv2.imread(msk_path, cv2.IMREAD_GRAYSCALE)

    if image is None:
        print(f"Failed to read image: {img_path}")
        return False
    if mask is None:
        print(f"Failed to read mask: {msk_path}")
        return False

    try:
        # OpenCV decodes to BGR; store RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, image_size).astype("float32") / 255.0

        # Nearest-neighbour keeps the mask's original label values. The
        # default bilinear resize blends foreground and background at the
        # edges, and the "> 0" threshold below would then turn every blended
        # pixel into foreground, growing the mask.
        mask = cv2.resize(mask, image_size, interpolation=cv2.INTER_NEAREST)
        mask = (mask > 0).astype("float32")
        mask = np.expand_dims(mask, axis=-1)

        npz_path = os.path.join(save_dir, f"{img_id}.npz")
        np.savez(npz_path, image=image, mask=mask)
    except Exception as e:
        # Best-effort batch conversion: report the failing pair and let the
        # caller count it as skipped instead of aborting the whole run.
        print(f"Error processing {img_id}: {e}")
        return False

    return True


def preprocess_and_save_npz(image_dir, mask_dir, save_dir, image_size=(320, 320)):
    """Convert every entry of ``image_dir`` to an ``.npz`` file in ``save_dir``.

    Each directory entry is handed to ``_process_single_pair`` in a process
    pool with one worker per CPU reported by ``mp.cpu_count()``. Progress is
    shown with ``tqdm`` and a two-line summary is printed at the end.

    Args:
        image_dir: Directory whose entries are treated as image file names.
        mask_dir: Directory searched for the matching mask files.
        save_dir: Output directory; created if it does not exist.
        image_size: Target size forwarded unchanged to the worker.
    """
    os.makedirs(save_dir, exist_ok=True)

    # One packed argument tuple per directory entry, in sorted name order.
    jobs = []
    for name in sorted(os.listdir(image_dir)):
        jobs.append((name, image_dir, mask_dir, save_dir, image_size))

    worker_count = mp.cpu_count()
    with mp.Pool(processes=worker_count) as pool:
        # imap yields results in submission order, so tqdm advances per pair.
        progress = tqdm(pool.imap(_process_single_pair, jobs), total=len(jobs))
        outcomes = list(progress)

    # The worker returns True for a saved pair and False otherwise.
    valid_pairs = sum(1 for ok in outcomes if ok)
    skipped_pairs = len(outcomes) - valid_pairs

    print(f"Saved {valid_pairs} valid image-mask pairs to {save_dir}")
    print(f"Skipped {skipped_pairs} invalid or missing pairs")

## Set directories

In [3]:
# Root of the split dataset, given relative to the notebook's working
# directory; the train/val/test image and mask folders are joined onto it.
DATA_DIR = "../../data/v6-split-dataset/"

In [4]:
# Every split keeps its images under "<split>/img" and its masks under
# "<split>/mask" below DATA_DIR.
x_train_dir, y_train_dir = (
    os.path.join(DATA_DIR, f"train/{part}") for part in ("img", "mask")
)

x_valid_dir, y_valid_dir = (
    os.path.join(DATA_DIR, f"val/{part}") for part in ("img", "mask")
)

x_test_dir, y_test_dir = (
    os.path.join(DATA_DIR, f"test/{part}") for part in ("img", "mask")
)

## Preprocess and save each split

In [5]:
# Convert the training split (images, then masks, then output folder).
preprocess_and_save_npz(
    x_train_dir,
    y_train_dir,
    save_dir="../../data/v6-preprocessed-npz/train",
)

100%|██████████| 24206/24206 [08:37<00:00, 46.82it/s]


Saved 24206 valid image-mask pairs to ../../data/v6-preprocessed-npz/train
Skipped 0 invalid or missing pairs


In [6]:
# Convert the validation split (images, then masks, then output folder).
preprocess_and_save_npz(
    x_valid_dir,
    y_valid_dir,
    save_dir="../../data/v6-preprocessed-npz/valid",
)

100%|██████████| 5187/5187 [01:43<00:00, 50.12it/s]

Saved 5187 valid image-mask pairs to ../../data/v6-preprocessed-npz/valid
Skipped 0 invalid or missing pairs





In [7]:
# Convert the test split (images, then masks, then output folder).
preprocess_and_save_npz(
    x_test_dir,
    y_test_dir,
    save_dir="../../data/v6-preprocessed-npz/test",
)

100%|██████████| 5187/5187 [01:44<00:00, 49.60it/s]

Saved 5187 valid image-mask pairs to ../../data/v6-preprocessed-npz/test
Skipped 0 invalid or missing pairs



