In [None]:
import os
import cv2
import numpy as np

In [None]:
# --- Dataset locations -------------------------------------------------------
# Placeholder paths: point these at the local image / mask folders before running.
# Images and masks are paired by filename stem (extension is ignored).
imgs_train_path = "/path/to/OCDC/train/images"
lbls_train_path = "/path/to/OCDC/train/masks"

imgs_test_path  = "/path/to/OCDC/test/images"
lbls_test_path  = "/path/to/OCDC/test/masks"

# Directory that receives the saved .npy arrays (created on demand).
OUT_DIR         = "./preprocessed_np"
# Passed to cv2.resize as `dsize`, which is ordered (width, height); the
# resulting arrays are therefore (height, width) = (TARGET_SIZE[1], TARGET_SIZE[0]).
TARGET_SIZE     = (512, 512)

In [None]:
# File extensions treated as images. Entries must be lower-case with a leading
# dot, because list_images() compares them against the lower-cased extension.
VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}

def norm_image(x_bgr: np.ndarray) -> np.ndarray:
    """Linearly rescale an image to the range [0, 1] and return it as float32.

    The scaling is min-max based (``cv2.NORM_MINMAX``) and computed from the
    input array itself, so each image is normalised independently.
    """
    lower, upper = 0, 1
    scaled = cv2.normalize(
        x_bgr,
        None,
        alpha=lower,
        beta=upper,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_32F,
    )
    return scaled

def list_images(folder: str):
    """Return the sorted names of image files found directly inside ``folder``.

    An entry counts as an image when its extension, lower-cased, is a member
    of ``VALID_EXTS``. Only bare file names are returned, not full paths.
    """
    return sorted(
        name
        for name in os.listdir(folder)
        if os.path.splitext(name)[1].lower() in VALID_EXTS
    )

def build_mask_lookup(mask_dir: str):
    """Map each mask's filename stem to its full path inside ``mask_dir``.

    Keying by stem lets an image be matched to its mask even when the two
    files use different extensions. If several masks share a stem, the one
    listed last by ``list_images`` is the one kept.
    """
    return {
        os.path.splitext(mask_name)[0]: os.path.join(mask_dir, mask_name)
        for mask_name in list_images(mask_dir)
    }

def load_pairs(img_dir: str, mask_dir: str):
    """
    Load (image, mask) pairs matched by filename stem.

    Every image file in ``img_dir`` is paired with the file in ``mask_dir``
    that has the same stem (the extension may differ). An image is skipped,
    and counted, when it has no matching mask or when either file of the pair
    cannot be decoded by ``cv2.imread``. A single warning with the number of
    skipped files is printed after the scan.

    Each pair is resized to ``TARGET_SIZE``. That tuple is handed to
    ``cv2.resize`` as ``dsize``, which is ordered (width, height), so the
    output arrays have H = TARGET_SIZE[1] and W = TARGET_SIZE[0]. Images are
    min-max normalised per image by ``norm_image`` and keep OpenCV's BGR
    channel order; masks are binarised so that every non-zero pixel becomes 1.

    Returns
    -------
    x : np.ndarray
        float32 array of shape (N, H, W, 3) with values in [0, 1].
    y : np.ndarray
        float32 array of shape (N, H, W, 1) with values in {0, 1}.

    Raises
    ------
    ValueError
        If no valid (image, mask) pair could be loaded from the directories.
    """
    img_files = list_images(img_dir)
    mask_lookup = build_mask_lookup(mask_dir)

    x_list, y_list = [], []
    missing = 0

    for img_name in img_files:
        stem, _ = os.path.splitext(img_name)
        if stem not in mask_lookup:
            missing += 1
            continue

        # read image (BGR), mask (grayscale)
        img_path = os.path.join(img_dir, img_name)
        msk_path = mask_lookup[stem]

        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
        msk = cv2.imread(msk_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals an unreadable/corrupt file by returning None
        if img is None or msk is None:
            missing += 1
            continue

        # resize; nearest-neighbour for the mask so no new label values appear
        img = cv2.resize(img, TARGET_SIZE, interpolation=cv2.INTER_AREA)
        msk = cv2.resize(msk, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        # normalize image to [0,1]
        img = norm_image(img)                       # (H,W,3) float32 in [0,1]

        # binarize mask to {0,1}
        # handles 0/255 or any thresholded label map
        msk = (msk >= 1).astype(np.float32)         # (H,W)
        msk = np.expand_dims(msk, axis=-1)          # (H,W,1)

        x_list.append(img)
        y_list.append(msk)

    if missing > 0:
        print(f"Warning: skipped {missing} files with missing/corrupt masks/images.")

    # np.stack([]) fails with an opaque "need at least one array to stack";
    # raise the same exception type with a message that names the inputs.
    if not x_list:
        raise ValueError(
            f"No valid image/mask pairs found in {img_dir!r} and {mask_dir!r}."
        )

    # The per-sample arrays are already float32, so copy=False keeps the dtype
    # guarantee without duplicating the whole dataset in memory.
    x = np.stack(x_list, axis=0).astype(np.float32, copy=False)  # (N,H,W,3)
    y = np.stack(y_list, axis=0).astype(np.float32, copy=False)  # (N,H,W,1)
    return x, y


In [None]:
if __name__ == "__main__":
    # Make sure the output directory exists before anything is written to it.
    os.makedirs(OUT_DIR, exist_ok=True)

    print("Loading TRAIN…")
    x_train, y_train = load_pairs(imgs_train_path, lbls_train_path)
    print("Loading TEST…")
    x_test, y_test = load_pairs(imgs_test_path, lbls_test_path)

    # Quick sanity report: shape, dtype and value range / distinct labels.
    print("Shapes:")
    print(f"  x_train: {x_train.shape} {x_train.dtype} [{x_train.min():.3f},{x_train.max():.3f}]")
    print(f"  y_train: {y_train.shape} {y_train.dtype} unique={np.unique(y_train).tolist()}")
    print(f"  x_test : {x_test.shape} {x_test.dtype} [{x_test.min():.3f},{x_test.max():.3f}]")
    print(f"  y_test : {y_test.shape} {y_test.dtype} unique={np.unique(y_test).tolist()}")

    # Write each array to its own .npy file inside OUT_DIR.
    for _fname, _arr in (
        ("x_train.npy", x_train),
        ("y_train.npy", y_train),
        ("x_test.npy", x_test),
        ("y_test.npy", y_test),
    ):
        np.save(os.path.join(OUT_DIR, _fname), _arr)
    # Drop the loop temporaries so the cell leaves no extra names behind.
    del _fname, _arr

    print(f"Saved arrays to: {OUT_DIR}")