In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import SimpleITK as sitk
from tqdm import tqdm
import joblib

In [None]:
# --- Dataset locations (relative to the notebook's working directory) ---
LIDC_PATH = Path(r"../dataset/LIDC-IDRI")        # root folder searched for DICOM series
ANNOT_PATH = Path(r"../dataset/annotations.csv")  # nodule annotation table
OUTPUT_PATH = Path(r"../dataset")                 # extracted patches / masks go below this

# Ensure both output folders exist before anything is written to them.
OUTPUT_PATH.joinpath("images").mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.joinpath("masks").mkdir(parents=True, exist_ok=True)

# --- Patch geometry ---
# NOTE(review): PATCH_SIZE_MM is not referenced by any of the visible cells.
PATCH_SIZE_MM = 64
# Edge length of the cubic patch, passed as `out_size` (a voxel count) downstream.
PATCH_SIZE = 64

In [None]:
def load_dicom_series_by_uid(root_path, series_uid):
    """Find and load a DICOM series using its SeriesInstanceUID.

    Walks ``root_path`` recursively and reads the first directory whose name
    contains ``series_uid`` *and* in which GDCM finds series files.

    Parameters
    ----------
    root_path : str or pathlib.Path
        Directory tree to search.
    series_uid : str
        UID (or UID fragment) that must appear in the directory name.

    Returns
    -------
    SimpleITK.Image or None
        The loaded volume, or ``None`` when no matching directory containing
        a readable series exists.
    """
    series_uid = str(series_uid)
    for candidate in Path(root_path).rglob('*'):
        if not (candidate.is_dir() and series_uid in candidate.name):
            continue

        reader = sitk.ImageSeriesReader()
        dicom_files = reader.GetGDCMSeriesFileNames(str(candidate))
        if not dicom_files:
            # The name matched but the folder holds no DICOM slices directly
            # (e.g. it is a parent folder). Calling Execute() with an empty
            # file list raises "File names information is empty", so keep
            # searching instead of aborting on the first name match.
            continue

        reader.SetFileNames(dicom_files)
        return reader.Execute()
    return None

def world_to_voxel(world_coord, image):
    """Map a physical point (x, y, z) to continuous voxel indices (z, y, x).

    The point is shifted by the image origin, rotated back through the inverse
    of the 3x3 direction matrix and divided by the per-axis spacing. The result
    is then reversed so it follows the (z, y, x) axis order of arrays returned
    by ``sitk.GetArrayFromImage``.

    Parameters
    ----------
    world_coord : sequence of 3 numbers
        Physical coordinate in (x, y, z) order.
    image : object exposing ``GetOrigin``, ``GetSpacing`` and ``GetDirection``
        Image whose geometry defines the mapping (3-D; direction has 9 entries).

    Returns
    -------
    numpy.ndarray
        Float array ``[z, y, x]``; values are NOT rounded to integers.
    """
    rotation = np.array(image.GetDirection()).reshape(3, 3)
    offset = np.array(world_coord) - np.array(image.GetOrigin())
    step = np.array(image.GetSpacing())

    # Continuous index in SimpleITK's native (x, y, z) order.
    index_xyz = np.linalg.inv(rotation).dot(offset) / step

    # Flip to numpy array order (z, y, x).
    return index_xyz[::-1].astype(float)

def extract_centered_patch(image, center_vox, out_size=64, pad_value=-1000):
    """Extract a cubic ``out_size``^3 patch centred on a voxel location.

    The centre is rounded to the nearest voxel; regions of the cube that fall
    outside the volume are filled with ``pad_value``. The returned patch always
    has exactly ``out_size`` voxels per axis.

    Parameters
    ----------
    image : SimpleITK.Image
        3-D source volume.
    center_vox : sequence of 3 floats
        Patch centre as (z, y, x) voxel indices (may be fractional).
    out_size : int, default 64
        Edge length of the cube, in voxels.
    pad_value : float, default -1000
        Fill value for out-of-volume voxels. The default is the HU of air,
        which is only appropriate when ``image`` holds raw HU values; pass
        the matching background value for rescaled images.

    Returns
    -------
    SimpleITK.Image
        float32 patch carrying the source spacing and direction, with its
        origin set to the physical position of the patch's first voxel.
    """
    arr = sitk.GetArrayFromImage(image)  # z,y,x
    half = out_size // 2

    # Round-to-nearest via floor(c + 0.5). The previous int(c - half) /
    # int(c + half) pair truncated toward zero, so a fractional centre close
    # to the low border produced a patch one voxel short, and an odd
    # out_size always lost a voxel.
    starts = [int(np.floor(float(c) + 0.5)) - half for c in center_vox]

    # Pre-fill the output and copy only the overlap with the volume; this
    # avoids duplicating the entire scan the way padding the full array does.
    patch = np.full((out_size, out_size, out_size), pad_value, dtype=np.float32)
    src_slices, dst_slices = [], []
    for start, dim in zip(starts, arr.shape):
        lo = max(start, 0)
        hi = max(min(start + out_size, dim), lo)  # hi == lo -> empty overlap
        src_slices.append(slice(lo, hi))
        dst_slices.append(slice(lo - start, hi - start))
    patch[tuple(dst_slices)] = arr[tuple(src_slices)]

    sitk_patch = sitk.GetImageFromArray(patch)
    sitk_patch.SetSpacing(image.GetSpacing())
    sitk_patch.SetDirection(image.GetDirection())

    # Physical position of the patch's first voxel:
    #   origin + direction @ (spacing * index_xyz)
    # Re-using the full volume's origin would place the crop at the wrong
    # location in physical space.
    spacing = np.array(image.GetSpacing(), dtype=float)
    direction = np.array(image.GetDirection(), dtype=float).reshape(3, 3)
    first_index_xyz = np.array(starts[::-1], dtype=float)
    patch_origin = np.array(image.GetOrigin(), dtype=float) + direction.dot(spacing * first_index_xyz)
    sitk_patch.SetOrigin(tuple(float(v) for v in patch_origin))

    return sitk_patch

def make_spherical_mask(center_vox, diameter_mm, spacing, out_size=64):
    """Build a binary mask of a physical sphere centred in an ``out_size``^3 patch.

    Distances are measured in millimetres using the per-axis voxel spacing, so
    the mask is a true sphere in physical space (an ellipsoid in voxel space
    when the spacing is anisotropic). Averaging the spacing instead would make
    the mask too large along the coarse axis and too small along the fine ones.

    Parameters
    ----------
    center_vox : sequence
        Unused; kept for call compatibility. The sphere is always centred at
        voxel ``out_size // 2`` on every axis.
    diameter_mm : float
        Sphere diameter in millimetres.
    spacing : sequence of 3 floats
        Voxel spacing in SimpleITK (x, y, z) order.
    out_size : int, default 64
        Edge length of the cubic mask, in voxels.

    Returns
    -------
    SimpleITK.Image
        uint8 mask (1 inside the sphere, 0 outside) with ``spacing`` applied.
        Origin and direction are left at their defaults.
    """
    radius_mm = diameter_mm / 2
    sx, sy, sz = (float(s) for s in spacing)

    centre = out_size // 2
    zz, yy, xx = np.ogrid[:out_size, :out_size, :out_size]

    # Squared physical distance (mm^2) of every voxel from the patch centre.
    dist_sq_mm = (
        ((zz - centre) * sz) ** 2
        + ((yy - centre) * sy) ** 2
        + ((xx - centre) * sx) ** 2
    )
    mask = (dist_sq_mm <= radius_mm ** 2).astype(np.uint8)

    mask_img = sitk.GetImageFromArray(mask)
    mask_img.SetSpacing(spacing)
    return mask_img

In [None]:
annotations = pd.read_csv(ANNOT_PATH)
records = []

# One-slot cache of the most recently requested series. Consecutive
# annotation rows that share a series then reuse the loaded volume instead
# of re-scanning the directory tree and re-reading every slice per nodule.
# Only a single volume is kept, so memory use stays bounded.
cached_uid, cached_img = None, None

for idx, row in tqdm(annotations.iterrows(), total=len(annotations)):
    try:
        uid = row["seriesuid"]
        diameter = float(row["diameter_mm"])
        world_coord = (row["coordX"], row["coordY"], row["coordZ"])

        if uid != cached_uid:
            # Record the uid before loading: if the read fails, later rows of
            # the same series are reported as missing rather than re-trying
            # the same expensive, failing search.
            cached_uid, cached_img = uid, None
            cached_img = load_dicom_series_by_uid(LIDC_PATH, uid)
        img = cached_img
        if img is None:
            print("Missing:", uid)
            continue

        # Convert to voxel (geometry only; intensities are not needed here)
        voxel_center = world_to_voxel(world_coord, img)

        # Extract the patch from the RAW HU volume, so that the extractor's
        # -1000 border padding really means "air"; normalising the whole scan
        # first would leave -1000 values inside a [0, 1] image.
        patch_hu = extract_centered_patch(img, voxel_center, out_size=PATCH_SIZE)

        # HU normalization: clip to [-1000, 400] and rescale to [0, 1].
        # Doing this on the patch also avoids copying the full scan per nodule.
        patch_arr = sitk.GetArrayFromImage(patch_hu).astype(np.float32)
        patch_arr = (np.clip(patch_arr, -1000, 400) + 1000) / 1400
        patch = sitk.GetImageFromArray(patch_arr)
        patch.CopyInformation(patch_hu)

        # Create the mask and give it the same physical frame as the patch
        # (spacing is already set inside make_spherical_mask).
        mask = make_spherical_mask(
            voxel_center, diameter_mm=diameter,
            spacing=img.GetSpacing(),
            out_size=PATCH_SIZE
        )
        mask.SetOrigin(patch.GetOrigin())
        mask.SetDirection(patch.GetDirection())

        # save
        img_name = f"{uid.replace('.','_')}_{idx:04d}.nii.gz"
        mask_name = f"{uid.replace('.','_')}_{idx:04d}_mask.nii.gz"

        img_path = OUTPUT_PATH / "images" / img_name
        mask_path = OUTPUT_PATH / "masks" / mask_name

        sitk.WriteImage(patch, str(img_path))
        sitk.WriteImage(mask, str(mask_path))

        records.append({
            "uid": uid,
            "image": str(img_path),
            "mask": str(mask_path),
            "diameter_mm": diameter
        })
    except Exception as e:
        # Best-effort batch job: report which row failed and carry on.
        print(f"Row {idx} ({row.get('seriesuid', 'unknown uid')}) failed: {e}")
        continue

pd.DataFrame(records).to_csv(OUTPUT_PATH / "metadata.csv", index=False)
print("DONE. Saved:", len(records))

 64%|██████▍   | 762/1186 [3:35:18<1:51:18, 15.75s/it] 

Exception thrown in SimpleITK ImageSeriesReader_Execute: D:\a\SimpleITK\SimpleITK\Code\IO\src\sitkImageSeriesReader.cxx:206:
sitk::ERROR: File names information is empty. Cannot read series.


100%|██████████| 1186/1186 [5:30:35<00:00, 16.72s/it]  

DONE. Saved: 1185



