# nnUnetV2 data processing 

In [1]:
import json
import os

In [2]:
# create nnUNet dataset root directories
root_dir = "nnunet_data"
os.makedirs(os.path.join(root_dir, "nnUNet_raw"), exist_ok=True)
os.makedirs(os.path.join(root_dir, "nnUNet_preprocessed"), exist_ok=True)
os.makedirs(os.path.join(root_dir, "nnUNet_results"), exist_ok=True)

In [3]:
# create dataset specific dataset and move data into train and test folders
# train folder contains both
dataset_dir = "Dataset001_PancreasSegClassification"
dataset_root = os.path.join(root_dir, "nnUNet_raw", dataset_dir)
os.makedirs(dataset_root, exist_ok=True)
os.makedirs(os.path.join(dataset_root, "imagesTr"), exist_ok=True)
os.makedirs(os.path.join(dataset_root, "imagesTs"), exist_ok=True)
os.makedirs(os.path.join(dataset_root, "labelsTr"), exist_ok=True)
os.makedirs(os.path.join(dataset_root, "labelsTs"), exist_ok=True)

In [10]:
# record the subtypes before converting file naming
import pandas as pd
labels = {"Photo ID":[], "subtype": [], "split": []}

train_dir = "raw_data/train"
validation_dir = "raw_data/validation"

# subtype 0
for fname in os.listdir(os.path.join(train_dir, 'subtype0')):
    if fname.endswith("_0000.nii.gz"):
        head, subtype, fid, ending = fname.split("_")
        labels['Photo ID'].append('_'.join([head, fid, ending]))
        labels['subtype'].append(0)
        labels['split'].append("train")
for fname in os.listdir(os.path.join(validation_dir, 'subtype0')):
    if fname.endswith("_0000.nii.gz"):
        head, subtype, fid, ending = fname.split("_")
        labels['Photo ID'].append('_'.join([head, fid, ending]))
        labels['subtype'].append(0)
        labels['split'].append("validation")

# subtype 1
for fname in os.listdir(os.path.join(train_dir, 'subtype1')):
    if fname.endswith("_0000.nii.gz"):
        head, subtype, fid, ending = fname.split("_")
        labels['Photo ID'].append('_'.join([head, fid, ending]))
        labels['subtype'].append(1)
        labels['split'].append("train")
for fname in os.listdir(os.path.join(validation_dir, 'subtype1')):
    if fname.endswith("_0000.nii.gz"):
        head, subtype, fid, ending = fname.split("_")
        labels['Photo ID'].append('_'.join([head, fid, ending]))
        labels['subtype'].append(1)
        labels['split'].append("validation")


# subtype 2
for fname in os.listdir(os.path.join(train_dir, 'subtype2')):
    if fname.endswith("_0000.nii.gz"):
        head, subtype, fid, ending = fname.split("_")
        labels['Photo ID'].append('_'.join([head, fid, ending]))
        labels['subtype'].append(2)
        labels['split'].append("train")
for fname in os.listdir(os.path.join(validation_dir, 'subtype2')):
    if fname.endswith("_0000.nii.gz"):
        head, subtype, fid, ending = fname.split("_")
        labels['Photo ID'].append('_'.join([head, fid, ending]))
        labels['subtype'].append(2)
        labels['split'].append("validation")


assert len(labels["Photo ID"]) == 288, "Length of should be the same before and after conversion."

pd.DataFrame(labels).to_csv(os.path.join(dataset_root, "labels.csv"), index=False)
# full_df = pd.DataFrame(labels)
# full_df.where(full_df['split'] == "train").dropna().to_csv(os.path.join(dataset_root, "labels_train.csv"), index=False)
# full_df.where(full_df['split'] == "validation").dropna().to_csv(os.path.join(dataset_root, "labels_validation.csv"), index=False)

In [18]:
# create dataset.json for the model
from nnunetv2.dataset_conversion.generate_dataset_json import generate_dataset_json

generate_dataset_json(
    output_folder=str(dataset_root),
    channel_names={0: "CT"},
    labels={
        'background': 0,
        'pancreas': 1,
        'lesion': 2,
    },
    num_training_cases=252,
    file_ending='.nii.gz',
)

In [19]:
# convertion of the filename and move the files over
import shutil
tr_source_dirs = [os.path.join("raw_data/train", f"subtype{i}") for i in (0,1,2)]
val_source_dirs = [os.path.join("raw_data/validation", f"subtype{i}") for i in (0,1,2)]
tr_images_dir = os.path.join(dataset_root, "imagesTr")
tr_labels_dir = os.path.join(dataset_root, "labelsTr")
val_images_dir = os.path.join(dataset_root, "imagesTs")
val_labels_dir = os.path.join(dataset_root, "labelsTs")

for sd in tr_source_dirs:
    for source_file in os.listdir(sd):
        source_path = os.path.join(sd, source_file)
        if source_file.endswith("_0000.nii.gz"): # image files
            head, subtype, fid, ending = source_file.split("_")
            new_filename = f"{head}_{fid}_{ending}"
            dest_path = os.path.join(tr_images_dir, new_filename)
            shutil.copy2(source_path, dest_path)
        elif source_file.endswith(".nii.gz"): # label files
            head, subtype, ending = source_file.split("_")
            new_filename = f"{head}_{ending}"
            dest_path = os.path.join(tr_labels_dir, new_filename)
            shutil.copy2(source_path, dest_path)

for sd in val_source_dirs:
    for source_file in os.listdir(sd):
        source_path = os.path.join(sd, source_file)
        if source_file.endswith("_0000.nii.gz"): # image files
            head, subtype, fid, ending = source_file.split("_")
            new_filename = f"{head}_{fid}_{ending}"
            dest_path = os.path.join(val_images_dir, new_filename)
            shutil.copy2(source_path, dest_path)
        elif source_file.endswith(".nii.gz"): # label files
            head, subtype, ending = source_file.split("_")
            new_filename = f"{head}_{ending}"
            dest_path = os.path.join(val_labels_dir, new_filename)
            shutil.copy2(source_path, dest_path)


assert len(os.listdir(tr_images_dir)) == 252, "Number of images in imagesTr should be 252."
assert len(os.listdir(tr_labels_dir)) == 252, "Number of labels in labelsTr should be 252."

In [20]:
# make sure the segmentation labels are in correct type
import nibabel as nib

labels_dirs = [tr_labels_dir, val_labels_dir]

for labels_dir in labels_dirs:
    for label_file in os.listdir(labels_dir):
        label_path = os.path.join(labels_dir, label_file)
        img = nib.load(label_path)
        data = img.get_fdata()

        # Check if the data type is not int16
        if data.dtype != np.int16:
            # print(f"Converting {label_file} to int16.")
            data = data.astype(np.int16)
            new_img = nib.Nifti1Image(data, img.affine, img.header)
            new_img.set_data_dtype(np.int16)
            nib.save(new_img, label_path)

# check on label
img = nib.load('/mnt/data/gpu-server/m31_nnUnet/nnunet_data/nnUNet_raw/Dataset001_PancreasSegClassification/labelsTr/quiz_544.nii.gz')
print('Unique values:', np.unique(img.get_fdata()))
print('Data type:', img.get_fdata().dtype)

Unique values: [0. 1. 2.]
Data type: float64


In [22]:
# create the preprocessed dataset
import subprocess

env_vars = os.environ.copy()
# print(root_dir)
env_vars['nnUNet_raw'] = "/mnt/data/gpu-server/m31_nnUnet/nnunet_data/nnUNet_raw"
env_vars['nnUNet_preprocessed'] = "/mnt/data/gpu-server/m31_nnUnet/nnunet_data/nnUNet_preprocessed"
env_vars['nnUNet_results'] = "/mnt/data/gpu-server/m31_nnUnet/nnunet_data/nnUNet_results"



result = subprocess.run([
    "uv", "run", "--extra", "cu124",
    "nnUNetv2_plan_and_preprocess",
    # "-pl", "nnUNetPlannerResEncM", # use without to use default planner
    "-d", "1",
    "-c", "3d_fullres",
    "-npfp", "8",
    "--verify_dataset_integrity"
], env=env_vars, capture_output=True, text=True, check=True)

print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)

STDOUT: Fingerprint extraction...
Dataset001_PancreasSegClassification
Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> as reader/writer

####################
verify_dataset_integrity Done. 
If you didn't see any error messages then your dataset is most likely OK!
####################

Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> as reader/writer
Experiment planning...

############################
INFO: You are using the old nnU-Net default planner. We have updated our recommendations. Please consider using those instead! Read more here: https://github.com/MIC-DKFZ/nnUNet/blob/master/documentation/resenc_presets.md
############################

Dropping 3d_lowres config because the image size difference to 3d_fullres is too small. 3d_fullres: [ 59. 118. 181.], 3d_lowres: [59, 118, 181]
2D U-Net configuration:
{'data_identifier': 'nnUNetPlans_2d', 'preprocessor_name': 'DefaultPreprocessor', 'batch_size': 132, 'patch_size': (np.int64(128), np.

## Special data processing for ResNet Encoder

In [8]:
# create dataset specific dataset and move data into train and test folders
# train folder contains both
# create the preprocessed dataset
import subprocess

env_vars = os.environ.copy()
# print(root_dir)
env_vars['nnUNet_raw'] = "/mnt/data/gpu-server/nnUNet_modified/nnunet_data/nnUNet_raw"
env_vars['nnUNet_preprocessed'] = "/mnt/data/gpu-server/nnUNet_modified/nnunet_data/nnUNet_preprocessed"
env_vars['nnUNet_results'] = "/mnt/data/gpu-server/nnUNet_modified/nnunet_data/nnUNet_results"



result = subprocess.run([
    "uv", "run", "--extra", "cu124",
    "nnUNetv2_plan_and_preprocess",
    "-pl", "nnUNetPlannerResEncM", # use without to use default planner
    "-d", "1",
    "-c", "3d_fullres",
    "-npfp", "8",
    "--verify_dataset_integrity"
], env=env_vars, capture_output=True, text=True, check=True)

print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)

STDOUT: Fingerprint extraction...
Dataset001_PancreasSegClassification
Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> as reader/writer

####################
verify_dataset_integrity Done. 
If you didn't see any error messages then your dataset is most likely OK!
####################

Experiment planning...
Dropping 3d_lowres config because the image size difference to 3d_fullres is too small. 3d_fullres: [ 59. 118. 181.], 3d_lowres: [59, 118, 181]
2D U-Net configuration:
{'data_identifier': 'nnUNetPlans_2d', 'preprocessor_name': 'DefaultPreprocessor', 'batch_size': 134, 'patch_size': (np.int64(128), np.int64(192)), 'median_image_size_in_voxels': array([118., 181.]), 'spacing': array([0.73046875, 0.73046875]), 'normalization_schemes': ['CTNormalization'], 'use_mask_for_norm': [False], 'resampling_fn_data': 'resample_data_or_seg_to_shape', 'resampling_fn_seg': 'resample_data_or_seg_to_shape', 'resampling_fn_data_kwargs': {'is_seg': False, 'order': 3, 'order_z': 0, '

In [11]:
# read one datas sample to see their shape
import nibabel as nib
img = nib.load('/mnt/data/gpu-server/nnUNet_modified/nnunet_data/nnUNet_raw/Dataset001_PancreasSegClassification/imagesTs/quiz_084_0000.nii.gz')
img.slicer[0:1].shape

(1, 113, 110)

In [14]:
img = nib.load('/mnt/data/gpu-server/nnUNet_modified/nnunet_data/nnUNet_raw/Dataset001_PancreasSegClassification/imagesTr/quiz_016_0000.nii.gz')
img.shape

(171, 96, 234)