In [1]:
import os
import shutil
import numpy as np

# Source directory (one sub-directory per scanner dataset) and the destination
# directory that receives the merged, renumbered copy.
src_dataset = "dataset"
dst_dataset = "dataset_total"

# Sub-datasets are processed in this fixed order so the running case numbers
# are reproducible from run to run.
order_of_datasets = ["1.5T-GE", "1.5T-Phillips", "3T-Achieva", "3T-Ingenia"]

# Number of folds the cases are distributed over.
N_FOLDS = 5

# Create the imagesTr / labelsTr output folders (no-op when they already exist).
os.makedirs(os.path.join(dst_dataset, "imagesTr"), exist_ok=True)
os.makedirs(os.path.join(dst_dataset, "labelsTr"), exist_ok=True)


def _list_data_files(directory):
    """Return the sorted names of the regular, non-hidden files in `directory`.

    os.listdir() also returns hidden entries and sub-directories (for example
    `.DS_Store` or `.ipynb_checkpoints`); those must not be numbered and copied
    as if they were volumes.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".") and os.path.isfile(os.path.join(directory, name))
    )


# Running case number; the first copied case becomes data_001.
counter = 1

# fold index -> list of (image file name, label file name) assigned to that fold.
stratified_samples = {i: [] for i in range(N_FOLDS)}

# Walk the sub-datasets in the configured order.
for dataset in order_of_datasets:
    images_path = os.path.join(src_dataset, dataset, "imagesTr")
    labels_path = os.path.join(src_dataset, dataset, "labelsTr")

    # Images and labels are paired purely by their position in the sorted listings.
    # NOTE(review): this assumes matching image/label names sort identically - confirm.
    images_files = _list_data_files(images_path)
    labels_files = _list_data_files(labels_path)

    # zip() would silently drop the surplus files and leave images and labels
    # mis-paired, so fail loudly instead.
    if len(images_files) != len(labels_files):
        raise ValueError(
            f"{dataset}: found {len(images_files)} image files in {images_path} "
            f"but {len(labels_files)} label files in {labels_path}"
        )

    # Globally unique, contiguous case numbers for this sub-dataset.
    ids = np.arange(len(images_files)) + counter

    # Fold = case number modulo N_FOLDS. Because the numbers are contiguous
    # within a sub-dataset, each sub-dataset is spread round-robin over the folds.
    fold_ids = np.mod(ids, N_FOLDS)

    for i, (image_file, label_file) in enumerate(zip(images_files, labels_files)):
        # Target names: the image carries a "_0000" suffix, the label does not.
        dst_image_file = f"data_{ids[i]:03}_0000.nii.gz"
        dst_label_file = f"data_{ids[i]:03}.nii.gz"

        # Copy into the merged dataset under the new names.
        shutil.copy(os.path.join(images_path, image_file), os.path.join(dst_dataset, "imagesTr", dst_image_file))
        shutil.copy(os.path.join(labels_path, label_file), os.path.join(dst_dataset, "labelsTr", dst_label_file))

        # Record the pair under its fold (plain int key, matching the dict keys).
        stratified_samples[int(fold_ids[i])].append((dst_image_file, dst_label_file))

    # Continue numbering after the last case of this sub-dataset.
    counter += len(images_files)

# Print the fold assignment.
for i, samples in stratified_samples.items():
    print(f"Fold {i}:")
    for image_file, label_file in samples:
        print(f"    {image_file}, {label_file}")


Fold 0:
    data_005_0000.nii.gz, data_005.nii.gz
    data_010_0000.nii.gz, data_010.nii.gz
    data_015_0000.nii.gz, data_015.nii.gz
    data_020_0000.nii.gz, data_020.nii.gz
    data_025_0000.nii.gz, data_025.nii.gz
    data_030_0000.nii.gz, data_030.nii.gz
    data_035_0000.nii.gz, data_035.nii.gz
    data_040_0000.nii.gz, data_040.nii.gz
    data_045_0000.nii.gz, data_045.nii.gz
    data_050_0000.nii.gz, data_050.nii.gz
    data_055_0000.nii.gz, data_055.nii.gz
    data_060_0000.nii.gz, data_060.nii.gz
    data_065_0000.nii.gz, data_065.nii.gz
    data_070_0000.nii.gz, data_070.nii.gz
    data_075_0000.nii.gz, data_075.nii.gz
    data_080_0000.nii.gz, data_080.nii.gz
    data_085_0000.nii.gz, data_085.nii.gz
    data_090_0000.nii.gz, data_090.nii.gz
    data_095_0000.nii.gz, data_095.nii.gz
    data_100_0000.nii.gz, data_100.nii.gz
    data_105_0000.nii.gz, data_105.nii.gz
    data_110_0000.nii.gz, data_110.nii.gz
    data_115_0000.nii.gz, data_115.nii.gz
    data_120_0000.nii.gz, 

In [4]:
# Report each fold: its index, the number of cases it holds, then every
# image/label file-name pair assigned to it.
for fold_index in stratified_samples:
    fold_cases = stratified_samples[fold_index]
    print(f"Fold {fold_index}:")
    # Number of cases in this fold.
    print(f"    {len(fold_cases)}")
    for case in fold_cases:
        image_name, label_name = case
        print(f"    {image_name}, {label_name}")

Fold 0:
    59
    data_005_0000.nii.gz, data_005.nii.gz
    data_010_0000.nii.gz, data_010.nii.gz
    data_015_0000.nii.gz, data_015.nii.gz
    data_020_0000.nii.gz, data_020.nii.gz
    data_025_0000.nii.gz, data_025.nii.gz
    data_030_0000.nii.gz, data_030.nii.gz
    data_035_0000.nii.gz, data_035.nii.gz
    data_040_0000.nii.gz, data_040.nii.gz
    data_045_0000.nii.gz, data_045.nii.gz
    data_050_0000.nii.gz, data_050.nii.gz
    data_055_0000.nii.gz, data_055.nii.gz
    data_060_0000.nii.gz, data_060.nii.gz
    data_065_0000.nii.gz, data_065.nii.gz
    data_070_0000.nii.gz, data_070.nii.gz
    data_075_0000.nii.gz, data_075.nii.gz
    data_080_0000.nii.gz, data_080.nii.gz
    data_085_0000.nii.gz, data_085.nii.gz
    data_090_0000.nii.gz, data_090.nii.gz
    data_095_0000.nii.gz, data_095.nii.gz
    data_100_0000.nii.gz, data_100.nii.gz
    data_105_0000.nii.gz, data_105.nii.gz
    data_110_0000.nii.gz, data_110.nii.gz
    data_115_0000.nii.gz, data_115.nii.gz
    data_120_0000.n

In [7]:
import json

# Build one train/val split per fold: the fold itself is the validation set and
# all remaining folds together form the training set.
splits = []

# Take the folds from stratified_samples itself instead of hard-coding their
# number, so this cell stays in sync with however many folds were created.
fold_keys = sorted(stratified_samples)

for i in fold_keys:
    # Cases held out for validation in this split.
    samples = stratified_samples[i]

    # Every case from the other folds, in ascending fold order.
    train_samples = [sample for j in fold_keys if j != i for sample in stratified_samples[j]]

    # A case identifier is the label file name without its ".nii.gz" extension.
    train_ids = [label_file.replace(".nii.gz", "") for _, label_file in train_samples]
    val_ids = [label_file.replace(".nii.gz", "") for _, label_file in samples]

    splits.append({
        'train': train_ids,
        'val': val_ids,
    })

# Write the splits to splits_final.json in the current working directory.
with open('splits_final.json', 'w') as f:
    json.dump(splits, f, indent=4)
