# Construct and simplify our own dataset

In [1]:
import random
import shutil
import SimpleITK as sitk
from pathlib import Path
from typing import List

from lib.folder import FolderMg

In [2]:
def findDuplicateFileName(file_list: List[Path]) -> List[str]:
    """Report files whose base name repeats and return the unique base names.

    The base name is the part of the file name before the first ".", so
    ``scan.nii.gz`` and ``scan.mha`` both map to ``scan``.

    Args:
        file_list: Paths to check.

    Returns:
        The unique base names, in order of first appearance.

    Side effects:
        Prints every path whose base name was already seen, then a summary
        line comparing the number of files with the number of unique names.
    """
    file_names = []
    seen = set()  # O(1) membership test; the list preserves first-seen order
    for f in file_list:
        f_name = f.name.split(".")[0]
        if f_name in seen:
            print(f)
        else:
            seen.add(f_name)
            file_names.append(f_name)
    print(f"Files should have {len(file_list)}, names have {len(file_names)}")
    return file_names

In [3]:
# Root folder that contains the DataPack* directories.
# A raw string keeps the Windows backslashes literal: in a normal string "\M",
# "\C" and "\R" are invalid escape sequences that newer Python versions warn
# about. The resulting path value is unchanged.
sourceDataPath = Path(
    r"D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData"
)
sourceDataMg = FolderMg(sourceDataPath)
sourceDataMg.ls()


Current Folder 'ReviewData' contains 14 folders, which are:
  - DataPack1
  - DataPack2
  - DataPack3
  - DataPack4
  - DataPack5
  - ...

Current Folder 'ReviewData' contains NO files



## Get valid Data
1. Based on the segmentation results, discard segmentation files with `&` in the name.
2. For each remaining (valid) segmentation file, find the matching original file.

In [4]:
# Collect the usable segmentation files of every data pack together with the
# original image each one belongs to.
validSegmentationFiles = []
validOriginalFiles = []
for d in sourceDataMg.dirs:
    print(d.name)
    tempOgMg = FolderMg(d)
    tempSegMg = FolderMg(d.joinpath("Segmentation"))

    # Segmentation files with "&" in their name are excluded.
    useableFile = [f for f in tempSegMg.files if "&" not in f.name]
    for segf in useableFile:
        segf_name = segf.name.split(".")[0]
        # Derive the original's key the same way as the segmentation's key
        # (text before the first "."); mixing this with `.stem` would make
        # multi-suffix file names impossible to match.
        matchOgFiles = [
            ogf for ogf in tempOgMg.files if ogf.name.split(".")[0] == segf_name
        ]
        if len(matchOgFiles) != 1:
            # Missing or ambiguous original: report it and stop this pack.
            # The length assertion below then fails.
            print(segf, matchOgFiles)
            break
        validOriginalFiles.extend(matchOgFiles)
    validSegmentationFiles.extend(useableFile)
assert len(validOriginalFiles) == len(validSegmentationFiles)

DataPack1
DataPack2
DataPack3
DataPack4
DataPack5
DataPack6
DataPack7
DataPack8
DataPack9
DataPack10
DataPack11
DataPack12
DataPack13
DataPack14


## Filter out original files that share the same file size

In [5]:
# Keep the first original seen for each byte size. Any later file with the same
# size is copied, together with that first file, into "dup/<first file stem>"
# next to the source data folder.
sizeDict = {}
for f in validOriginalFiles:
    key = f.stat().st_size
    if key not in sizeDict:
        sizeDict[key] = f
        continue
    firstSeen = sizeDict[key]
    duplicatesFiles = sourceDataPath.parent.joinpath("dup", firstSeen.stem)
    if not duplicatesFiles.exists():
        duplicatesFiles.mkdir(parents=True)
    shutil.copy2(f, duplicatesFiles)
    shutil.copy2(firstSeen, duplicatesFiles)

In [6]:
# Number of originals kept after collapsing files that share a byte size.
len(sizeDict)

99

## Shuffle the original data

In [7]:
# Shuffle with a fixed seed so that "Restart Kernel -> Run All" reproduces the
# same order (given the same input order). A dedicated Random instance leaves
# the global random state untouched.
SHUFFLE_SEED = 42
ogFiles = list(sizeDict.values())
random.Random(SHUFFLE_SEED).shuffle(ogFiles)
print(list(sizeDict.values())[0])  # first file before shuffling
print(ogFiles[0])  # first file after shuffling
print(len(ogFiles))
# Sanity check: every kept original should have a unique base name.
ogFilesNames = findDuplicateFileName(ogFiles)
ogFilesNames[0]

D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData\DataPack1\FollowMR_t2_tse_fdixon_tra_p2_256_in_RD_UT20211224151916.mha
D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData\DataPack8\MainMR_t2_tse_fdixon_tra_p2_256_W_RD_UT20211224151916.mha
99
Files should have 99, names have 99


'MainMR_t2_tse_fdixon_tra_p2_256_W_RD_UT20211224151916'

## Find the corresponding segmentation result for each remaining original file

In [8]:
# Map each original's base name to [original file, its one segmentation file].
ogSegPair = {}
for ogf in ogFiles:
    ogf_name = ogf.name.split(".")[0]
    matchFiles = [
        segf for segf in validSegmentationFiles if segf.name.split(".")[0] == ogf_name
    ]
    if len(matchFiles) == 1:
        ogSegPair[ogf_name] = [ogf, matchFiles[0]]
    else:
        # Zero or several candidates: report them and stop pairing.
        print(ogf, matchFiles)
        break
len(ogSegPair)

99

## Construct Dataset and save to folder
Convert the original images (.mha and .nrrd) to .nii.gz; the segmentation files are copied as they are.

In [9]:
# Output layout: "Dataset078_UltrastProstate" next to the source data folder
# holds the converted images; its "labels" subfolder holds the segmentations.
datasetPath = sourceDataPath.parent.joinpath("Dataset078_UltrastProstate")
segmentationPath = datasetPath.joinpath("labels")
# exist_ok=True makes the cell safe to re-run without separate exists() checks.
datasetPath.mkdir(parents=True, exist_ok=True)
segmentationPath.mkdir(parents=True, exist_ok=True)

for ogf, segf in ogSegPair.values():
    ogf_name = ogf.name.split(".")[0]
    segf_name = segf.name.split(".")[0]
    if ogf_name != segf_name:
        # The pair does not belong together: report it and abort.
        print(ogf_name, segf_name)
        break

    # The segmentation is copied unchanged and keeps its original extension.
    shutil.copy2(segf, segmentationPath)
    # The image is re-written through SimpleITK as .nii.gz.
    outFileName = datasetPath.joinpath(f"{ogf.stem}.nii.gz")
    img = sitk.ReadImage(ogf)
    sitk.WriteImage(img, fileName=outFileName)
else:
    # Runs only when the loop was not aborted by the mismatch check above, so
    # the success message is no longer printed after a failure.
    print("Dataset is done")

Dataset is done


## Convert Dataset to nnUNet structure

In [18]:
# Split the converted dataset into train/test image and label folders.
trainRatio = 0.8
nnUNetDatasetPath = datasetPath.parent.joinpath(datasetPath.name + f"{trainRatio}")
imageTrFolder = nnUNetDatasetPath.joinpath("imagesTr")
imageTsFolder = nnUNetDatasetPath.joinpath("imagesTs")
labelTrFolder = nnUNetDatasetPath.joinpath("labelsTr")
labelTsFolder = nnUNetDatasetPath.joinpath("labelsTs")
# Create every folder independently: if the root already exists but a
# subfolder is missing (e.g. after an interrupted run), it is still created.
for folder in (imageTrFolder, imageTsFolder, labelTrFolder, labelTsFolder):
    folder.mkdir(parents=True, exist_ok=True)

datasetFolderMg = FolderMg(datasetPath)
segmentationMg = FolderMg(segmentationPath)

nFile = datasetFolderMg.nFile
# Use the configured ratio rather than a second hard-coded 0.8.
nTrain = int(nFile * trainRatio)
# NOTE(review): pairing relies on both folder listings being in the same
# order; the assertion below stops the cell if they are not.
for i, (ogf, segf) in enumerate(zip(datasetFolderMg.files, segmentationMg.files)):
    ogf_name = ogf.name.split(".")[0]
    segf_name = segf.name.split(".")[0]
    assert ogf_name == segf_name
    # Images get the "_0000" suffix; labels share the case id without it.
    new_segf_stem = f"Ultrast_{str(i).zfill(3)}"
    new_ogf_stem = f"{new_segf_stem}_0000"
    # "".join(suffixes) keeps multi-part extensions such as ".nii.gz" intact.
    ogf_extension = "".join(ogf.suffixes)
    segf_extension = "".join(segf.suffixes)
    # The first nTrain cases go to the training folders, the rest to test.
    if i < nTrain:
        imageFolder, labelFolder = imageTrFolder, labelTrFolder
    else:
        imageFolder, labelFolder = imageTsFolder, labelTsFolder
    new_ogf = imageFolder.joinpath(new_ogf_stem + ogf_extension)
    new_segf = labelFolder.joinpath(new_segf_stem + segf_extension)
    shutil.copy2(ogf, new_ogf)
    shutil.copy2(segf, new_segf)

print(nFile, nTrain)

99 79


## Generate dataset.json

In [19]:
from nnunetv2.dataset_conversion.generate_dataset_json import generate_dataset_json

# Metadata written to dataset.json inside the nnUNet dataset folder.
channel_names = {0: "T2"}
labels = {"background": 0, "prostate": 1}
# Take the count from the train/test split (nTrain) instead of a hard-coded
# 79, so it stays correct when the data or the ratio changes.
num_training_cases = nTrain
# NOTE(review): label files are copied with their original extension earlier
# in the notebook - confirm they really end in ".nii.gz" like the images.
file_ending = ".nii.gz"
overwrite_image_reader_writer = "SimpleITKIO"
generate_dataset_json(
    str(nnUNetDatasetPath),
    channel_names=channel_names,
    labels=labels,
    num_training_cases=num_training_cases,
    file_ending=file_ending,
    overwrite_image_reader_writer=overwrite_image_reader_writer,
)