In [1]:
%load_ext autoreload
%autoreload 2

In [17]:
import random
import tqdm
import shutil
import SimpleITK as sitk
from pathlib import Path
from typing import List

from lib.folder.basic import FolderMg

In [18]:
def findDuplicateFileName(file_list: List[Path]) -> List[str]:
    """Report paths whose base name repeats within ``file_list``.

    The base name is the part of ``Path.name`` before the first ``"."``, so
    ``scan.nii.gz`` and ``scan.mha`` both map to ``scan``. Every path whose
    base name was already seen is printed, followed by a summary line that
    compares the number of paths with the number of distinct base names.

    Args:
        file_list: Paths to inspect.

    Returns:
        The distinct base names, in first-seen order.
    """
    file_names = []
    seen = set()  # constant-time membership; the list keeps first-seen order
    for f in file_list:
        f_name = f.name.split(".")[0]
        if f_name in seen:
            print(f)
        else:
            seen.add(f_name)
            file_names.append(f_name)
    print(f"Files should have {len(file_list)}, names have {len(file_names)}")
    return file_names

# Construct Prostate Dataset

In [3]:
# Root folder that holds the review data packages.
# Raw string literal: the Windows path contains backslashes, and sequences
# such as "\M", "\C" and "\R" are invalid escapes in a normal string literal.
sourceDataPath = Path(
    r"D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData"
)
sourceDataMg = FolderMg(sourceDataPath)
sourceDataMg.ls()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\Medical Image - Research\\Clinical Data Prostate Segmentation Image Packages\\ReviewData'

## Get valid Data
1. Based on the segmentation results, discard every segmentation file with `&` in its name.
2. For each remaining (valid) segmentation file, find the matching original file.

In [None]:
# Collect, per data pack folder, the usable segmentation files and the
# original file that belongs to each of them.
validSegmentationFiles = []
validOriginalFiles = []
for d in sourceDataMg.dirs:
    print(d.name)
    tempOgMg = FolderMg(d)
    tempSegMg = FolderMg(d.joinpath("Segmentation"))

    # Segmentation files with "&" in their name are excluded.
    useableFile = [f for f in tempSegMg.files if "&" not in f.name]
    for segf in useableFile:
        # Segmentation name: text before the first "."; original name: stem
        # (name without its last suffix).
        segf_name = segf.name.split(".")[0]
        matchOgFiles = [ogf for ogf in tempOgMg.files if ogf.stem == segf_name]
        if len(matchOgFiles) != 1:
            # Report the ambiguous/missing match and stop scanning this folder;
            # the length check after the loop then fails.
            print(segf, matchOgFiles)
            break
        validOriginalFiles.extend(matchOgFiles)
    validSegmentationFiles.extend(useableFile)
assert len(validOriginalFiles) == len(validSegmentationFiles), (
    f"Matched {len(validOriginalFiles)} original files for "
    f"{len(validSegmentationFiles)} segmentation files"
)

DataPack1
DataPack2
DataPack3
DataPack4
DataPack5
DataPack6
DataPack7
DataPack8
DataPack9
DataPack10
DataPack11
DataPack12
DataPack13
DataPack14


## Filter out original data with identical file size

In [None]:
# Original files are treated as duplicates when their on-disk byte size equals
# that of a file seen earlier. The first file per size is kept in sizeDict;
# each later file and the kept one are copied to "dup/<kept file stem>" next
# to the source folder.
sizeDict = {}
for f in validOriginalFiles:
    key = f.stat().st_size
    if key in sizeDict:
        duplicatesFiles = sourceDataPath.parent.joinpath("dup", sizeDict[key].stem)
        duplicatesFiles.mkdir(parents=True, exist_ok=True)
        shutil.copy2(f, duplicatesFiles)
        shutil.copy2(sizeDict[key], duplicatesFiles)
    else:
        sizeDict[key] = f

In [None]:
# Number of distinct file sizes, i.e. original files kept by the size filter.
len(sizeDict)

99

## Shuffle the original data

In [None]:
# Shuffle the kept originals with a fixed seed so that the resulting order is
# identical on every "Restart & Run All".
ogFiles = list(sizeDict.values())
random.Random(42).shuffle(ogFiles)
# First file before and after shuffling, then the total count.
print(list(sizeDict.values())[0])
print(ogFiles[0])
print(len(ogFiles))
# Check that no two kept originals share a base name.
ogFilesNames = findDuplicateFileName(ogFiles)
ogFilesNames[0]

D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData\DataPack1\FollowMR_t2_tse_fdixon_tra_p2_256_in_RD_UT20211224151916.mha
D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData\DataPack8\MainMR_t2_tse_fdixon_tra_p2_256_W_RD_UT20211224151916.mha
99
Files should have 99, names have 99


'MainMR_t2_tse_fdixon_tra_p2_256_W_RD_UT20211224151916'

## Find the corresponding segmentation result for each selected original file

In [None]:
# Map each original's base name (text before the first ".") to the pair
# [original file, segmentation file]. Stops at the first original that does
# not have exactly one segmentation with the same base name.
ogSegPair = {}
for original in ogFiles:
    base_name = original.name.split(".")[0]
    candidates = [
        seg for seg in validSegmentationFiles if seg.name.split(".")[0] == base_name
    ]
    if len(candidates) != 1:
        print(original, candidates)
        break
    ogSegPair[base_name] = [original, candidates[0]]
len(ogSegPair)

99

## Construct Dataset and save to folder
Convert `.mha` and `.nrrd` files to `.nii.gz`.

In [None]:
# Output layout: <parent>/Dataset078_UltrastProstate/           (images)
#                <parent>/Dataset078_UltrastProstate/labels/    (segmentations)
datasetPath = sourceDataPath.parent.joinpath("Dataset078_UltrastProstate")
segmentationPath = datasetPath.joinpath("labels")
# parents=True also creates datasetPath itself when it is missing.
segmentationPath.mkdir(parents=True, exist_ok=True)

for ogf, segf in ogSegPair.values():
    ogf_name = ogf.name.split(".")[0]
    segf_name = segf.name.split(".")[0]
    if ogf_name != segf_name:
        # Pair is inconsistent: report it and abort the export.
        print(ogf_name, segf_name)
        break

    # NOTE(review): the segmentation is copied in its original file format,
    # while the image is re-written as .nii.gz — confirm the label format is
    # what the downstream steps expect.
    shutil.copy2(segf, segmentationPath)
    outFileName = datasetPath.joinpath(f"{ogf.stem}.nii.gz")
    img = sitk.ReadImage(ogf)
    sitk.WriteImage(img, fileName=outFileName)
else:
    # Only reached when the loop was not aborted by the break above.
    print("Dataset is done")

Dataset is done


## Convert Dataset to nnUNet structure

In [None]:
# Fraction of the cases that goes to the training folders; the rest goes to
# the test folders.
trainRatio = 0.8
nnUNetDatasetPath = datasetPath.parent.joinpath(datasetPath.name + f"{trainRatio}")
imageTrFolder = nnUNetDatasetPath.joinpath("imagesTr")
imageTsFolder = nnUNetDatasetPath.joinpath("imagesTs")
labelTrFolder = nnUNetDatasetPath.joinpath("labelsTr")
labelTsFolder = nnUNetDatasetPath.joinpath("labelsTs")
# Create each target folder independently so that a partially created dataset
# root does not leave some of them missing.
for folder in (imageTrFolder, imageTsFolder, labelTrFolder, labelTsFolder):
    folder.mkdir(parents=True, exist_ok=True)

datasetFolderMg = FolderMg(datasetPath)
segmentationMg = FolderMg(segmentationPath)

nFile = datasetFolderMg.nFile
nTrain = int(nFile * trainRatio)
# NOTE(review): cases are split in the order in which FolderMg lists the
# files, not in the shuffled order produced earlier — confirm that this is the
# intended train/test assignment.
for i, (ogf, segf) in enumerate(zip(datasetFolderMg.files, segmentationMg.files)):
    ogf_name = ogf.name.split(".")[0]
    segf_name = segf.name.split(".")[0]
    # Both listings must be aligned: image i and label i share a base name.
    assert ogf_name == segf_name, f"Image/label mismatch: {ogf_name} vs {segf_name}"

    # Image gets the "_0000" channel suffix, label does not.
    new_segf_stem = f"Ultrast_{str(i).zfill(3)}"
    new_ogf_stem = f"{new_segf_stem}_0000"
    ogf_extension = "".join(ogf.suffixes)
    segf_extension = "".join(segf.suffixes)

    # First nTrain cases are training data, the remainder test data.
    if i < nTrain:
        imageFolder, labelFolder = imageTrFolder, labelTrFolder
    else:
        imageFolder, labelFolder = imageTsFolder, labelTsFolder
    new_ogf = imageFolder.joinpath(new_ogf_stem + ogf_extension)
    new_segf = labelFolder.joinpath(new_segf_stem + segf_extension)
    shutil.copy2(ogf, new_ogf)
    shutil.copy2(segf, new_segf)

print(nFile, nTrain)

99 79


## Generate dataset.json

In [None]:
from nnunetv2.dataset_conversion.generate_dataset_json import generate_dataset_json

channel_names = {0: "T2"}
labels = {"background": 0, "prostate": 1}
# Reuse the training-case count computed by the split cell instead of a
# hard-coded number, so the JSON stays in sync with the copied files.
num_training_cases = nTrain
file_ending = ".nii.gz"
overwrite_image_reader_writer = "SimpleITKIO"
generate_dataset_json(
    str(nnUNetDatasetPath),
    channel_names=channel_names,
    labels=labels,
    num_training_cases=num_training_cases,
    file_ending=file_ending,
    overwrite_image_reader_writer=overwrite_image_reader_writer,
)

# Construct Lesion Dataset

## Define Path

In [27]:
# Raw string literal: the Windows path contains backslashes, and "\M" / "\L"
# are invalid escape sequences in a normal string literal.
sourcePath = Path(r"D:\Medical Image - Research\Lesion Segmentation\MRI with tumor label organized")
sourceFM = FolderMg(sourcePath)
sourceFM.ls()

# Target folder for the reorganised lesion data, created next to the source.
sortDataPath = sourcePath.parent.joinpath("LesionDataset")
sortDataPath.mkdir(parents=True, exist_ok=True)


Current Folder 'MRI with tumor label organized' contains 78 folders, which are:
  - 付国胜
  - 付平
  - 何凤杰
  - 刘光荣
  - 刘卫国
  - ...

Current Folder 'MRI with tumor label organized' contains 342 files, which are:
  - Cai YX-ADC(1).nii
  - Cai YX-ADC.nii
  - Cai YX-T2(1).nii
  - Cai YX-T2.nii
  - Cao K-ADC(1).nii
  - ...


## Copy Data to New Folder and Prune

### Convert Chinese Folder to English Folder
e.g. 付平 will be Fu P

In [52]:
import pinyin

# Maps the generated English folder name to the source (Chinese-named) folder.
name_cn_en_map = {}
for d in tqdm.tqdm(sourceFM.dirs):
    # Romanise the folder name: capitalised first syllable plus the initials
    # of the remaining syllables, e.g. 付平 becomes "Fu P".
    patient_name = pinyin.get(d.stem, format="strip", delimiter=" ").split(" ")
    surname = patient_name[0].capitalize()
    name = "".join(n.capitalize()[0] for n in patient_name[1:] if n != "")
    new_d_name = f"{surname} {name}"
    if new_d_name in name_cn_en_map:
        print(f"Duplicate name found, {d.stem} with {name_cn_en_map[new_d_name].stem}")
        # Disambiguate with the first free 4-digit counter. Unlike a random
        # suffix this is reproducible and cannot collide with an existing key.
        base_d_name = new_d_name
        suffix = 1
        new_d_name = f"{base_d_name}-{str(suffix).zfill(4)}"
        while new_d_name in name_cn_en_map:
            suffix += 1
            new_d_name = f"{base_d_name}-{str(suffix).zfill(4)}"
    name_cn_en_map[new_d_name] = d

    new_d = sortDataPath.joinpath(new_d_name)
    new_d.mkdir(parents=True, exist_ok=True)

    # Descend through folders that hold nothing but a single subfolder, then
    # copy the content of the first folder that has files or several subfolders.
    current_mg = FolderMg(d)
    while current_mg.nDir == 1 and current_mg.nFile == 0:
        current_mg = FolderMg(current_mg.dirs[0])
    current_mg.copy_to(new_d)


  0%|          | 0/78 [00:00<?, ?it/s]

 36%|███▌      | 28/78 [00:02<00:04, 11.49it/s]

Duplicate name found, 张治忠 with 张忠智


100%|██████████| 78/78 [00:06<00:00, 12.47it/s]


### Copy label to new place

In [29]:
# Copy every top-level file of the source folder into the sorted dataset
# folder, leaving out the ones whose name contains "(1)".
for label_file in tqdm.tqdm(sourceFM.files):
    if "(1)" not in label_file.name:
        shutil.copy2(label_file, sortDataPath)

100%|██████████| 342/342 [00:03<00:00, 102.40it/s]


### Prune Folder

In [33]:
# Gather all "dcm" files below the sorted dataset folder.
sortDataFM = FolderMg(sortDataPath)
all_dicom_files = sortDataFM.get_file_recursive_by_extension("dcm")

# Delete the ones whose name contains "(1)".
for dicom_file in tqdm.tqdm(all_dicom_files):
    if "(1)" not in dicom_file.name:
        continue
    dicom_file.unlink()

# Re-create the folder manager after the deletion and list the folder again.
sortDataFM = FolderMg(sortDataPath)
sortDataFM.ls()
        


Current Folder 'LesionDataset' contains 77 folders, which are:
  - Cai YX
  - Cao K
  - Cao LY
  - Chen ZL
  - Cui PJ
  - ...

Current Folder 'LesionDataset' contains 171 files, which are:
  - Cai YX-ADC.nii
  - Cai YX-T2.nii
  - Cao K-ADC.nii
  - Cao K-T2.nii
  - Cao LY-ADC.nii
  - ...


100%|██████████| 4106/4106 [00:00<00:00, 1368657.09it/s]


Current Folder 'LesionDataset' contains 77 folders, which are:
  - Cai YX
  - Cao K
  - Cao LY
  - Chen ZL
  - Cui PJ
  - ...

Current Folder 'LesionDataset' contains 171 files, which are:
  - Cai YX-ADC.nii
  - Cai YX-T2.nii
  - Cao K-ADC.nii
  - Cao K-T2.nii
  - Cao LY-ADC.nii
  - ...





## Sort label to Groups
We have T2, ADC and DWI.

In [45]:
# file_groups maps a modality key (or a "_"-joined combination of modality
# keys) to the set of patient names that have a file for it.
file_groups = {}
groups_key = ["T2", "ADC", "DWI"]
combine_groups = ["T2_ADC", "T2_DWI", "T2_DWI_ADC"]

for f in tqdm.tqdm(sortDataFM.files):
    # Patient name is the part of the file stem before the first "-".
    patient_name = f.stem.split("-")[0]

    # A file counts for every modality key that occurs in its upper-cased name.
    for k in groups_key:
        if k in f.name.upper():
            file_groups.setdefault(k, set()).add(patient_name)

# find T2_ADC, T2_DWI, T2_DWI_ADC
for k in combine_groups:
    modality_sets = []
    for sub_k in k.split("_"):
        if sub_k not in file_groups:
            print(f"{sub_k} is not in file_groups")
        # A modality without any file contributes an empty set, so the
        # combination is empty instead of silently ignoring that modality.
        modality_sets.append(file_groups.get(sub_k, set()))
    # Patients that have every modality of the combination.
    file_groups[k] = set.intersection(*modality_sets)

for k in groups_key:
    # .get: a modality without any file is reported as 0.
    print(f"{k}: {len(file_groups.get(k, set()))}")
for k in combine_groups:
    print(f"{k}: {len(file_groups[k])}")

100%|██████████| 171/171 [00:00<00:00, 168679.68it/s]

T2: 76
ADC: 76
DWI: 17
T2_ADC: 75
T2_DWI: 16
T2_DWI_ADC: 16





In [49]:
# The patients with T2+DWI must be exactly those with T2+DWI+ADC; show the
# first five entries of each group.
assert file_groups["T2_DWI"] == file_groups["T2_DWI_ADC"]
for group_key in ("T2_DWI", "T2_DWI_ADC"):
    print(list(file_groups[group_key])[:5])

['Miao DZ', 'Wu WS', 'Liu GR', 'Zhang ZZ', 'Liu LJ']
['Miao DZ', 'Wu WS', 'Liu GR', 'Zhang ZZ', 'Liu LJ']
