In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import tqdm
import shutil
import SimpleITK as sitk
from pathlib import Path
from typing import List

from lib.folder.basic import FolderMg
from lib.folder.med import DicomImageFolderMg, DicomImageFolderMgITK

In [3]:
def findDuplicateFileName(file_list: List[Path]):
    file_names = []
    for f in file_list:
        f_name = f.name.split(".")[0]
        if f_name in file_names:
            print(f)
        else:
            file_names.append(f_name)
    print(f"Files should have {len(file_list)}, names have {len(file_names)}")
    return file_names

# Construct Prostate Dataset

In [3]:
sourceDataPath = Path(
    "D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData"
)
sourceDataMg = FolderMg(sourceDataPath)
sourceDataMg.ls()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\Medical Image - Research\\Clinical Data Prostate Segmentation Image Packages\\ReviewData'

## Get valid Data
1. according to the segmentation result, remove file with & in the name.
2. find the original file according to valid segmentation result file

In [None]:
validSegmentationFiles = []
validOriginalFiles = []
for d in sourceDataMg.dirs:
    print(d.name)
    tempOgMg = FolderMg(d)
    # tempOgMg.ls()
    tempSegMg = FolderMg(d.joinpath("Segmentation"))
    # tempSegMg.ls()

    useableFile = [f for f in tempSegMg.files if "&" not in f.name]
    for i, segf in enumerate(useableFile):
        segf_name = segf.name.split(".")[0]
        matchOgFiles = []
        for ogf in tempOgMg.files:
            ogf_name = ogf.stem
            if segf_name == ogf_name:
                matchOgFiles.append(ogf)
        # matchOgFiles = [ ogf for ogf in tempOgMg.files if segf_name in ogf.name and ogf.name in segf_name]
        if len(matchOgFiles) != 1:
            print(segf, matchOgFiles)
            break
        validOriginalFiles.extend(matchOgFiles)
    validSegmentationFiles.extend(useableFile)
assert len(validOriginalFiles) == len(validSegmentationFiles)

DataPack1
DataPack2
DataPack3
DataPack4
DataPack5
DataPack6
DataPack7
DataPack8
DataPack9
DataPack10
DataPack11
DataPack12
DataPack13
DataPack14


## Filter out the same size original data

In [None]:
sizeDict = {}
for i, f in enumerate(validOriginalFiles):
    key = f.stat().st_size
    if key in sizeDict:
        duplicatesFiles = sourceDataPath.parent.joinpath("dup", sizeDict[key].stem)
        if not duplicatesFiles.exists():
            Path.mkdir(duplicatesFiles, parents=True)
        shutil.copy2(f, duplicatesFiles)
        shutil.copy2(sizeDict[key], duplicatesFiles)
        # print(key, sizeDict[key], f)
    else:
        sizeDict[key] = f

In [None]:
len(sizeDict)

99

## Shuffle the original data

In [None]:
ogFiles = list(sizeDict.values())
random.shuffle(ogFiles)
print(list(sizeDict.values())[0])
print(ogFiles[0])
print(len(ogFiles))
ogFilesNames = findDuplicateFileName(ogFiles)
ogFilesNames[0]

D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData\DataPack1\FollowMR_t2_tse_fdixon_tra_p2_256_in_RD_UT20211224151916.mha
D:\Medical Image - Research\Clinical Data Prostate Segmentation Image Packages\ReviewData\DataPack8\MainMR_t2_tse_fdixon_tra_p2_256_W_RD_UT20211224151916.mha
99
Files should have 99, names have 99


'MainMR_t2_tse_fdixon_tra_p2_256_W_RD_UT20211224151916'

## Find its corrsponding segmentation result given new original data 

In [None]:
ogSegPair = {}
for ogf in ogFiles:
    ogf_name = ogf.name.split(".")[0]
    matchFiles = []
    for segf in validSegmentationFiles:
        segf_name = segf.name.split(".")[0]
        if ogf_name == segf_name:
            matchFiles.append(segf)
            # print(ogf_name, segf_name)
    if len(matchFiles) != 1:
        print(ogf, matchFiles)
        break
    ogSegPair[ogf_name] = [ogf, matchFiles[0]]
len(ogSegPair)

99

## Construct Dataset and save to folder
convert .mha and .nrrd to nii.gz

In [None]:
datasetPath = sourceDataPath.parent.joinpath(f"Dataset078_UltrastProstate")
if not datasetPath.exists():
    datasetPath.mkdir(parents=True)
segmentationPath = datasetPath.joinpath("labels")
if not datasetPath.exists():
    datasetPath.mkdir(parents=True)
if not segmentationPath.exists():
    segmentationPath.mkdir(parents=True)

for k, v in ogSegPair.items():
    ogf, segf = v[0], v[1]
    ogf_name = ogf.name.split(".")[0]
    segf_name = segf.name.split(".")[0]
    if ogf_name != segf_name:
        print(ogf_name, segf_name)
        break
    
    # shutil.copy2(ogf, datasetPath)    
    shutil.copy2(segf, segmentationPath)
    outFileName = datasetPath.joinpath(f"{ogf.stem}.nii.gz")
    img = sitk.ReadImage(ogf)
    sitk.WriteImage(img, fileName=outFileName)
print("Dataset is done")

Dataset is done


## Convert Dataset to nnUNet structure

In [None]:
trainRatio = 0.8
nnUNetDatasetPath = datasetPath.parent.joinpath(datasetPath.name + f"{trainRatio}")
imageTrFolder = nnUNetDatasetPath.joinpath("imagesTr")
imageTsFolder = nnUNetDatasetPath.joinpath("imagesTs")
labelTrFolder = nnUNetDatasetPath.joinpath("labelsTr")
labelTsFolder = nnUNetDatasetPath.joinpath("labelsTs")
if not nnUNetDatasetPath.exists():
    nnUNetDatasetPath.mkdir(parents=True)
    imageTrFolder.mkdir(parents=True)
    imageTsFolder.mkdir(parents=True)
    labelTrFolder.mkdir(parents=True)
    labelTsFolder.mkdir(parents=True)
    
datasetFolderMg = FolderMg(datasetPath)
segmentationMg = FolderMg(segmentationPath)

nFile = datasetFolderMg.nFile
nTrain = int(nFile * 0.8)
for i, (ogf, segf) in enumerate(zip(datasetFolderMg.files, segmentationMg.files)):
    ogf_name = ogf.name.split(".")[0]
    segf_name = segf.name.split(".")[0]
    assert ogf_name == segf_name
    new_ogf_stem = f"Ultrast_{str(i).zfill(3)}_0000"
    new_segf_stem = f"Ultrast_{str(i).zfill(3)}"
    ogf_extension = "".join(ogf.suffixes)
    segf_extension = "".join(segf.suffixes)
    # print(new_ogf_stem, new_segf_stem, ogf_extension, segf_extension, new_ogf, new_segf)
    if i < nTrain:
        new_ogf = imageTrFolder.joinpath(new_ogf_stem + ogf_extension)
        new_segf = labelTrFolder.joinpath(new_segf_stem + segf_extension)
        shutil.copy2(ogf, new_ogf)
        shutil.copy2(segf, new_segf)
    else:
        new_ogf = imageTsFolder.joinpath(new_ogf_stem + ogf_extension)
        new_segf = labelTsFolder.joinpath(new_segf_stem + segf_extension)
        shutil.copy2(ogf, new_ogf)
        shutil.copy2(segf, new_segf)

print(nFile, nTrain)

99 79


## Generate dataset.json

In [None]:
from nnunetv2.dataset_conversion.generate_dataset_json import generate_dataset_json

channel_names = { 0: "T2"}
labels = {'background':0, 'prostate':1}
num_training_cases = 79
file_ending = ".nii.gz"
overwrite_image_reader_writer = "SimpleITKIO"
generate_dataset_json(str(nnUNetDatasetPath), channel_names=channel_names, labels=labels,num_training_cases=num_training_cases,file_ending=file_ending,overwrite_image_reader_writer=overwrite_image_reader_writer)

# Construct Lesion Dataset

## Define Path

In [8]:
sourcePath = Path("/home/ultrast/LesionSegmentation/")

In [None]:
sourceData = sourcePath.joinpath("MRI with tumor label organized")
sourceFM = FolderMg(sourceData)
sourceFM.ls()

In [9]:
sortDataPath = sourcePath.joinpath("LesionDataset")
if not sortDataPath.exists():
    sortDataPath.mkdir(parents=True)

name_cn_en_map = {}
duplicate_name = []
sortDataFM = FolderMg(sortDataPath)
sortDataFM.nDir, sortDataFM.nFile

(76, 166)

## Copy Data to New Folder and Prune

### Convert Chinese Folder to English Folder (if Prune before can skip)
e.g. 付平 will be Fu P

In [25]:
import pinyin
for d in tqdm.tqdm(sourceFM.dirs):
    patient_name = pinyin.get(d.stem, format="strip", delimiter=" ").split(" ")
    surname = patient_name[0].capitalize()
    name = [n.capitalize()[0] for n in patient_name[1:] if n != ""]
    name = "".join(name)
    new_d_name = f"{surname} {name}"
    if new_d_name in name_cn_en_map.keys():
        print(f"Duplicate name found, {d.stem} with {name_cn_en_map[new_d_name].stem}")
        duplicate_name.append(new_d_name)
        new_d_name = f"{new_d_name} - {random.randint(0, 100)}"
    name_cn_en_map[new_d_name] = d

    new_d = sortDataPath.joinpath(new_d_name)
    if not new_d.exists():
        new_d.mkdir(parents=True)

    current_mg = FolderMg(d)
    while(current_mg.nDir == 1 and current_mg.nFile == 0):
        current_mg = FolderMg(current_mg.dirs[0])
    current_mg.copy_to(new_d)  

for f in tqdm.tqdm(sourceFM.files):
    shutil.copy2(f, sortDataPath)


 36%|███▌      | 28/78 [00:02<00:04, 11.54it/s]

Duplicate name found, 张治忠 with 张忠智


100%|██████████| 78/78 [00:07<00:00,  9.79it/s]
100%|██████████| 342/342 [00:09<00:00, 37.94it/s]


### Prune Folder and Files (if pruned can skip)

In [26]:
sortDataFM = FolderMg(sortDataPath)
all_dicom_files = sortDataFM.get_file_recursive_by_extension("dcm")

for f in tqdm.tqdm(all_dicom_files):
    if "(1)" in f.name or "(2)" in f.name:
        f.unlink()

for f in tqdm.tqdm(sortDataFM.files):
    if "(1)" in f.name or "(2)" in f.name:
        f.unlink()
        

100%|██████████| 8212/8212 [00:00<00:00, 15869.30it/s]
100%|██████████| 342/342 [00:00<00:00, 1611.37it/s]


In [10]:
sortDataFM = FolderMg(sortDataPath)
sortDataFM.nDir, sortDataFM.nFile

(76, 166)

### Handle Duplicated Name (if pruned can skip)

In [28]:
if len(duplicate_name) > 0:
    print("Duplicate name found")
    for n in duplicate_name:
        for f in sortDataFM.files:
            if n in f.stem:
                f.unlink()
                print(f"File {f.stem} is deleted")
        for d in sortDataFM.dirs:
            if n in d.stem:
                shutil.rmtree(d)
                print(f"Folder {d.stem} is deleted")
sortDataFM = FolderMg(sortDataPath)
sortDataFM.nDir, sortDataFM.nFile

Duplicate name found
File Zhang ZZ-ADC is deleted
File Zhang ZZ-T2 is deleted
File Zhang ZZ-sDWI is deleted
Folder Zhang ZZ is deleted
Folder Zhang ZZ - 48 is deleted


(76, 166)

## Sort label to Groups
we have t2, adc and dwi

In [41]:
modality_name_map = {}
groups_key = ["T2", "ADC", "DWI"]
combine_groups = ["T2_ADC", "T2_DWI", "T2_DWI_ADC"]

for f in tqdm.tqdm(sortDataFM.files):
    patient_name = f.stem.split('-')[0].upper()
    
    for k in groups_key:
        if k in f.name.upper():
            modality_name_map.setdefault(k, set()).add(patient_name)


# find T2_ADC, T2_DWI, T2_DWI_ADC
for k in combine_groups:
    file_for_key = set()
    
    list_of_set = []
    for sub_k in k.split("_"):
        if sub_k not in modality_name_map:
            print(f"{sub_k} is not in file_groups")
            continue
        list_of_set.append(modality_name_map[sub_k])
    
    if len(list_of_set) == 0:
        continue
    file_for_key = list_of_set[0]
    for i in range(1, len(list_of_set)):
        file_for_key = file_for_key.intersection(list_of_set[i])
    modality_name_map[k] = set(file_for_key)



for k in groups_key:
    print(f"{k}: {len(modality_name_map[k])}")
for k in combine_groups:
    print(f"{k}: {len(modality_name_map[k])}")

100%|██████████| 166/166 [00:00<00:00, 478854.51it/s]

T2: 75
ADC: 75
DWI: 16
T2_ADC: 74
T2_DWI: 15
T2_DWI_ADC: 15





In [45]:
for d in sortDataFM.dirs:
    patient_name = d.stem.upper()
    if patient_name not in modality_name_map["T2"]:
        print(f"{patient_name} is not in T2 group")
    if patient_name not in modality_name_map["ADC"]:
        print(f"{patient_name} is not in ADC group")

assert modality_name_map["T2_DWI"] == modality_name_map["T2_DWI_ADC"]
print(list(modality_name_map["T2_DWI"])[:5])
print(list(modality_name_map["T2_DWI_ADC"])[:5])

print(modality_name_map["T2"].difference(modality_name_map["ADC"]))
print(modality_name_map["ADC"].difference(modality_name_map["T2"]))
        

CHEN ZL is not in T2 group
YAN HX is not in ADC group
['WEI L', 'FU P', 'WU WS', 'MENG ZK', 'TIAN HY']
['DUAN ZW', 'WEI L', 'FU P', 'WU WS', 'MENG ZK']
{'YAN HX'}
{'CHEN ZL'}


### Find name to files path matches

In [46]:
name_modality_map = {}
for k in modality_name_map.keys():
    for name in modality_name_map[k]:
        name_modality_map.setdefault(name,set()).add(k)

In [55]:
name_folder_map = {}
for d in sortDataFM.dirs:
    if d.name.upper() in name_modality_map.keys():
        name_folder_map[d.name.upper()] = d.name

In [56]:
name_file_group = {}
for d in tqdm.tqdm(sortDataFM.dirs):
    patient_name = d.stem.upper()
    for f in sortDataFM.files:
        f_patient_name, modality = f.stem.split("-")
        if f_patient_name.upper() == patient_name:
            name_file_group.setdefault(patient_name, {}).setdefault(modality, []).append(f)

100%|██████████| 76/76 [00:00<00:00, 7748.16it/s]


## Build a T2 only dataset

## create a source folder and put original Data inside

In [57]:
T2sourcePath = sortDataPath.parent.joinpath("T2LesionDataset")
if not T2sourcePath.exists():
    T2sourcePath.mkdir(parents=True)

In [63]:
for name in tqdm.tqdm(modality_name_map["T2"]):
    files = name_file_group[name]["T2"]
    dicom_folder_mg = DicomImageFolderMgITK(sortDataPath.joinpath(name_folder_map[name]))
    print(sortDataPath.joinpath(name).absolute())
    for dcm in dicom_folder_mg.dicomSeries:
        print(dcm.necessaryTagsValue.keys())
        break
    break

  0%|          | 0/75 [00:00<?, ?it/s]

- 1.3.46.670589.11.78356.5.0.2084.2023063009304801000 is a dicom series
- 1.3.46.670589.11.78356.5.0.6536.2023063009194753937 is a dicom series
read all dicom series
- 1.3.46.670589.11.78356.5.0.2084.2023063009304801000 read


  0%|          | 0/75 [00:00<?, ?it/s]

- 1.3.46.670589.11.78356.5.0.6536.2023063009194753937 read
/home/ultrast/LesionSegmentation/LesionDataset/ZHANG LS
dict_keys([])





In [68]:
dcm.image.HasMetaDataKey("(0010|0020)")
dcm.metaData

('ITK_non_uniform_sampling_deviation',)

## Build a T2 and ADC dataset