In [2]:
import os
import sys
sys.path.append(r"/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/Brain_Mets_Classification")

from tqdm import tqdm
from datetime import datetime
import shutil
import pandas as pd
from dateutil.relativedelta import relativedelta
from nipype.interfaces.dcm2nii import Dcm2niix

import brain_mets_classification.config as config
import brain_mets_classification.custom_funcs as funcs
import brain_mets_classification.preprocessing_funcs as preprocessing

In [2]:
# Root directory that holds the raw "Anonymized - <ID>" patient folders.
path_to_folder = "{}/originalPatientFiles".format(config.path_to_ssd)
#path_to_folder = f"{config.path_to_n30}/RgbBrainMetsSampleN30"

# Placeholder; Step 0 overwrites this with the folder it creates.
pathToCleanMRIList = str()

### Step 0: Removes sequences based on the blacklist

In [3]:
#Example file structure
#├── Anonymized - 01005097
#│   └── Mrt Body
#│       ├── Diffusion trace tra schnell_ADC - 8
#│       │   ├── IM-2330-0001-0001.dcm
#│       │   ├── IM-2330-0002-0001.dcm
#...
#│       │   ├── IM-2469-0026-0001.dcm
#│       │   └── IM-2469-0027-0001.dcm
#│       ├── Diffusion trace tra schnell_TRACEW - 7
#│       │   ├── IM-2329-0001-0001.dcm
#│       │   ├── IM-2329-0002-0001.dcm
#...
#│       │   ├── IM-2468-0026-0001.dcm
#│       │   └── IM-2468-0027-0001.dcm
#│       ├── T1 mp-rage3d we sag 1mm KM - 13
#│       │   ├── IM-2335-0001-0001.dcm
#│       │   ├── IM-2335-0002-0001.dcm
#...
#│       │   ├── IM-2474-0159-0001.dcm
#│       │   └── IM-2474-0160-0001.dcm
#├── Anonymized - 12345678
#...

# Lower-case substrings; a sequence folder whose lower-cased name contains any
# of them is NOT copied into the cleaned dataset.
blackList = ["auswertung_fmrt",
             "fmri",
             "thorax",
             "lws", "hws", "bws", "ws",
             "hand",
             "posdisp",
             "cor", "sag",
             "cest",
             "ciss",
             "dti",
             "evidence", "reading",
             "field",
             "evaseries",
             "ct",
             "lokalizer", "localizer",
             "mip", "protocol", "resolve", "results", "screen save", "sub", "svs", "tof", "mean_", "leakage", "lunge"]

# Creates a new directory for all the patient folders
pathToCleanMRIList = funcs.createNewPreprocessingStepFolder("0_blacklist")

# Goes through list of files/folders at path_to_folder and only adds the directories to the list
folderList = [
    folder for folder in os.listdir(path_to_folder) if os.path.isdir(os.path.join(path_to_folder, folder))
]

patientIDs = []

sequencesList = []


# Loops through all the "Anonymized - #######" folders
for patient_folder in tqdm(folderList):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    # all folders are named like "Anonymized - 12345678"
    patientID = patient_folder.split(" - ")[1]

    # adds the patientID to the list patientIDs if it hasn't been added before
    if patientID not in patientIDs:
        patientIDs.append(patientID)

    path_to_MRI_session_folders = os.path.join(path_to_folder, patient_folder)
    MRI_session_folders = os.listdir(path_to_MRI_session_folders)

    # loops through the different MRI sessions
    for mri_session in MRI_session_folders:

        # ignores the ds_folders
        if config.dsStore in mri_session:
            continue

        # get a list of all the sequences recorded during this MRI session
        path_to_mri_sequences = os.path.join(path_to_MRI_session_folders, mri_session)
        mri_sequences = os.listdir(path_to_mri_sequences)

        # loops through the different sequences created during each MRI session
        for sequence in mri_sequences:

            # ignores the ds_folders
            if config.dsStore in sequence:
                continue

            sequences_lower_cased = sequence.lower()

            # skip every sequence whose name contains a blacklisted substring
            if any(blackListSeq in sequences_lower_cased for blackListSeq in blackList):
                continue

            # create folder for patient
            funcs.createFolderForPatient(
                path = pathToCleanMRIList,
                patientID = patientID
            )

            pathToPatient = os.path.join(pathToCleanMRIList, patientID)
            folderName = f"{patientID}_{sequence}"
            path_to_sequence = os.path.join(pathToPatient, folderName)

            # The same sequence name can occur several times for one patient
            # (e.g. in different sessions). Probe for the first free name:
            # "<name>", "<name>2", "<name>3", ... Counting exact matches in
            # os.listdir() can only ever yield 0 or 1, so a third occurrence
            # used to target "<name>2" again and os.mkdir raised FileExistsError.
            suffix = 2
            while os.path.exists(path_to_sequence):
                path_to_sequence = os.path.join(pathToPatient, f"{folderName}{suffix}")
                suffix += 1
            os.mkdir(path_to_sequence)

            # get list of all the dicom files of this sequence
            path_to_original_sequence = os.path.join(path_to_mri_sequences, sequence)
            dicomFiles = os.listdir(path_to_original_sequence)

            # loops through the list of dicom files
            for dicomFile in dicomFiles:
                # ignores the ds_folders
                if config.dsStore in dicomFile:
                    continue

                # copy each file individually into the path_to_sequence folder
                shutil.copyfile(os.path.join(path_to_original_sequence, dicomFile), os.path.join(path_to_sequence, dicomFile))



#             sequencesList.append(sequences_lower_cased)

# cleanList = list(dict.fromkeys(sorted(sequencesList)))

# minusNumberList = []

# for sequence in cleanList:
#     sequenceWithoutNumber = sequence.split(" - ")[0]
#     minusNumberList.append(sequenceWithoutNumber)

# cleanMinusNumber = list(dict.fromkeys(minusNumberList))









# removedSequences = []

# for sequence in cleanMinusNumber:
#     if not any(backListSequ in sequence for backListSequ in blackList): #and any(whiteListSeq in sequence for whiteListSeq in whiteList):
#         print(sequence)
#     else:
#         removedSequences.append(sequence)

# print(f"\n\nRemoved Sequences: {len(removedSequences)}")
# for removedSequence in removedSequences:
#     print(removedSequence)

100%|██████████| 373/373 [07:29<00:00,  1.21s/it]


### Step 0.5: Removes the patients with fewer than 4 sequences from the previously created folder

In [4]:
# Patients with fewer sequence folders than this are removed from the Step 0 output.
minimumSequencesPerPatient = 4

filterFolderList = [
    folder for folder in os.listdir(pathToCleanMRIList) if os.path.isdir(os.path.join(pathToCleanMRIList, folder))
]

# loops through all the "12345678" folders
for patient_folder in tqdm(filterFolderList):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    path_to_patient = os.path.join(pathToCleanMRIList, patient_folder)

    # Only directories count as sequences; stray files such as .DS_Store
    # must not keep an otherwise too-small patient in the dataset.
    mri_sessions = [
        entry for entry in os.listdir(path_to_patient) if os.path.isdir(os.path.join(path_to_patient, entry))
    ]

    if len(mri_sessions) < minimumSequencesPerPatient:
        # shutil.rmtree instead of `rm -r` through a shell: the path is passed
        # as-is (spaces or shell metacharacters cannot break or redirect the
        # command) and a failure raises instead of being silently ignored.
        shutil.rmtree(path_to_patient)

100%|██████████| 369/369 [00:01<00:00, 247.42it/s]


In [3]:
#print(pathToCleanMRIList)
# Hard-coded location of a Step 0 output folder (presumably from an earlier
# run, so that Step 0 does not have to be executed again).
_allPatientsDirectory = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients"
pathToCleanMRIList = _allPatientsDirectory + "/Rgb_Brain_Mets_Preprocessing_0_blacklist_20240120-155725"

## Step 1: Filters the different Sequences

In [4]:
pathToPreprocess1 = funcs.createNewPreprocessingStepFolder("01_sequencesFiltered")

# Lower-case substrings used to recognise the sequence type from the folder name.
T1List = ["t1"]
T2List = ["t2", "flair"]
T2SternList = ["stern", "hemo", "blutung", "*", "hämo"]
FLAIRList = ["flair"]
KMList = ["km"]
ADCList = ["adc"]
DWIList = ["diffusion", "diff", "adc", "dwi"]
MPRList = ["mpr"]

# Attribute names of config.desiredSequences, in the order used for the DataFrame columns.
sequenceTypeNames = ["T1", "T1CE", "T2", "FLAIR", "STERN", "DWI", "ADC", "MPR"]


def _containsAny(name, keywords):
    """Return True if at least one of `keywords` is a substring of `name`."""
    return any(keyword in name for keyword in keywords)


def _classifySequence(sequence_lower_cased):
    """Map a lower-cased sequence name to its sequence type.

    Returns one of the names in `sequenceTypeNames`, or None if the sequence
    matches no keyword list and therefore gets rejected. The checks are ordered:
    T1 (T1CE when a contrast keyword is present) comes before T2 (FLAIR, then
    STERN, then plain T2), then diffusion (ADC before DWI), then MPR.
    """
    if _containsAny(sequence_lower_cased, T1List):
        return "T1CE" if _containsAny(sequence_lower_cased, KMList) else "T1"
    if _containsAny(sequence_lower_cased, T2List):
        if _containsAny(sequence_lower_cased, FLAIRList):
            return "FLAIR"
        if _containsAny(sequence_lower_cased, T2SternList):
            return "STERN"
        return "T2"
    if _containsAny(sequence_lower_cased, DWIList):
        return "ADC" if _containsAny(sequence_lower_cased, ADCList) else "DWI"
    if _containsAny(sequence_lower_cased, MPRList):
        return "MPR"
    return None


# list of patientIDs
folderPreprocess0List = [
    folder for folder in os.listdir(pathToCleanMRIList) if os.path.isdir(os.path.join(pathToCleanMRIList, folder))
]

# per-patient columns for the DataFrame (one entry per patient)
dfpatientIDs = [] # list of strings (patient folder names)
dfT1_sequences = [] # list of lists of strings
dfT1_amount = [] # list of ints
dfT1CE_sequences = [] # list of lists of strings
dfT1CE_amount = [] # list of ints
dfT2_sequences = [] # list of lists of strings
dfT2_amount = [] # list of ints
dfFLAIR_sequences = [] # list of lists of strings
dfFLAIR_amount = [] # list of ints
dfSTERN_sequences = [] # list of lists of strings
dfSTERN_amount = [] # list of ints
dfDWI_sequences = [] # list of lists of strings
dfDWI_amount = [] # list of ints
dfADC_sequences = [] # list of lists of strings
dfADC_amount = [] # list of ints
dfMPR_sequences = [] # list of lists of strings
dfMPR_amount = [] # list of ints
dfrejected_sequences = [] # list of lists of strings
dfrejected_amount = [] # list of ints
df_has_rejected = [] # list of bools
df_has_duplicates = [] # list of bools

dfPatients = [[]]

# Loops through all the "#######" folders
for patient_folder in tqdm(folderPreprocess0List):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    patientID = patient_folder

    # one list of original sequence names per sequence type
    T1_sequences = []
    T1CE_sequences = []
    T2_sequences = []
    FLAIR_sequences = []
    STERN_sequences = []
    DWI_sequences = []
    ADC_sequences = []
    MPR_sequences = []
    list_of_sequences = [T1_sequences, T1CE_sequences, T2_sequences, FLAIR_sequences, STERN_sequences, DWI_sequences, ADC_sequences, MPR_sequences]
    sequencesByType = dict(zip(sequenceTypeNames, list_of_sequences))

    rejected_sequences = []

    path_to_patient_sequences = os.path.join(pathToCleanMRIList, patient_folder)
    mri_sequences = os.listdir(path_to_patient_sequences)

    # loop through the sequences for each patient
    for sequence in mri_sequences:

        # ignores the ds_folders; ".DS_Store" contains an underscore, so without
        # this check it would be recorded as a rejected sequence named "Store"
        if config.dsStore in sequence:
            continue

        # only get the sequence name, folder names: "12345678_SEQUENCENAME"
        sequence_name = sequence.split("_", 1)[1]
        sequence_lower_cased = sequence_name.lower()

        # create folder for patient
        funcs.createFolderForPatient(
            path = pathToPreprocess1,
            patientID = patientID
        )

        sequenceType = _classifySequence(sequence_lower_cased)

        if sequenceType is None:
            # file got rejected
            rejected_sequences.append(sequence_name)
            continue

        sequences_of_type = sequencesByType[sequenceType]

        # create new folder in preprocessing1/patientID for this sequence
        path_to_sequence = funcs.createSequenceFolder(
            path = f"{pathToPreprocess1}/{patientID}",
            patientID = patientID,
            sequence = getattr(config.desiredSequences, sequenceType).value,
            sequence_list = sequences_of_type,
            original_sequence_name = sequence_name)

        # appended only after the folder was created, so the helper sees the
        # sequences of this type that were handled before the current one
        sequences_of_type.append(sequence_name)

        # copy files to new folder
        funcs.copyFilesFromDirectoryToNewDirectory(
            path_to_original_directory = os.path.join(path_to_patient_sequences, sequence),
            path_to_new_directory = path_to_sequence)

    # collect the per-patient values for the DataFrame
    dfpatientIDs.append(patientID)

    dfT1_sequences.append(T1_sequences)
    dfT1_amount.append(len(T1_sequences))

    dfT1CE_sequences.append(T1CE_sequences)
    dfT1CE_amount.append(len(T1CE_sequences))

    dfT2_sequences.append(T2_sequences)
    dfT2_amount.append(len(T2_sequences))

    dfFLAIR_sequences.append(FLAIR_sequences)
    dfFLAIR_amount.append(len(FLAIR_sequences))

    dfSTERN_sequences.append(STERN_sequences)
    dfSTERN_amount.append(len(STERN_sequences))

    dfDWI_sequences.append(DWI_sequences)
    dfDWI_amount.append(len(DWI_sequences))

    dfADC_sequences.append(ADC_sequences)
    dfADC_amount.append(len(ADC_sequences))

    dfMPR_sequences.append(MPR_sequences)
    dfMPR_amount.append(len(MPR_sequences))

    dfrejected_sequences.append(rejected_sequences)
    dfrejected_amount.append(len(rejected_sequences))

    # has duplicates: more than one sequence of the same type
    has_duplicates = any(len(sequenceList) > 1 for sequenceList in list_of_sequences)
    df_has_duplicates.append(has_duplicates)

    # has rejected: at least one sequence matched no keyword list
    has_rejected = bool(rejected_sequences)
    df_has_rejected.append(has_rejected)

    #dfPatients.append([len(T1_sequences), T1_sequences, len(T1CE_sequences), T1CE_sequences, len(T2_sequences), T2_sequences, len(FLAIR_sequences), FLAIR_sequences, len(STERN_sequences), STERN_sequences, len(DWI_sequences), DWI_sequences, len(ADC_sequences), ADC_sequences, len(rejected_sequences), rejected_sequences, has_rejected, has_duplicates])
    

100%|██████████| 319/319 [29:58<00:00,  5.64s/it]


#### Creates a Dataframe for the patients

In [6]:
# create patient Dataframe

# (column label, per-patient values) pairs in the order the columns should appear.
# "ADC sequences" was previously misspelled "ADC seqeunces", which did not match
# the column name used for the DataFrame rebuilt from the first search.
patientColumns = [
    ("T1 amount", dfT1_amount),
    ("T1 sequences", dfT1_sequences),
    ("T1CE amount", dfT1CE_amount),
    ("T1CE sequences", dfT1CE_sequences),
    ("T2 amount", dfT2_amount),
    ("T2 sequences", dfT2_sequences),
    ("FLAIR amount", dfFLAIR_amount),
    ("FLAIR sequences", dfFLAIR_sequences),
    ("STERN amount", dfSTERN_amount),
    ("STERN sequences", dfSTERN_sequences),
    ("DWI amount", dfDWI_amount),
    ("DWI sequences", dfDWI_sequences),
    ("ADC amount", dfADC_amount),
    ("ADC sequences", dfADC_sequences),
    ("MPR amount", dfMPR_amount),
    ("MPR sequences", dfMPR_sequences),
    ("rejected amount", dfrejected_amount),
    ("rejected sequences", dfrejected_sequences),
    ("has rejected", df_has_rejected),
    ("has duplicates", df_has_duplicates),
]

# One row per patient, indexed by patient ID.
patientsDataFrame = pd.DataFrame(
    list(zip(*(values for _, values in patientColumns))),
    columns = [label for label, _ in patientColumns],
    index = dfpatientIDs)

saves the Dataframe as a .csv file

In [7]:
# Persist the per-patient overview next to the data on the SSD.
patientsDataFrame.to_csv(
    path_or_buf = f"{config.path_to_ssd}/patientsDataframe.csv"
)

get the Dataframe from the .csv file

In [3]:
# The patient IDs were written as the DataFrame index, i.e. as the first,
# unnamed CSV column, which pandas labels "Unnamed: 0" when reading. Read that
# column as text (IDs such as "01199093" would otherwise lose their leading zero
# by being parsed as integers) and restore it as the index, so that `.index`
# yields patient IDs again instead of row numbers.
# Note: the list-valued "... sequences" columns come back as plain strings.
patientsDataFrame = (
    pd.read_csv(f"{config.path_to_ssd}/patientsDataframe.csv", dtype = {"Unnamed: 0": str})
    .set_index("Unnamed: 0")
    .rename_axis(None)
)

Display statistics about the patients dataframe

In [8]:
# To-Do:
# get amount of patients with T1, T1CE, T2, FLAIR sequences
# get amount of patients with T1, T1CE, T2, FLAIR + STERN sequences
# get amount of patients with T1, T1CE, T2, FLAIR + STERN + DWI + ADC sequences
# get patient IDs of patients with duplicates
# get patient IDs of patients with rejections
# check rejected files

# Query fragments: the four core sequences plus the optional extras.
hasFourSequences = "`T1 amount` > 0 and `T1CE amount` > 0 and `T2 amount` > 0 and `FLAIR amount` > 0"
hasStern = "`STERN amount` > 0"
hasDiffusion = "`DWI amount` > 0 and `ADC amount` > 0"

# patients with T1, T1CE, T2, FLAIR
patient4Seq = patientsDataFrame.query(hasFourSequences)
patientCount4Seq = len(patient4Seq)
print(f"Amount of patients with 4 sequences: {patientCount4Seq}")

# patients with T1, T1CE, T2, FLAIR + STERN
patientCount5Seq = len(patientsDataFrame.query(f"{hasFourSequences} and {hasStern}"))
print(f"Amount of patients with the 4 sequences + STERN: {patientCount5Seq}")

# patients with T1, T1CE, T2, FLAIR + DWI + ADC
patientCount6Seq = len(patientsDataFrame.query(f"{hasFourSequences} and {hasDiffusion}"))
print(f"Amount of patients with the 4 sequences + DWI + ADC: {patientCount6Seq}")

# patients with T1, T1CE, T2, FLAIR + STERN + DWI + ADC
patientCount7Seq = len(patientsDataFrame.query(f"{hasFourSequences} and {hasStern} and {hasDiffusion}"))
print(f"Amount of patients with the 4 sequences + STERN + DWI + ADC: {patientCount7Seq}")

# index labels of the patients that have more than one sequence of a type
duplicatesMask = patientsDataFrame["has duplicates"] == True
patientIDsWithDuplicates = patientsDataFrame.loc[duplicatesMask].index
print(f"List of patientIDs that have duplicates (#{len(patientIDsWithDuplicates)})")
for patientID in patientIDsWithDuplicates:
    print(patientID)


# index labels of the patients with at least one rejected sequence
rejectedMask = patientsDataFrame["has rejected"] == True
patientIDsWithRejectedSequences = patientsDataFrame.loc[rejectedMask].index
print(f"List of patientIDs that have rejected Sequences (#{len(patientIDsWithRejectedSequences)})")
for patientID in patientIDsWithRejectedSequences:
    print(patientID)

Amount of patients with 4 sequences: 282
Amount of patients with the 4 sequences + STERN: 205
Amount of patients with the 4 sequences + DWI + ADC: 273
Amount of patients with the 4 sequences + STERN + DWI + ADC: 202
List of patientIDs that have duplicates (#60)
01199093
01879950
02033082
02026964
02088565
01041137
01874079
01659187
01609293
02173158
01983233
01921604
01641960
01800439
01979317
01482000
02161647
02210001
01400779
01668799
01516702
01484016
01443624
01321873
01819252
01946271
01578955
01072344
01764802
02010452
01548397
02066445
01792771
02066814
02051037
01257796
01670714
01104996
02194539
01699532
01563052
01798755
01760947
01360726
02105939
01190670
01013277
01021714
02184584
01943022
01755816
01800184
01410317
02116290
02034046
02046093
02117549
02090584
02089657
01882989
List of patientIDs that have rejected Sequences (#51)
01199093
01456719
02033082
01789555
02020631
01041137
02180229
02173158
02120806
01117914
01673666
01983233
01147272
01641960
01800439
01905692


## Step 2: Remove " " from the folder names, as dcm2niix doesn't seem to work with spaces in the folder name

In [9]:
# To-Do:
# go through each patient folder and replace " " with "+"


#path_to_filtered_patient_files = os.path.join(config.path_to_n30, "/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/Brain_Mets_Classification/Rgb_Brain_Mets_Dataset/N30/Rgb_Brain_Mets_Preprocessing_01_sequencesFiltered_20240118-171818")
path_to_filtered_patient_files = pathToPreprocess1

# every directory directly below the Step 1 output is one patient
patient_folders = []
for entry in os.listdir(path_to_filtered_patient_files):
    if os.path.isdir(os.path.join(path_to_filtered_patient_files, entry)):
        patient_folders.append(entry)

for patient_folder in tqdm(patient_folders):

    # only handle real patient folders, not the ds_folders
    if config.dsStore not in patient_folder:

        patientID = patient_folder
        path_to_patient = os.path.join(path_to_filtered_patient_files, patient_folder)

        # the sequence folders of this patient
        sequences = []
        for entry in os.listdir(path_to_patient):
            if os.path.isdir(os.path.join(path_to_patient, entry)):
                sequences.append(entry)

        for sequence in sequences:

            # only handle real sequence folders, not the ds_folders
            if config.dsStore not in sequence:
                # same folder, same name, but every space becomes a "+"
                newFolderName = "+".join(sequence.split(" "))
                old_path = os.path.join(path_to_patient, sequence)
                new_path = os.path.join(path_to_patient, newFolderName)
                os.rename(old_path, new_path)

  0%|          | 0/319 [00:00<?, ?it/s]

100%|██████████| 319/319 [00:01<00:00, 271.86it/s]


## Number comparison between the first search and the second search

First Search:
- Amount of patients with 4 sequences: 282
- Amount of patients with the 4 sequences + STERN: **199**
- Amount of patients with the 4 sequences + DWI + ADC: 273
- Amount of patients with the 4 sequences + STERN + DWI + ADC: **197**
- List of patientIDs that have duplicates (#**67**)

Second Search:
- Amount of patients with 4 sequences: 282
- Amount of patients with the 4 sequences + STERN: **205**
- Amount of patients with the 4 sequences + DWI + ADC: 273
- Amount of patients with the 4 sequences + STERN + DWI + ADC: **202**
- List of patientIDs that have duplicates (#**63**)
- List of patientIDs that have rejected Sequences (#80)

To-Do:
1. [ ] Create two Dataframes and compare them with each other
2. [ ] Report to Quirin
3. [ ] Go to Preprocessing

In [14]:
# Inputs for comparing the first and the second search; all of them live in the
# same directory on the external drive.
_comparisonBaseDirectory = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients"

# CSV overview and preprocessed folders of the first search
path_to_1st_DF = f"{_comparisonBaseDirectory}/patientsDataframe0.csv"
path_to_1st_modified_Data = f"{_comparisonBaseDirectory}/Rgb_Brain_Mets_Preprocessing1_20240111-215606"
# CSV overview of the second search
path_to_2nd_DF = f"{_comparisonBaseDirectory}/patientsDataframe.csv"

In [13]:
# Load the CSV overviews of the first and the second search.
first_DF, second_DF = (
    pd.read_csv(csv_path) for csv_path in (path_to_1st_DF, path_to_2nd_DF)
)

Create a Dataframe based on the patient files

In [25]:
patient_folders = [
    folder for folder in os.listdir(path_to_1st_modified_Data) if os.path.isdir(os.path.join(path_to_1st_modified_Data, folder))
]

# Attribute names of config.desiredSequences, in DataFrame column order.
# They double as the prefixes of the column labels ("T1 amount", "T1 sequences", ...).
sequenceTypeOrder = ["T1", "T1CE", "T2", "FLAIR", "STERN", "DWI", "ADC", "MPR"]

patientIDList = []
patientsLists = []

for patient_folder in tqdm(patient_folders):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    # cannot read the rejected sequences as they're not in the folders

    patientID = patient_folder

    # sequence type value (as it appears in the folder name) -> sequence names
    sequencesByTypeValue = {
        getattr(config.desiredSequences, typeName).value: [] for typeName in sequenceTypeOrder
    }

    path_to_patient_sequences = os.path.join(path_to_1st_modified_Data, patient_folder)
    mri_sequences = os.listdir(path_to_patient_sequences)

    for sequence in mri_sequences:

        if config.dsStore in sequence:
            continue

        # each sequence is written like this "{patientID}_{sequenceType}_{amountOfSequence}_{NameOfSequence}"
        parts = sequence.split("_", 3)

        if len(parts) < 4:
            # malformed folder name: report it instead of failing with an IndexError
            print(f"Error: couldn't match sequence name {sequence} to sequence type")
            continue

        sequenceType = parts[1] # like T1, T1CE, T2...
        sequenceName = parts[3]

        sequencesOfType = sequencesByTypeValue.get(sequenceType)

        if sequencesOfType is None:
            print(f"Error: couldn't match sequence name {sequenceName} to sequence type")
            continue

        sequencesOfType.append(sequenceName)

    patientIDList.append(patientID)

    # row layout: amount and names for every sequence type, then the duplicates flag
    patientRow = []
    for sequenceNames in sequencesByTypeValue.values():
        patientRow.extend([len(sequenceNames), sequenceNames])

    # a patient has duplicates if any sequence type occurs more than once
    has_duplicates = any(len(sequenceNames) > 1 for sequenceNames in sequencesByTypeValue.values())
    patientRow.append(has_duplicates)

    patientsLists.append(patientRow)

columnLabels = []
for typeName in sequenceTypeOrder:
    columnLabels.extend([f"{typeName} amount", f"{typeName} sequences"])
columnLabels.append("has duplicates")

first_modified_data_DF = pd.DataFrame(patientsLists,
                                      columns = columnLabels,
                                      index = patientIDList)

100%|██████████| 321/321 [00:00<00:00, 5819.20it/s]


## Step 3: Compare list of patients from the excel sheet with the list of patients that have the right MRI sequences
- get list of excel patients
- get list of mri patients
- patients that exist in both lists should be copied to new directory, this is basically the dataset that can be publicized (don't forget the duplicate sequences though)
- from that new directory get patients that have T1, T1CE, T2 and FLAIR sequences
- run preprocessing on those

In [2]:
# Load the patient table exported from the Excel sheet.
path_to_patients_csv = "/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/patientsIDsexbirthdateprimary.csv"
excel_patients = pd.read_csv(filepath_or_buffer = path_to_patients_csv)

# Patient IDs of the "ID" column as an integer array.
excel_id_values = excel_patients["ID"].values
excel_patients_ids = excel_id_values.astype(int)

Copy the patients that appear in both the Excel sheet and the MRI directory into a new directory.

In [4]:
# Copies every patient that is present both in the MRI directory and in the excel
# sheet into the dataset directory, one folder per sequence, and records which
# sequences each patient has.

# get list of mri patients
path_to_mri_sequences = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients/Rgb_Brain_Mets_manually_selected"

path_to_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset"

patientFolders = [
    folder for folder in os.listdir(path_to_mri_sequences) if os.path.isdir(os.path.join(path_to_mri_sequences, folder))
]

list_of_mri_patient_ids: list[int] = []

list_of_fit_patients: list[config.patient] = [] # patient is both in mri list and in excel list
list_of_unfit_patient_ids: list[int] = [] # patient is not in excel list

for patientFolder in tqdm(patientFolders):

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    patientID = int(patientFolder)
    list_of_mri_patient_ids.append(patientID)

    if patientID not in excel_patients_ids:
        # mri patient is missing from the excel patient list
        list_of_unfit_patient_ids.append(patientID)
        continue

    # mri patient is also in the excel patient list
    patient = config.patient(id = patientID)

    # creates new folder for patient in the dataset directory
    funcs.createFolderForPatient(path_to_dataset, patientFolder)

    path_to_patient_source = os.path.join(path_to_mri_sequences, patientFolder)
    path_to_patient_dataset = os.path.join(path_to_dataset, patientFolder)

    # maps the sequence type token of a folder name to the list on the patient
    # object that collects the names of the sequences of that type
    sequence_lists_by_type = {
        config.desiredSequences.T1.value: patient.T1_sequences,
        config.desiredSequences.T1CE.value: patient.T1CE_sequences,
        config.desiredSequences.T2.value: patient.T2_sequences,
        config.desiredSequences.FLAIR.value: patient.FLAIR_sequences,
        config.desiredSequences.STERN.value: patient.STERN_sequences,
        config.desiredSequences.DWI.value: patient.DWI_sequences,
        config.desiredSequences.ADC.value: patient.ADC_sequences,
        config.desiredSequences.MPR.value: patient.MPR_sequences,
    }

    # get list of all sequence names, copy each sequence into new directory
    list_of_sequences = [
        sequenceFolder for sequenceFolder in os.listdir(path_to_patient_source) if os.path.isdir(os.path.join(path_to_patient_source, sequenceFolder))
    ]

    for sequenceFolder in list_of_sequences:

        # ignores the ds_folders
        if config.dsStore in sequenceFolder:
            continue

        # get sequence type from sequenceFolder name
        # e.g. {patientID}_{sequenceType}_{sequenceCount}_{sequenceName}
        sequenceType = sequenceFolder.split("_")[1]
        sequenceName = sequenceFolder.split("_", 3)[3] # this contains "+" instead of " "(space)

        known_sequences = sequence_lists_by_type.get(sequenceType)
        sequenceUnknown = known_sequences is None

        if sequenceUnknown:
            print(f"{patientFolder}:sequence description ({sequenceType}) unknown")

        # the list is handed over *before* the new name is appended to it;
        # unknown sequence types are not tracked on the patient, so they get an empty list
        path_to_sequence_folder = funcs.createSequenceFolder(
            path = path_to_patient_dataset,
            patientID = patientFolder,
            sequence = sequenceType,
            sequence_list = [] if sequenceUnknown else known_sequences,
            original_sequence_name = sequenceName
        )

        if not sequenceUnknown:
            known_sequences.append(sequenceName)

        funcs.copyFilesFromDirectoryToNewDirectory(
            path_to_original_directory = os.path.join(path_to_patient_source, sequenceFolder),
            path_to_new_directory = path_to_sequence_folder
        )

    patient.T1_amount = len(patient.T1_sequences)
    patient.T1CE_amount = len(patient.T1CE_sequences)
    patient.T2_amount = len(patient.T2_sequences)
    patient.FLAIR_amount = len(patient.FLAIR_sequences)
    patient.STERN_amount = len(patient.STERN_sequences)
    patient.ADC_amount = len(patient.ADC_sequences)
    patient.DWI_amount = len(patient.DWI_sequences)
    patient.MPR_amount = len(patient.MPR_sequences)

    # registered exactly once per patient (previously the same object was appended
    # both before and after the copy, so every patient was listed twice)
    list_of_fit_patients.append(patient)
    

  1%|          | 2/319 [00:02<07:25,  1.40s/it]

01199093:sequence description (3DT1CE) unknown


 29%|██▉       | 92/319 [07:34<18:27,  4.88s/it]

01800439:sequence description (3DT1CE) unknown


 55%|█████▌    | 176/319 [15:17<08:08,  3.41s/it]

02066445:sequence description (3DT1CE) unknown


 64%|██████▍   | 204/319 [17:17<11:03,  5.77s/it]

01104996:sequence description (3DT1CE) unknown


 97%|█████████▋| 309/319 [26:14<01:09,  6.94s/it]

02090584:sequence description (3DT1CE) unknown


 97%|█████████▋| 310/319 [26:30<01:26,  9.58s/it]

02089657:sequence description (3DT1CE) unknown


100%|██████████| 319/319 [27:37<00:00,  5.19s/it]


In [None]:
# To-Do
# check duplicate patients
# extract dicom infos, especially date of images
# save .csv file with patient age, sex, primary
# turn into nifti
# get all the patients with T1, T1CE, T2, FLAIR

## Step 4: Create a dataset with the patientID, age, sex and primary

In [4]:
# create .tsv file with patient id, age, sex, primary
# The age is calculated from the birthdate in the excel sheet and the date of the mri,
# which is read from the dicom metadata .txt files stored in each patient folder.

# prefixes of the metadata lines that carry a date, in order of preference
INSTANCE_CREATION_DATE_PREFIX = "(0008, 0012) Instance Creation Date              DA: \'"
STUDY_DATE_PREFIX = "(0008, 0020) Study Date                          DA: \'"

# placeholder that marks a metadata file without any usable date
NO_MRI_DATE = "empty"


def _extract_mri_date(metadata_lines):
    """Returns the date string (format: yyyymmdd) found in the lines of a metadata file.

    The first line containing the instance creation date is preferred, the first
    line containing the study date is the fallback.
    Returns NO_MRI_DATE if neither of them is present.
    """
    for prefix in (INSTANCE_CREATION_DATE_PREFIX, STUDY_DATE_PREFIX):
        for line in metadata_lines:
            if prefix in line:
                # the date is the text between the prefix and the closing quote
                return line.split(prefix)[1].split("\'")[0]
    return NO_MRI_DATE


# get list of excel patients
path_to_patients_csv = "/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/patientsIDsexbirthdateprimary.csv"
excel_patients = pd.read_csv(path_to_patients_csv)

# get list of patient IDs
excel_patients_ids = excel_patients["ID"].values.astype(int)

path_to_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset"

# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_dataset) if os.path.isdir(os.path.join(path_to_dataset, folder))
]

# dict that holds the patient id and the date of the mri
mri_date_dict = {}

# loop through the patients
for patientFolder in patientFolders:

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    patientID = int(patientFolder)

    final_mri_date = ""

    # get list of all .txt files
    list_of_txt_files = [
        txt_file for txt_file in os.listdir(os.path.join(path_to_dataset, patientFolder)) if ".txt" in txt_file
    ]

    list_of_mri_dates = []

    # loop through all the .txt files (so all the dicom metadata files)
    for txt_file in list_of_txt_files:

        # get content of metadata file
        try:
            with open(os.path.join(path_to_dataset, patientFolder, txt_file), "r") as metadata_file:
                metadata_lines = metadata_file.readlines()
        except (OSError, UnicodeDecodeError):
            # unreadable files are skipped, otherwise the content of the previously
            # read file would be evaluated a second time
            print(f"can't open file at {path_to_dataset} {patientFolder} {txt_file}")
            continue

        list_of_mri_dates.append(_extract_mri_date(metadata_lines))

    # no mri dates for the patient were found
    if not list_of_mri_dates:
        print("is empty!")
        print(patientFolder)
        continue

    # true if all the dates in the list_of_mri_dates are the same
    if all(x==list_of_mri_dates[0] for x in list_of_mri_dates):

        final_mri_date = list_of_mri_dates[0]

        # the placeholder can't be converted to a datetime, so the patient is left without a date
        if final_mri_date == NO_MRI_DATE:
            print(f"no mri date found for {patientFolder}")
            continue

        # append the date to the dict
        mri_date_dict[float(patientID)] = final_mri_date

    else:
        print(patientFolder)
        print(list_of_mri_dates)

# drop unnecessary columns
excel_patients = excel_patients.drop(columns=["on_server", "Unnamed: 0", "in_study (0 = no, 1 = yes, 2 = tbd, 3 = remarkable)"])

# add a new column "mri_date" based on the mri_date_dict
excel_patients["mri_date"] = excel_patients["ID"].map(mri_date_dict)

# convert strings to datetime
excel_patients["mri_date"] = pd.to_datetime(excel_patients["mri_date"])
excel_patients["birthdate"] = pd.to_datetime(excel_patients["birthdate"])

# remove the rows of patients without an mri date
dataset_patients = excel_patients.loc[excel_patients["mri_date"].notna()]

# keep only the first row of each patient (some patients are listed multiple times);
# the explicit copy makes sure the new column below is added to an independent frame
dataset_patients = dataset_patients.drop_duplicates(subset=['ID'], keep='first').copy()

# add new age column: age of the patient in full years when the mri was recorded
dataset_patients["age"] = [
    relativedelta(mri_date, birthdate).years
    for mri_date, birthdate in zip(dataset_patients["mri_date"], dataset_patients["birthdate"])
]

# remove mri_date and birthdate columns to preserve anonymity
dataset_patients = dataset_patients.drop(columns=["mri_date", "birthdate"])

# export dataset
dataset_patients.to_csv(os.path.join(path_to_dataset, "_patientData.tsv"), index=False, sep="\t")


can't open file at /Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset 01935938 ._01935938_ADC_Diffusion+trace+tra+schnell_ADC+-+5_metadata.txt
can't open file at /Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset 01400779 ._01400779_T1CE_axial+-T1-halffourier+KM+-+13_metadata.txt
can't open file at /Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset 01006290 ._01006290_ADC_Diffusion+trace+tra+schnell_ADC+-+5_metadata.txt
can't open file at /Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset 01055292 ._01055292_ADC_Diffusion+trace+tra+schnell_ADC+-+12_metadata.txt
can't open file at /Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset 01055292 ._01055292_STERN_T2+stern+flash+tra+schnell+-+13_metadata.txt


In [None]:
# turn sequences to niftis, with new name {patientID}_{sequenceType}, unless there are multiple instances of one sequence or the sequencetype is unknown (3DT1CE)

def convert_dicom_to_nifti (path_to_sequence_folder: str, path_to_output_directory: str, new_filename: str):
    """Runs dcm2niix (via nipype) on one dicom sequence folder.

    path_to_sequence_folder: directory that is passed to the converter as source directory
    path_to_output_directory: directory that is passed to the converter as output directory
    new_filename: output filename that is passed to the converter
    """
    dcm2niix = Dcm2niix()

    # where to read from and where to write to
    dcm2niix.inputs.source_dir = path_to_sequence_folder
    dcm2niix.inputs.output_dir = path_to_output_directory
    dcm2niix.inputs.out_filename = new_filename

    # conversion options
    dcm2niix.inputs.compress = "y" # uses compression, "y" = yes
    dcm2niix.inputs.merge_imgs = True
    dcm2niix.inputs.bids_format = True

    dcm2niix.run()


path_to_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset"

# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_dataset) if os.path.isdir(os.path.join(path_to_dataset, folder))
]

# loop through the patients
for patientFolder in tqdm(patientFolders):

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    # NOTE(review): int() drops leading zeros of the folder name, so the nifti
    # filenames below start with the shortened ID - confirm that this is intended
    patientID = int(patientFolder)

    patient = config.patient(id = patientID)

    path_to_patient = os.path.join(path_to_dataset, patientFolder)

    # get list of sequences
    list_of_sequences = [
        entry for entry in os.listdir(path_to_patient) if os.path.isdir(os.path.join(path_to_patient, entry))
    ]

    # sequence type token of the folder name -> (label used in the filename, list on the patient object)
    known_sequence_types = {
        config.desiredSequences.T1.value: ("T1", patient.T1_sequences),
        config.desiredSequences.T1CE.value: ("T1CE", patient.T1CE_sequences),
        config.desiredSequences.T2.value: ("T2", patient.T2_sequences),
        config.desiredSequences.FLAIR.value: ("FLAIR", patient.FLAIR_sequences),
        config.desiredSequences.STERN.value: ("STERN", patient.STERN_sequences),
        config.desiredSequences.DWI.value: ("DWI", patient.DWI_sequences),
        config.desiredSequences.ADC.value: ("ADC", patient.ADC_sequences),
        config.desiredSequences.MPR.value: ("MPR", patient.MPR_sequences),
    }

    # first pass: add all the sequences to the patient object, the amount of
    # sequences per type decides how the nifti files are named in the second pass
    for sequence in list_of_sequences:

        # ignores the ds_folders
        if config.dsStore in sequence:
            continue

        # folder name: {patientID}_{sequenceType}_{sequenceCount}_{sequenceName}
        sequence_type = sequence.split("_")[1]
        sequence_name = sequence.split("_", 3)[3]

        if sequence_type in known_sequence_types:
            known_sequence_types[sequence_type][1].append(sequence_name)
        else:
            print(f"other sequence: {sequence}")
            patient.other_sequences.append(sequence.split("_", 1)[1])

    # second pass: convert every sequence folder to nifti
    for sequence in list_of_sequences:

        # ignores the ds_folders
        if config.dsStore in sequence:
            continue

        sequence_type = sequence.split("_")[1]
        sequence_name = sequence.split("_", 3)[3]

        path_to_output_directory = os.path.join(path_to_dataset, patientFolder)
        path_to_sequence_folder = os.path.join(path_to_dataset, patientFolder, sequence)

        if sequence_type in known_sequence_types:
            label, sequences_of_type = known_sequence_types[sequence_type]

            if len(sequences_of_type) == 1:
                # only sequence of its type: {patientID}_{sequenceType}.nii.gz
                new_filename = f"{patientID}_{label}"
            else:
                # several sequences of the type: {patientID}_{sequenceType}_{sequenceName}.nii.gz
                new_filename = f"{patientID}_{label}_{sequence_name}"
        else:
            # unknown sequence type: keep everything after the patient ID of the folder name
            sequence_type_and_name = sequence.split("_", 1)[1]
            new_filename = f"{patientID}_{sequence_type_and_name}"

        convert_dicom_to_nifti(
            path_to_sequence_folder = path_to_sequence_folder,
            path_to_output_directory = path_to_output_directory,
            new_filename =  new_filename)
    

Remove files that are not dicom folders or .txt files

In [8]:
path_to_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset"

# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_dataset) if os.path.isdir(os.path.join(path_to_dataset, folder))
]

# loop through the patients
for patientFolder in tqdm(patientFolders):

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    path_to_patient = os.path.join(path_to_dataset, patientFolder)

    # collect everything that is neither a .txt file nor a directory
    files_to_delete = []
    for entry in os.listdir(path_to_patient):
        if entry.endswith(".txt"):
            continue
        if os.path.isdir(os.path.join(path_to_patient, entry)):
            continue
        files_to_delete.append(entry)

    # delete the collected files from the patient folder
    for file in files_to_delete:
        os.remove(os.path.join(path_to_patient, file))

100%|██████████| 312/312 [00:01<00:00, 171.23it/s]


## Step 5: Move patients into BIDS directory and convert them to niftis

In [None]:
# To-do:
# delete all newly created nifti files (= keep only folders and .txt files)
# create bids environment
# run above code and create directory according to bids formatting
# get all the patient files with t1, t1ce, t2 and flair
# create both a raw and a processed files directory in there
# do cool ai shit

In [None]:
path_to_bids_entire_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_regensburg"

path_to_all_patients_dicom = "/Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset_DICOM"

# loop through patients
# create correct directory structure
# turn into niftis and save at new bids dataset location
# important: fix patient ids
# brain_mets_regensburg
# ├── patients.tsv
# ├── sub-12345678
#     ├── anat
#         ├── sub-12345678_T1w.nii.gz


# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_all_patients_dicom) if os.path.isdir(os.path.join(path_to_all_patients_dicom, folder))
]

# loop through the patients
for patientFolder in tqdm(patientFolders):

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    # the folder name is used as ID unchanged (string)
    patientID = patientFolder

    patient = config.patient(id = patientID)

    path_to_patient_dicom = os.path.join(path_to_all_patients_dicom, patientFolder)

    # get list of sequences
    list_of_sequences = [
        entry for entry in os.listdir(path_to_patient_dicom) if os.path.isdir(os.path.join(path_to_patient_dicom, entry))
    ]

    # sequence type token of the folder name -> list on the patient object
    sequences_by_type = {
        config.desiredSequences.T1.value: patient.T1_sequences,
        config.desiredSequences.T1CE.value: patient.T1CE_sequences,
        config.desiredSequences.T2.value: patient.T2_sequences,
        config.desiredSequences.FLAIR.value: patient.FLAIR_sequences,
        config.desiredSequences.STERN.value: patient.STERN_sequences,
        config.desiredSequences.DWI.value: patient.DWI_sequences,
        config.desiredSequences.ADC.value: patient.ADC_sequences,
        config.desiredSequences.MPR.value: patient.MPR_sequences,
    }

    # first pass: add all the sequences to the patient object, the amount of
    # sequences per type decides how the nifti files are named in the second pass
    for sequence in list_of_sequences:

        # ignores the ds_folders
        if config.dsStore in sequence:
            continue

        # folder name: {patientID}_{sequenceType}_{sequenceCount}_{sequenceName}
        sequence_type = sequence.split("_")[1]
        sequence_name = sequence.split("_", 3)[3]

        if sequence_type in sequences_by_type:
            sequences_by_type[sequence_type].append(sequence_name)
        else:
            print(f"other sequence: {sequence}")
            patient.other_sequences.append(sequence.split("_", 1)[1])

    # create a new patient folder according to the bids standard
    path_to_bids_patient = funcs.createPatientFolderBIDS(path_to_bids_entire_dataset, patientID = patientID)

    # create an anat directory if there are any sequences that are stored in it
    path_to_anat_directory = f"{path_to_bids_entire_dataset}/{path_to_bids_patient}/anat"
    sequences_stored_in_anat = [
        patient.T1_sequences,
        patient.T1CE_sequences,
        patient.T2_sequences,
        patient.FLAIR_sequences,
        patient.STERN_sequences,
        patient.MPR_sequences,
        patient.other_sequences,
    ]
    if any(sequences_stored_in_anat):
        os.mkdir(path_to_anat_directory)

    # create a dwi directory if there are any DWI or ADC sequences
    path_to_dwi_directory = f"{path_to_bids_entire_dataset}/{path_to_bids_patient}/dwi"
    if any([patient.DWI_sequences, patient.ADC_sequences]):
        os.mkdir(path_to_dwi_directory)

    # sequence type token of the folder name -> (filename suffix, output directory)
    bids_target_by_type = {
        config.desiredSequences.T1.value: ("T1w", path_to_anat_directory),
        config.desiredSequences.T1CE.value: ("T1c", path_to_anat_directory),
        config.desiredSequences.T2.value: ("T2w", path_to_anat_directory),
        config.desiredSequences.FLAIR.value: ("FLAIR", path_to_anat_directory),
        config.desiredSequences.STERN.value: ("T2star", path_to_anat_directory),
        config.desiredSequences.MPR.value: ("MPR", path_to_anat_directory),
        config.desiredSequences.DWI.value: ("dwi", path_to_dwi_directory),
        config.desiredSequences.ADC.value: ("adc", path_to_dwi_directory),
    }

    # second pass: convert every sequence folder to nifti at its bids location
    for sequence in list_of_sequences:

        # ignores the ds_folders
        if config.dsStore in sequence:
            continue

        sequence_type = sequence.split("_")[1]
        sequence_name = sequence.split("_", 3)[3]

        if sequence_type in bids_target_by_type:
            suffix, path_to_output_directory = bids_target_by_type[sequence_type]

            if len(sequences_by_type[sequence_type]) == 1:
                # only sequence of its type: sub-ID_suffix.nii.gz
                new_filename = f"sub-{patientID}_{suffix}"
            else:
                # several sequences of the type: sub-ID_desc-sequencename_suffix.nii.gz
                new_filename = f"sub-{patientID}_desc-{sequence_name}_{suffix}"
        else:
            # unknown sequence types are stored in anat with their type token as suffix
            print(f"other sequence: {sequence}")
            path_to_output_directory = path_to_anat_directory
            new_filename = f"sub-{patientID}_desc-{sequence_name}_{sequence_type}"

        preprocessing.convert_dicom_to_nifti(
            path_to_sequence_folder = os.path.join(path_to_all_patients_dicom, patientFolder, sequence),
            path_to_output_directory = path_to_output_directory,
            new_filename = new_filename
        )

        
    # create folder for patient sub-[patientID]
    # if there is t1, t1ce, t2, flair or mpr sequence create anat directory
    # if there is a dwi or adc sequence create dwi directory
    # rename files accordingly: sub-[patientID]_[sequenceType].nii.gz
    # if there are more than one of a sequencetype, add the name of the sequence as well: sub-[patientID]_desc-[sequenceName]_[sequenceType].nii.gz



Add the participants.tsv file to the BIDS directory

In [23]:
import math

def convert_patientID_to_BIDS(id):
    """Convert a numeric patient ID into a BIDS subject label.

    The ID is truncated to an integer and left-padded with zeros so that it
    has at least 8 digits; IDs that are already 8 digits or longer are kept
    unchanged.

    Args:
        id: Non-negative patient ID (int, or a float holding a whole number).
            The name shadows the builtin ``id`` but is kept so that existing
            keyword callers continue to work.

    Returns:
        str: The subject label, e.g. ``1005097`` -> ``"sub-01005097"``.

    Raises:
        ValueError: If the ID is negative, or cannot be converted to an
            integer (e.g. NaN).
    """
    if id < 0:
        raise ValueError(f"patient ID must not be negative, got {id}")

    # zfill pads on the string itself, so ID 0 works as well (log10(0) is undefined)
    return "sub-" + str(int(id)).zfill(8)

# tab-separated source table with one row per patient
path_to_tsf = "/Volumes/BrainMets/Rgb_Brain_Mets/Rgb_Brain_Mets_Dataset_DICOM/_patientData.tsv"

raw_patient_table = pd.read_csv(path_to_tsf, sep="\t")

# turn every entry of the "ID" column into its "sub-..." label
bids_subject_ids = raw_patient_table["ID"].apply(convert_patientID_to_BIDS)

# swap in the converted IDs and give the column its new name
tsf_dataset = raw_patient_table.assign(ID=bids_subject_ids).rename(
    columns={"ID":"participant_id"}
)

# write the result as participants.tsv (tab-separated, without the index column)
tsf_dataset.to_csv(
    "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_regensburg/participants.tsv",
    sep="\t",
    index=False,
)

# convert_patientID_to_BIDS:
# if the patientID has fewer than 8 digits:
# prepend zeros until the patientID is 8 digits long


## Step 6: Copy all the patients with T1, T1c, T2 and FLAIR sequences into new directory

Copy the T1, T1c, T2 and FLAIR files of every patient that has all four sequences

In [18]:
# Copies the T1w, T1c, T2w and FLAIR NIfTI files of every patient that has at
# least one file of each of these four types from the BIDS dataset into the
# classification dataset (<classification dataset>/<patient>/anat).

path_to_bids_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_regensburg"

path_to_classification_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_classification/rawdata"

# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_bids_dataset) if os.path.isdir(os.path.join(path_to_bids_dataset, folder))
]

# loop through the patients
for patientFolder in tqdm(patientFolders):

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    patientID = patientFolder

    patient = config.patient(id = patientID)

    path_to_source_anat = os.path.join(path_to_bids_dataset, patientFolder, "anat")

    # a patient without an anat directory cannot provide the four required
    # sequences; skip it instead of letting os.listdir abort the whole loop
    if not os.path.isdir(path_to_source_anat):
        continue

    # get list of sequences
    list_of_sequences = [
        sequence for sequence in os.listdir(path_to_source_anat) if (".nii" in sequence)
    ]

    # sort the files by sequence type, based on the suffix of the file name
    for sequence in list_of_sequences:
        if "T1w.nii" in sequence:
            patient.T1_sequences.append(sequence)
        elif "T1c.nii" in sequence:
            patient.T1CE_sequences.append(sequence)
        elif "T2w.nii" in sequence:
            patient.T2_sequences.append(sequence)
        elif "FLAIR.nii" in sequence:
            patient.FLAIR_sequences.append(sequence)

    required_sequence_lists = [patient.T1_sequences, patient.T1CE_sequences, patient.T2_sequences, patient.FLAIR_sequences]

    # only patients with at least one file of each of the four types are copied
    if not all(len(sequence_list) > 0 for sequence_list in required_sequence_lists):
        continue

    # create new patient directory
    # NOTE(review): funcs.createFolderForPatient is defined outside this cell;
    # whether it tolerates an already existing folder is not visible here - confirm
    funcs.createFolderForPatient(path = path_to_classification_dataset, patientID = patientID)

    # create the anat directory inside the patient directory;
    # exist_ok keeps the cell from failing here when it is run a second time
    path_to_anat_directory = os.path.join(path_to_classification_dataset, patientFolder, "anat")
    os.makedirs(path_to_anat_directory, exist_ok=True)

    for sequence_list in required_sequence_lists:
        # more than one file of a type is copied as well, but reported for manual review
        if len(sequence_list) > 1:
            print(f"check duplicates: {sequence_list}")

        for sequence in sequence_list:
            # copy file to new directory
            path_to_source = os.path.join(path_to_source_anat, sequence)
            shutil.copy(path_to_source, path_to_anat_directory)
    
    

  0%|          | 0/312 [00:00<?, ?it/s]

100%|██████████| 312/312 [00:26<00:00, 11.97it/s]


Check that there are exactly 4 sequences for each patient

In [None]:
# Prints, for every patient folder of the classification dataset, whether its
# anat directory holds exactly four ".nii" files.

path_to_classification_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_classification/rawdata"

# collect every directory inside the classification dataset
patientFolders = []
for entry in os.listdir(path_to_classification_dataset):
    if os.path.isdir(os.path.join(path_to_classification_dataset, entry)):
        patientFolders.append(entry)

# inspect one patient at a time
for patientFolder in tqdm(patientFolders):

    # skip folders whose name contains config.dsStore
    if config.dsStore in patientFolder:
        continue

    path_to_patient_anat = os.path.join(path_to_classification_dataset, patientFolder, "anat")
    list_of_sequences = [
        file_name for file_name in os.listdir(path_to_patient_anat) if (".nii" in file_name)
    ]

    if len(list_of_sequences) == 4:
        print(f"{patientFolder}: all good")
    else:
        # more or fewer than four files: report the count, the patient and the file names
        print(f"#{len(list_of_sequences)} {patientFolder}, {list_of_sequences}")

Copy participants.tsv, keeping only the patients that are part of the classification dataset

In [27]:
# Writes a participants.tsv for the classification dataset that contains only
# the rows whose participant_id matches a folder name in that dataset.

path_to_classification_dataset = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_classification/rawdata"

path_to_tsf = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_regensburg/participants.tsv"

all_patients = pd.read_csv(path_to_tsf, sep="\t")

# names of all directories inside the classification dataset
patientIds = []
for entry in os.listdir(path_to_classification_dataset):
    if os.path.isdir(os.path.join(path_to_classification_dataset, entry)):
        patientIds.append(entry)

# keep the rows whose participant_id is one of these directory names
is_in_classification_dataset = all_patients["participant_id"].isin(patientIds)
classification_patients = all_patients[is_in_classification_dataset]

classification_patients.to_csv(
    "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_classification/rawdata/participants.tsv",
    sep="\t",
    index=False,
)
