In [1]:
import os
import sys
sys.path.append(r"/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/Brain_Mets_Classification")

from tqdm import tqdm
from datetime import datetime
import shutil
import pandas as pd

import brain_mets_classification.config as config
import brain_mets_classification.custom_funcs as funcs

In [2]:
path_to_folder = f"{config.path_to_ssd}/originalPatientFiles"
#path_to_folder = f"{config.path_to_n30}/RgbBrainMetsSampleN30"
pathToCleanMRIList = ""

### Step 0: Removes sequences based on the blacklist

In [3]:
#Example file structure
#├── Anonymized - 01005097
#│   └── Mrt Body
#│       ├── Diffusion trace tra schnell_ADC - 8
#│       │   ├── IM-2330-0001-0001.dcm
#│       │   ├── IM-2330-0002-0001.dcm
#...
#│       │   ├── IM-2469-0026-0001.dcm
#│       │   └── IM-2469-0027-0001.dcm
#│       ├── Diffusion trace tra schnell_TRACEW - 7
#│       │   ├── IM-2329-0001-0001.dcm
#│       │   ├── IM-2329-0002-0001.dcm
#...
#│       │   ├── IM-2468-0026-0001.dcm
#│       │   └── IM-2468-0027-0001.dcm
#│       ├── T1 mp-rage3d we sag 1mm KM - 13
#│       │   ├── IM-2335-0001-0001.dcm
#│       │   ├── IM-2335-0002-0001.dcm
#...
#│       │   ├── IM-2474-0159-0001.dcm
#│       │   └── IM-2474-0160-0001.dcm
#├── Anonymized - 12345678
#...

blackList = ["auswertung_fmrt",
             "fmri",
             "thorax",
             "lws", "hws", "bws",
             "hand",
             "posdisp",
             "cor", "sag",
             "cest",
             "ciss",
             "dti",
             "evidence", "reading",
             "field",
             "evaseries",
             "ct",
             "lokalizer", "localizer",
             "mip", "protocol", "resolve", "results", "screen save", "sub", "svs", "tof", "mean_", "ws", "leakage", "lunge"]

# Creates a new directory for all the patient folders
pathToCleanMRIList = funcs.createNewPreprocessingStepFolder("0_blacklist")

# Goes through list of files/folders at path_to_folder and only adds the directories to the list
folderList = [
    folder for folder in os.listdir(path_to_folder) if os.path.isdir(os.path.join(path_to_folder, folder))
]

patientIDs = []

sequencesList = []


# Loops through all the "Anonymized - #######" folders
for patient_folder in tqdm(folderList):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    # all folders are named like "Anonymized - 12345678"
    patientID = patient_folder.split(" - ")[1]

    # adds the patientID to the list patientIDS if it hasn't been added before
    if patientID not in patientIDs:
        patientIDs.append(patientID)

    path_to_MRI_session_folders = os.path.join(path_to_folder, patient_folder)
    MRI_session_folders = os.listdir(path_to_MRI_session_folders)

    # loops through the different MRI sessions
    for mri_session in MRI_session_folders:
        
        # ignores the ds_folders
        if config.dsStore in mri_session:
            continue

        # get a list of all the sequences in the mri_sequences
        path_to_mri_sequences = os.path.join(path_to_MRI_session_folders, mri_session)
        mri_sequences = os.listdir(path_to_mri_sequences)

        # loops through the different sequences created during each MRI session
        for sequence in mri_sequences:

            # ignores the ds_folders
            if config.dsStore in sequence:
                continue

            sequences_lower_cased = sequence.lower()

            if not any(blackListSeq in sequences_lower_cased for blackListSeq in blackList):

                # create folder for patient
                funcs.createFolderForPatient(
                    path = pathToCleanMRIList,
                    patientID = patientID
                )

                # # create new folder as pathToCleanMRIList/patientID/T1CE
                # path_to_sequence = funcs.createSequenceFolder(
                #     path = os.path.join(pathToCleanMRIList, patientID),
                #     patientID = patientID,
                #     sequence = "",
                #     sequence_list = [],
                #     original_sequence_name = sequence
                # )

                pathToPatient = os.path.join(pathToCleanMRIList, patientID)
                folderName = f"{patientID}_{sequence}"
                path_to_sequence = os.path.join(pathToPatient, folderName)
                # check if path_to_sequence already exists then create directory
                sequences = os.listdir(pathToPatient)
                counter = sequences.count(folderName)
                if counter >= 1:
                    path_to_sequence = f"{path_to_sequence}{counter + 1}"
                    os.mkdir(path_to_sequence)
                else:
                    os.mkdir(path_to_sequence)

                # get list of all the dicom files for the T1CE sequence
                dicomFiles = os.listdir(os.path.join(path_to_mri_sequences, sequence))

                # loops through the list of dicom files
                for dicomFile in dicomFiles:
                    # ignores the ds_folders
                    if config.dsStore in dicomFile:
                        continue

                    # copy each file individually into the path_to_sequence folder
                    shutil.copyfile(os.path.join(path_to_mri_sequences, sequence, dicomFile), os.path.join(path_to_sequence, dicomFile))



#             sequencesList.append(sequences_lower_cased)

# cleanList = list(dict.fromkeys(sorted(sequencesList)))

# minusNumberList = []

# for sequence in cleanList:
#     sequenceWithoutNumber = sequence.split(" - ")[0]
#     minusNumberList.append(sequenceWithoutNumber)

# cleanMinusNumber = list(dict.fromkeys(minusNumberList))









# removedSequences = []

# for sequence in cleanMinusNumber:
#     if not any(backListSequ in sequence for backListSequ in blackList): #and any(whiteListSeq in sequence for whiteListSeq in whiteList):
#         print(sequence)
#     else:
#         removedSequences.append(sequence)

# print(f"\n\nRemoved Sequences: {len(removedSequences)}")
# for removedSequence in removedSequences:
#     print(removedSequence)

100%|██████████| 373/373 [07:29<00:00,  1.21s/it]


### Step 0.5: removes the patients with less than 4 sequences from the previously created folder

In [4]:
filterFolderList = [
    folder for folder in os.listdir(pathToCleanMRIList) if os.path.isdir(os.path.join(pathToCleanMRIList, folder))
]

# loops through all the "12345678" folders
for patient_folder in tqdm(filterFolderList):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue
    
    path_to_patient = os.path.join(pathToCleanMRIList, patient_folder)

    # list to the mri sessions folders
    mri_sessions = os.listdir(path_to_patient)

    if len(mri_sessions) < 4:
        os.system(f"rm -r {path_to_patient}")

100%|██████████| 369/369 [00:01<00:00, 247.42it/s]


In [5]:
#print(pathToCleanMRIList)
pathToCleanMRIList = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients/Rgb_Brain_Mets_Preprocessing_0_blacklist_20240120-155725"

### Step 1: Filters the different Sequences

In [5]:
pathToPreprocess1 = funcs.createNewPreprocessingStepFolder("01_sequencesFiltered")

T1List = ["t1"]
T2List = ["t2", "flair"]
T2SternList = ["stern", "hemo", "blutung", "*", "hämo"]
FLAIRList = ["flair"]
KMList = ["km"]
ADCList = ["adc"]
DWIList = ["diffusion", "diff", "adc", "dwi"]
MPRList = ["mpr"]

# list of patientIDs
folderPreprocess0List = [
    folder for folder in os.listdir(pathToCleanMRIList) if os.path.isdir(os.path.join(pathToCleanMRIList, folder))
]

# list of list of sequences for the DataFrame
dfpatientIDs = [] # list of ints
dfT1_sequences = [] # list of strings
dfT1_amount = [] # list of ints
dfT1CE_sequences = [] # list of strings
dfT1CE_amount = [] # list of ints
dfT2_sequences = [] # list of strings
dfT2_amount = [] # list of ints
dfFLAIR_sequences = [] # list of strings
dfFLAIR_amount = [] # list of ints
dfSTERN_sequences = [] # list of strings
dfSTERN_amount = [] # list of ints
dfDWI_sequences = [] # list of strings
dfDWI_amount = [] # list of ints
dfADC_sequences = [] # list of strings
dfADC_amount = [] # list of ints
dfMPR_sequences = [] # list of strings
dfMPR_amount = [] # list of ints
dfrejected_sequences = [] # list of strings
dfrejected_amount = [] # list of ints
df_has_rejected = [] # list of bools
df_has_duplicates = [] # list of bools

dfPatients = [[]]

# Loops through all the "#######" folders
for patient_folder in tqdm(folderPreprocess0List):

    T1_sequences = []
    T1CE_sequences = []
    T2_sequences = []
    FLAIR_sequences = []
    STERN_sequences = []
    DWI_sequences = []
    ADC_sequences = []
    MPR_sequences = []

    rejected_sequences = []

    has_duplicates = False
    has_rejected = False

    patientID = patient_folder

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue
    
    path_to_patient_sequences = os.path.join(pathToCleanMRIList, patient_folder)
    mri_sequences = os.listdir(path_to_patient_sequences)

    # create new folder for patient in pathToPreprocess1

    # loop through the sequences for each patient
    for sequence in mri_sequences:

        # only get the sequence name, folder names: "12345678_SEQUENCENAME"
        sequence_name = sequence.split("_", 1)[1]
        sequence_lower_cased = sequence_name.lower()

        # create folder for patient
        funcs.createFolderForPatient(
            path = pathToPreprocess1,
            patientID = patientID
        )

        #T1
        if any(t1Sequence in sequence_lower_cased for t1Sequence in T1List):
            # sequence contains "t1" in its name
            # check if it's T1CE or not, if not it's regular t1, if it is then it's a T1CE sequence
            if any(KMSequence in sequence_lower_cased for KMSequence in KMList):
                # it's a T1CE sequence
                # rename like this: {patientID}_T1CE-{sequence}

                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.T1CE.value,
                    sequence_list = T1CE_sequences,
                    original_sequence_name = sequence_name)
                
                T1CE_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)

            else:
                # it's a normale T1 Sequence
                # rename like this: {patientID}_T1-{sequence}

                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.T1.value,
                    sequence_list = T1_sequences,
                    original_sequence_name = sequence_name)
                
                T1_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)
                
        elif any(t2Sequence in sequence_lower_cased for t2Sequence in T2List):
            # sequence containts "t2" in its name
            # check if it's a FLAIR, a STAR or a regular T2 sequence

            if any(flairSequence in sequence_lower_cased for flairSequence in FLAIRList):
                # it's a FLAIR sequence
                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.FLAIR.value,
                    sequence_list = FLAIR_sequences,
                    original_sequence_name = sequence_name)
                
                FLAIR_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)

            elif any(sternSequence in sequence_lower_cased for sternSequence in T2SternList):
                # it's a STERN sequence

                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.STERN.value,
                    sequence_list = STERN_sequences,
                    original_sequence_name = sequence_name)
                
                STERN_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)
                
            else:
                # it's a normale T2 sequence

                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.T2.value,
                    sequence_list = T2_sequences,
                    original_sequence_name = sequence_name)
                
                T2_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)
        
        elif any(diffSequence in sequence_lower_cased for diffSequence in DWIList):
            # sequence contains "diffusion" or "dif" in it's name
            
            if any(adcSequence in sequence_lower_cased for adcSequence in ADCList):
                # it's a adc sequence

                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.ADC.value,
                    sequence_list = ADC_sequences,
                    original_sequence_name = sequence_name)
                
                ADC_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)
                
            else:
                # it's a normal dif sequence

                # create new folder in preprocessing1/patientID/folder name above
                path_to_sequence = funcs.createSequenceFolder(
                    path = f"{pathToPreprocess1}/{patientID}",
                    patientID =  patientID,
                    sequence = config.desiredSequences.DWI.value,
                    sequence_list = DWI_sequences,
                    original_sequence_name = sequence_name)
                
                DWI_sequences.append(sequence_name)

                # copy files to new folder
                funcs.copyFilesFromDirectoryToNewDirectory(
                    path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                    path_to_new_directory = path_to_sequence)

        elif any(mprsequence in sequence_lower_cased for mprsequence in MPRList):
            # it's a mpr sequence

            # create new folder in preprocessing1/patientID/folder name above
            path_to_sequence = funcs.createSequenceFolder(
                path = f"{pathToPreprocess1}/{patientID}",
                patientID =  patientID,
                sequence = config.desiredSequences.MPR.value,
                sequence_list = MPR_sequences,
                original_sequence_name = sequence_name)
                
            MPR_sequences.append(sequence_name)

            # copy files to new folder
            funcs.copyFilesFromDirectoryToNewDirectory(
                path_to_original_directory =  os.path.join(path_to_patient_sequences, sequence),
                path_to_new_directory = path_to_sequence)

        else:
            # file got rejected
            rejected_sequences.append(sequence_name)

            # get the number of patients that have all the sequences or only some etc.
            # get the exact patients that have multiple sequences
    

# dfpatientIDs = []
# dfT1_sequences = []
# dfT1_amount = 0
# dfT1CE_sequences = []
# dfT1_amount = 0
# dfT2_sequences = []
# dfT1_amount = 0
# dfFLAIR_sequences = []
# dfT1_amount = 0
# dfSTERN_sequences = []
# dfT1_amount = 0
# dfDWI_sequences = []
# dfT1_amount = 0
# dfADC_sequences = []
# dfT1_amount = 0
# dfrejected_sequences = []
# dfT1_amount = 0
# df_has_rejected = False
# df_has_duplicates = False

    # code for the dataframe goes here
            
    list_of_sequences = [T1_sequences, T1CE_sequences, T2_sequences, FLAIR_sequences, STERN_sequences, DWI_sequences, ADC_sequences, MPR_sequences]
    
    dfpatientIDs.append(patientID)

    dfT1_sequences.append(T1_sequences)
    dfT1_amount.append(len(T1_sequences))

    dfT1CE_sequences.append(T1CE_sequences)
    dfT1CE_amount.append(len(T1CE_sequences))

    dfT2_sequences.append(T2_sequences)
    dfT2_amount.append(len(T2_sequences))

    dfFLAIR_sequences.append(FLAIR_sequences)
    dfFLAIR_amount.append(len(FLAIR_sequences))

    dfSTERN_sequences.append(STERN_sequences)
    dfSTERN_amount.append(len(STERN_sequences))

    dfDWI_sequences.append(DWI_sequences)
    dfDWI_amount.append(len(DWI_sequences))

    dfADC_sequences.append(ADC_sequences)
    dfADC_amount.append(len(ADC_sequences))

    dfMPR_sequences.append(MPR_sequences)
    dfMPR_amount.append(len(MPR_sequences))

    dfrejected_sequences.append(rejected_sequences)
    dfrejected_amount.append(len(rejected_sequences))

    # has duplicates
    for sequenceList in list_of_sequences:
        if len(sequenceList) > 1:
            has_duplicates = True

    df_has_duplicates.append(has_duplicates)

    # has rejected
    if rejected_sequences:
        has_rejected = True
    
    df_has_rejected.append(has_rejected)

    #dfPatients.append([len(T1_sequences), T1_sequences, len(T1CE_sequences), T1CE_sequences, len(T2_sequences), T2_sequences, len(FLAIR_sequences), FLAIR_sequences, len(STERN_sequences), STERN_sequences, len(DWI_sequences), DWI_sequences, len(ADC_sequences), ADC_sequences, len(rejected_sequences), rejected_sequences, has_rejected, has_duplicates])
    

IndentationError: unexpected indent (25539059.py, line 232)

#### Creates a Dataframe for the patients

In [6]:
# create patient Dataframe

patientsDataFrame = pd.DataFrame(list(zip(
             dfT1_amount,
             dfT1_sequences,
             dfT1CE_amount,
             dfT1CE_sequences,
             dfT2_amount,
             dfT2_sequences,
             dfFLAIR_amount,
             dfFLAIR_sequences,
             dfSTERN_amount,
             dfSTERN_sequences,
             dfDWI_amount,
             dfDWI_sequences,
             dfADC_amount,
             dfADC_sequences,
             dfMPR_amount,
             dfMPR_sequences,
             dfrejected_amount,
             dfrejected_sequences,
             df_has_rejected,
             df_has_duplicates)),
    columns = ["T1 amount",
               "T1 sequences",
               "T1CE amount",
               "T1CE sequences",
               "T2 amount",
               "T2 sequences",
               "FLAIR amount",
               "FLAIR sequences",
               "STERN amount",
               "STERN sequences",
               "DWI amount",
               "DWI sequences",
               "ADC amount",
               "ADC seqeunces",
               "MPR amount",
               "MPR sequences",
               "rejected amount",
               "rejected sequences",
               "has rejected",
               "has duplicates"],
    index = dfpatientIDs)

saves the Dataframe as a .csv file

In [8]:
patientsDataFrame.to_csv(f"{config.path_to_ssd}/patientsDataframe.csv")

get the Dataframe from the .csv file

In [3]:
patientsDataFrame = pd.read_csv(f"{config.path_to_ssd}/patientsDataframe.csv")

Display statistics about the patients dataframe

In [7]:
# To-Do:
# get amount of patients with T1, T1CE, T2, FLAIR sequences
# get amount of patients with T1, T1CE, T2, FLAIR + STERN sequences
# get amount of patients with T1, T1CE, T2, FLAIR + STERN + DWI + ADC sequences
# get patient IDs of patients with duplicates
# get patient IDs of patients with rejections
# check rejected files

# get amount of patients with T1, T1CE, T2, FLAIR sequences
patient4Seq = patientsDataFrame.query("`T1 amount` > 0 and `T1CE amount` > 0 and `T2 amount` > 0 and `FLAIR amount` > 0")
patientCount4Seq = len(patient4Seq)
print(f"Amount of patients with 4 sequences: {patientCount4Seq}")

# get amount of patients with T1, T1CE, T2, FLAIR + STERN sequences
patientCount5Seq = len(patientsDataFrame.query("`T1 amount` > 0 and `T1CE amount` > 0 and `T2 amount` > 0 and `FLAIR amount` > 0 and `STERN amount` > 0"))
print(f"Amount of patients with the 4 sequences + STERN: {patientCount5Seq}")

# get amount of patients with T1, T1CE, T2, FLAIR + DWI + ADC sequences
patientCount6Seq = len(patientsDataFrame.query("`T1 amount` > 0 and `T1CE amount` > 0 and `T2 amount` > 0 and `FLAIR amount` > 0 and `DWI amount` > 0 and `ADC amount` > 0"))
print(f"Amount of patients with the 4 sequences + DWI + ADC: {patientCount6Seq}")

# get amount of patients with T1, T1CE, T2, FLAIR + STERN + DWI + ADC sequences
patientCount7Seq = len(patientsDataFrame.query("`T1 amount` > 0 and `T1CE amount` > 0 and `T2 amount` > 0 and `FLAIR amount` > 0 and `STERN amount` > 0 and `DWI amount` > 0 and `ADC amount` > 0"))
print(f"Amount of patients with the 4 sequences + STERN + DWI + ADC: {patientCount7Seq}")

# get patient IDs of patients with duplicates
patientIDsWithDuplicates = patientsDataFrame[patientsDataFrame["has duplicates"] == True].index
print(f"List of patientIDs that have duplicates (#{len(patientIDsWithDuplicates)})")
for patientID in patientIDsWithDuplicates:
    print(patientID)


# get patient IDs of patients with rejections
patientIDsWithRejectedSequences = patientsDataFrame[patientsDataFrame["has rejected"] == True].index
print(f"List of patientIDs that have rejected Sequences (#{len(patientIDsWithRejectedSequences)})")
for patientID in patientIDsWithRejectedSequences:
    print(patientID)

Amount of patients with 4 sequences: 27
Amount of patients with the 4 sequences + STERN: 24
Amount of patients with the 4 sequences + DWI + ADC: 24
Amount of patients with the 4 sequences + STERN + DWI + ADC: 22
List of patientIDs that have duplicates (#2)
01199093
01321873
List of patientIDs that have rejected Sequences (#5)
01199093
01134825
01111974
01321873
01108350


## Step 2: Remove " " from the folder names as dicom2niix doesn't seam to work with spaces in the folder name

In [28]:
# To-Do:
# go through each patient folder and replace " " with "+"


#path_to_filtered_patient_files = os.path.join(config.path_to_n30, "/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/Brain_Mets_Classification/Rgb_Brain_Mets_Dataset/N30/Rgb_Brain_Mets_Preprocessing_01_sequencesFiltered_20240118-171818")
path_to_filtered_patient_files = pathToPreprocess1

patient_folders = [
    folder for folder in os.listdir(path_to_filtered_patient_files) if os.path.isdir(os.path.join(path_to_filtered_patient_files, folder))
]

for patient_folder in tqdm(patient_folders):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    patientID = patient_folder

    path_to_patient = os.path.join(path_to_filtered_patient_files, patient_folder)

    # get the different sequence folders for each patient
    sequences = [
        folder for folder in os.listdir(path_to_patient) if os.path.isdir(os.path.join(path_to_patient, folder))
    ]

    for sequence in sequences:

        # ignores the ds_folders
        if config.dsStore in sequence:
            continue

        newFolderName = sequence.replace(" ", "+")
        os.rename(os.path.join(path_to_patient, sequence), os.path.join(path_to_patient, newFolderName))

100%|██████████| 27/27 [00:00<00:00, 1338.69it/s]


## Number comparison between the first search and the second search

First Search:
- Amount of patients with 4 sequences: 282
- Amount of patients with the 4 sequences + STERN: **199**
- Amount of patients with the 4 sequences + DWI + ADC: 273
- Amount of patients with the 4 sequences + STERN + DWI + ADC: **197**
- List of patientIDs that have duplicates (#**67**)

Second Search:
- Amount of patients with 4 sequences: 282
- Amount of patients with the 4 sequences + STERN: **205**
- Amount of patients with the 4 sequences + DWI + ADC: 273
- Amount of patients with the 4 sequences + STERN + DWI + ADC: **202**
- List of patientIDs that have duplicates (#**63**)
- List of patientIDs that have rejected Sequences (#80)

To-Do:
1. [ ] Create two Dataframes and compare them with eachother
2. [ ] Report to Quirin
3. [ ] Go to Preprocessing

In [14]:
path_to_1st_DF = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients/patientsDataframe0.csv"
path_to_1st_modified_Data = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients/Rgb_Brain_Mets_Preprocessing1_20240111-215606"
path_to_2nd_DF = "/Volumes/BrainMets/Rgb_Brain_Mets/allPatients/patientsDataframe.csv"

In [13]:
first_DF = pd.read_csv(path_to_1st_DF)
second_DF = pd.read_csv(path_to_2nd_DF)

Create a Dataframe based on the patient files

In [25]:
patient_folders = [
    folder for folder in os.listdir(path_to_1st_modified_Data) if os.path.isdir(os.path.join(path_to_1st_modified_Data, folder))
]

patientIDList = []
patientsLists = [[]]

for patient_folder in tqdm(patient_folders):

    # ignores the ds_folders
    if config.dsStore in patient_folder:
        continue

    T1_sequences = []
    T1_amount = []
    T1CE_sequences = []
    T1CE_amount = []
    T2_sequences = []
    T2_amount = []
    FLAIR_sequences = []
    FLAIR_amount = []
    STERN_sequences = []
    STERN_amount = []
    DWI_sequences = []
    DWI_amount = []
    ADC_sequences = []
    ADC_amount = []
    MPR_sequences = []
    MPR_amount = []

    has_duplicates = False

    # cannot read the rejected sequences as they're not in the folders

    patientID = patient_folder

    path_to_patient_sequences = os.path.join(path_to_1st_modified_Data, patient_folder)
    mri_sequences = os.listdir(path_to_patient_sequences)

    for sequence in mri_sequences:

        if config.dsStore in sequence:
            continue

        # each sequences is written like this "{patientID}_{sequenceType}_{amountOfSequence}_{NameOfSequence}"

        sequenceType = sequence.split("_")[1] # like T1, T1CE, T2...
        sequenceName = sequence.split("_", 3)[3]
        #print(f"{sequenceType}: {sequenceName}")

        if sequenceType == config.desiredSequences.T1.value:
            # T1
            T1_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.T1CE.value:
            # T1CE
            T1CE_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.T2.value:
            # T2
            T2_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.FLAIR.value:
            # FLAIR
            FLAIR_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.STERN.value:
            # STERN
            STERN_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.DWI.value:
            # DWI
            DWI_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.ADC.value:
            # ADC
            ADC_sequences.append(sequenceName)
        elif sequenceType == config.desiredSequences.MPR.value:
            # MPR
            MPR_sequences.append(sequenceName)
        else:
            print(f"Error: couldn't match sequence name {sequenceName} to sequence type")
    
    patientIDList.append(patientID)

    T1_amount = len(T1_sequences)
    T1CE_amount = len(T1CE_sequences)
    T2_amount = len(T2_sequences)
    FLAIR_amount = len(FLAIR_sequences)
    STERN_amount = len(STERN_sequences)
    DWI_amount = len(DWI_sequences)
    ADC_amount = len(ADC_sequences)
    MPR_amount = len(MPR_sequences)

    for sequenceAmounts in [T1_amount, T1CE_amount, T2_amount, FLAIR_amount, STERN_amount, DWI_amount, ADC_amount, MPR_amount]:
        if sequenceAmounts > 1:
            has_duplicates = True

    patientsLists.append([T1_amount, T1_sequences,
                          T1CE_amount, T1CE_sequences,
                          T2_amount, T2_sequences,
                          FLAIR_amount, FLAIR_sequences,
                          STERN_amount, STERN_sequences,
                          DWI_amount, DWI_sequences,
                          ADC_amount, ADC_sequences,
                          MPR_amount, MPR_sequences,
                          has_duplicates])

patientsLists.pop(0)

first_modified_data_DF = pd.DataFrame(patientsLists,
                                      columns = ["T1 amount", "T1 sequences",
                                                 "T1CE amount", "T1CE sequences",
                                                 "T2 amount", "T2 sequences",
                                                 "FLAIR amount", "FLAIR sequences",
                                                 "STERN amount", "STERN sequences",
                                                 "DWI amount", "DWI sequences",
                                                 "ADC amount", "ADC sequences",
                                                 "MPR amount", "MPR sequences",
                                                 "has duplicates"],
                                       index = patientIDList)

100%|██████████| 321/321 [00:00<00:00, 5819.20it/s]
