# Get old MRI sequences and move them into correct directory

In [40]:
import os
from tqdm import tqdm

import sys
sys.path.append(r"/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/Brain_Mets_Classification")

import shutil
import pandas as pd
import math
from dateutil.relativedelta import relativedelta

import brain_mets_classification.config as config
import brain_mets_classification.custom_funcs as funcs

## Copy patient files into new directory

In [26]:
# Destination dataset used for classification (BIDS "rawdata" directory).
path_to_brain_mets_classification = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_classification/rawdata"

# Intermediate BIDS-style directory the old patients are copied into.
# Layout: sub-[PATIENT ID]/anat/sub-[PATIENT ID]_[SEQUENCE TYPE].nii.gz
path_to_bids_brain_mets_regensburg = "/Volumes/BrainMets/Rgb_Brain_Mets/bids_old_patients"

# Raw export of the old patients.
# Layout: [PATIENT ID]_[DATE OF RECORDING]/[PATIENT ID]_[DATE OF RECORDING]_[SEQUENCE TYPE].nii.gz
path_to_old_patients = "/Volumes/BrainMets/Rgb_Brain_Mets/Regensburg_BrainMetDataset_raw_20240124"

In [20]:
# Copy the T1, T1c, T2 and FLAIR volumes of every old patient into a BIDS-style
# tree (sub-[patientID]/anat/sub-[patientID]_[suffix].nii.gz) and remember the
# recording timestamp of each patient in mri_date_dict for the age calculation.

# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_old_patients) if os.path.isdir(os.path.join(path_to_old_patients, folder))
]

# (marker in the raw file name, suffix of the new file name).
# Order matters: the first matching marker wins, so "FLAIR" is tested before
# "T2" and "T1CE" before "T1". Files matching none of them (e.g. MPRAGE) are skipped.
sequence_markers_to_suffix = (
    ("FLAIR", "FLAIR"),
    ("T2", "T2w"),
    ("T1CE", "T1c"),
    ("T1", "T1w"),
)

# patientID (without "sub-" prefix) -> time of recording as found in the folder name
mri_date_dict = {}

# loop through the patients
for patientFolder in tqdm(patientFolders):

    # ignores the ds_folders
    if config.dsStore in patientFolder:
        continue

    # folder names look like [PATIENT ID]_[DATE OF RECORDING]
    folder_name_parts = patientFolder.split("_")
    patientID = folder_name_parts[0]
    time_of_recording = folder_name_parts[1] #yyyymmddhhmmss
    mri_date_dict[patientID] = time_of_recording

    # NOTE(review): `patient` is not used below; kept in case the constructor
    # validates the ID or has side effects - confirm and remove if not.
    patient = config.patient(id = patientID)

    # create new folder for patient in path_to_bids
    bids_patient = funcs.createPatientFolderBIDS(path = os.path.join(path_to_bids_brain_mets_regensburg), patientID = patientID)

    # create anat folder within patientFolder; exist_ok so that re-running the
    # cell does not fail on folders created by a previous run
    path_to_anat_directory = f"{path_to_bids_brain_mets_regensburg}/{bids_patient}/anat"
    os.makedirs(path_to_anat_directory, exist_ok=True)

    # get list of sequences (NIfTI files only, hidden files excluded)
    path_to_patient_folder = f"{path_to_old_patients}/{patientFolder}"
    list_of_sequences = [
        sequence for sequence in os.listdir(path_to_patient_folder) if (".nii" in sequence and not sequence.startswith("."))
    ]

    for sequence in list_of_sequences:
        # pick the suffix of the first marker contained in the file name
        suffix = next(
            (new_suffix for marker, new_suffix in sequence_markers_to_suffix if marker in sequence),
            None,
        )

        if suffix is None: # MPRAGE sequence can be skipped
            continue

        # new file names: sub-[patientID]_[suffix].nii.gz
        new_file_name = f"sub-{patientID}_{suffix}.nii.gz"
        path_to_source = f"{path_to_patient_folder}/{sequence}"
        path_to_destination = f"{path_to_anat_directory}/{new_file_name}"

        shutil.copy(path_to_source, path_to_destination)

100%|██████████| 231/231 [00:21<00:00, 10.77it/s]


All files were then copied manually into the `brain_mets_classification/rawdata` directory.

## Get patient infos and update the .tsv file

In [54]:
# Load the clinical table of the old patients and the participants table that
# already exists in rawdata, then count the patient folders that are not yet
# listed in the participants table.
path_to_patients_csv = "/Users/LennartPhilipp/Desktop/Uni/Prowiss/Code/oldpatientsIDsexbirthdateprimary.csv"
excel_patients = pd.read_csv(path_to_patients_csv)

path_to_tsv = "/Volumes/BrainMets/Rgb_Brain_Mets/brain_mets_classification/rawdata/participants.tsv"
tsv_patients = pd.read_csv(path_to_tsv, sep="\t")

# get the ids of all the patients that are already in the .tsv file
tsv_patient_ids = tsv_patients["participant_id"].values

# array to store all the patient ids that arent in the .tsv file
new_patient_ids = []

# get list of all the patients that are not yet in the tsv file
# get patient folders
patientFolders = [
    folder for folder in os.listdir(path_to_brain_mets_classification) if os.path.isdir(os.path.join(path_to_brain_mets_classification, folder))
]

for folder in tqdm(patientFolders):

    # ignores the ds_folders
    # (checks the folder of the current iteration, not a variable left over
    # from another cell)
    if config.dsStore in folder:
        continue

    patientID = folder

    if patientID not in tsv_patient_ids:
        new_patient_ids.append(patientID)

print(len(new_patient_ids))

# drop the bookkeeping columns of the spreadsheet export that are not needed
# for the participants table
excel_patients = excel_patients.drop(columns=["on_server", "Unnamed: 0", "in_study (0 = no, 1 = yes, 2 = tbd, 3 = remarkable)"])

# helpers: turn a numeric ID into a zero-padded 8 character string, and add "sub-" in front of such a string
def convert_patientID_to_str(id):
    """Return the patient ID as a decimal string left-padded with zeros to 8 characters.

    Args:
        id: numeric patient ID (int, float or numpy number); it is truncated
            to an integer with ``int(id)`` before formatting.

    Returns:
        The ID as a string of at least 8 characters. IDs with more than
        8 digits are returned unpadded and are not shortened.

    Raises:
        ValueError: if ``id`` cannot be converted to an int (e.g. NaN).
    """
    # zfill replaces the manual digit counting via math.log10, which could not
    # handle an ID of 0 (log10(0) raises ValueError)
    return str(int(id)).zfill(8)

def convert_patientID_to_bids(id):
    """Prefix an ID string with "sub-" and return the result.

    Args:
        id: patient ID as a string.

    Returns:
        The string "sub-" followed by ``id``.
    """
    bids_prefix = "sub-"
    return bids_prefix + id

# Build the participants rows of the old patients (ID, sex, age at MRI, primary)
# from the clinical table, merge them with the existing participants table and
# write the result to all_participants.tsv.

# convert the ID ints to 8 character strings
excel_patients["ID"] = excel_patients["ID"].apply(convert_patientID_to_str)

# add the mri_date column
# (mri_date_dict is keyed by the ID without "sub-" prefix, so this has to
# happen before the prefix is added)
excel_patients["mri_date"] = excel_patients["ID"].map(mri_date_dict)

# add "sub-" before each patientID
excel_patients["ID"] = excel_patients["ID"].apply(convert_patientID_to_bids)

# convert strings to datetime
excel_patients["mri_date"] = pd.to_datetime(excel_patients["mri_date"])
excel_patients["birthdate"] = pd.to_datetime(excel_patients["birthdate"])

# keep only the rows that have an mri_date, i.e. patients found in mri_date_dict
dataset_patients = excel_patients.loc[excel_patients["mri_date"].notna()]

# keep only one row per patient (some patients are listed multiple times)
dataset_patients = dataset_patients.drop_duplicates(subset=['ID'], keep='first')

# create a dict with patientID : age (full years between birthdate and MRI date)
# built in a comprehension so that no loop variables (in particular `id`)
# leak into the notebook namespace
age_dict = {
    row["ID"]: relativedelta(row["mri_date"], row["birthdate"]).years
    for _, row in dataset_patients.iterrows()
}

# add new age column (assign returns a new frame instead of writing into a
# filtered one)
dataset_patients = dataset_patients.assign(age=dataset_patients["ID"].map(age_dict))

# remove mri_date and birthdate columns to preserve anonymity
dataset_patients = dataset_patients.drop(columns=["mri_date", "birthdate"])

dataset_patients = dataset_patients.rename(columns={"ID":"participant_id"})

# add them to the tsv file
all_classification_patients = pd.concat([tsv_patients, dataset_patients])

# remove duplicates (rows from the existing tsv come first and therefore win)
all_classification_patients = all_classification_patients.drop_duplicates(subset=['participant_id'], keep='first')

# sort by participant_id (string sort)
all_classification_patients = all_classification_patients.sort_values("participant_id")

# replace "w" in sex column with "f"
# assigned back explicitly: an inplace replace on a selected column is a
# chained operation that pandas does not guarantee to write through to the frame
all_classification_patients["sex (m/w)"] = all_classification_patients["sex (m/w)"].replace("w", "f")

# rename primary column
all_classification_patients = all_classification_patients.rename(columns={"primary_coded_newv":"primary", "sex (m/w)":"sex (m/f)"})

# move primary column all the way to the right
all_classification_patients = all_classification_patients[["participant_id", "sex (m/f)", "age", "primary"]]

# save the new file
all_classification_patients.to_csv(f"{path_to_brain_mets_classification}/all_participants.tsv", sep = "\t", index = False)

100%|██████████| 467/467 [00:00<00:00, 255776.96it/s]

187





Two patients in the list are CUP (cancer of unknown primary) patients and should not be part of the classification set.

In [67]:
# Delete every patient folder in rawdata whose name is not listed as a
# participant_id in all_participants.tsv.
# WARNING: this permanently removes data from disk.
all_participants_tsv = pd.read_csv(f"{path_to_brain_mets_classification}/all_participants.tsv", sep="\t")

tsv_ids = all_participants_tsv["participant_id"].values
print(f"tsv count: {len(tsv_ids)}")

# all non-hidden directories in rawdata
patientFolders = [
    folder for folder in os.listdir(path_to_brain_mets_classification) if (os.path.isdir(os.path.join(path_to_brain_mets_classification, folder)) and not folder.startswith("."))
]

print(f"folder count: {len(patientFolders)}")

for folder in tqdm(patientFolders):

    if folder not in tsv_ids:
        path_to_folder = os.path.join(path_to_brain_mets_classification, folder)
        # shutil.rmtree instead of a shell "rm -r": works for paths with
        # spaces or shell metacharacters and raises if the removal fails
        shutil.rmtree(path_to_folder)

tsv count: 465
folder count: 465


100%|██████████| 465/465 [00:00<00:00, 119163.64it/s]
