In [73]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import datasets
import os
import SimpleITK as sitk
from radiomics import featureextractor, shape, shape2D
from sklearn.model_selection import train_test_split
from collections import defaultdict
import datetime


In [74]:

marksheet_path = r'Data\marksheet.csv'
source_PI_CAI = r'Data/picai_public_images'
source_PI_CAI_whole_gland_Guerbet23 = r'Data/picai_public_images/anatomical_delineations/whole_gland/AI/Guerbet23'
source_PI_CAI_Human = r'Data/picai_public_images/csPCa_lesion_delineations/human_expert/resampled'

def find_t2W_files(dir, combined_ids, fileinfo):
    dir = os.path.normpath(dir)
    t2w_file_paths = []

    for subdir, dirs, files in os.walk(dir):
        subdir = os.path.normpath(subdir)
        for file in files:
            if any(file.endswith(combined_id + fileinfo) for combined_id in combined_ids):
                t2w_file_paths.append(os.path.join(subdir, file))

    return t2w_file_paths

def preprocess_data(marksheet_df):
    print(f"Initial dataset length: {len(marksheet_df)}")

    # 1. Drop data with missing values in the column "PSA" or "Volume"
    marksheet_df = marksheet_df.dropna(subset=['psa', 'prostate_volume'])
    print(f"Dataset length after dropping rows with missing data: {len(marksheet_df)}")
    
    # 2. Filter out duplicates (patients with multiple visits)
    marksheet_df = marksheet_df.sort_values(by=['patient_id', 'mri_date']).drop_duplicates(subset=['patient_id'], keep='first')
    print(f"Dataset length after dropping duplicates: {len(marksheet_df)}")

    # 3. Filter out patients with no biopsy
    biopsy_filter = marksheet_df['histopath_type'].isin(['MRBx', 'SysBx', 'SysBx+MRBx'])
    marksheet_df = marksheet_df[biopsy_filter]
    print(f"Dataset length after filtering biopsies: {len(marksheet_df)}")

    # 4. Convert case_csPCa to binary (1: True, 0: False)
    marksheet_df['case_csPCa'] = marksheet_df['case_csPCa'].apply(lambda x: 1 if x.upper() == 'Yes' else 0)
    
    # Combined ids for use as a unique identifier
    sorted_patient_id = marksheet_df['patient_id']
    sorted_study_id = marksheet_df['study_id']
    combined_ids = sorted_patient_id.astype(str) + '_' + sorted_study_id.astype(str)

    return marksheet_df, combined_ids

def save_df(df, dir, filename, gland_type, date=None):
    if date is None:
        date = datetime.date.today().strftime("%Y%m%d")
    
    path = os.path.join(dir, f"{filename}_{gland_type}_{date}.csv")
    df.to_csv(path, index=False)
    print(f"File saved as {path}")


In [75]:
marksheet_df = pd.read_csv(marksheet_path)
Preprocessed_marksheet_df, combined_ids = preprocess_data(marksheet_df)
Preprocessed_marksheet_df


Initial dataset length: 1500
Dataset length after dropping rows with missing data: 1439
Dataset length after dropping duplicates: 1419
Dataset length after filtering biopsies: 929


Unnamed: 0,patient_id,study_id,mri_date,patient_age,psa,psad,prostate_volume,histopath_type,lesion_GS,lesion_ISUP,case_ISUP,case_csPCa,center
0,10000,1000000,2019-07-02,73,7.70,,55.0,MRBx,0+0,0,0,0,PCNN
3,10003,1000003,2019-04-05,72,13.00,,71.5,SysBx,0+0,0,0,0,ZGT
4,10004,1000004,2020-10-21,67,8.00,0.10,78.0,SysBx+MRBx,"0+0,0+0",00,0,0,RUMC
5,10005,1000005,2012-07-18,64,12.10,0.24,51.0,MRBx,"4+3,0+0",30,3,0,RUMC
6,10006,1000006,2020-10-23,73,6.20,0.23,27.0,SysBx+MRBx,"0+0,3+3",01,1,0,ZGT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,11471,1001495,2012-08-25,71,12.50,0.21,62.0,MRBx,"3+4,N/A,3+3",21,2,0,RUMC
1496,11472,1001496,2019-06-28,81,5.28,0.12,44.0,SysBx+MRBx,3+4,2,2,0,RUMC
1497,11473,1001497,2017-09-24,56,29.60,0.34,87.0,MRBx,0+0,0,0,0,RUMC
1498,11474,1001498,2016-05-03,71,12.00,,83.0,MRBx,3+3,1,1,0,PCNN


In [None]:

fileInfoT2w = '_t2w.mha'

fileInfo = '.nii.gz'
t2w_paths = find_t2W_files(source_PI_CAI, combined_ids, fileInfoT2w)
Guerbet23_nii_gz_paths = find_t2W_files(source_PI_CAI_whole_gland_Guerbet23, combined_ids, fileInfo)
Human_nii_gz_paths = find_t2W_files(source_PI_CAI_Human, combined_ids, fileInfo)

['Data\\picai_public_images\\picai_public_images_fold0\\10000\\10000_1000000_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10003\\10003_1000003_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10006\\10006_1000006_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10022\\10022_1000022_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10023\\10023_1000023_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10027\\10027_1000027_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10029\\10029_1000029_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10032\\10032_1000032_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10035\\10035_1000035_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10038\\10038_1000038_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0\\10040\\10040_1000040_t2w.mha', 'Data\\picai_public_images\\picai_public_images_fold0

In [89]:
t2w_patient_ids = [path.split('\\')[-2] for path in t2w_paths]
Guerbet23_patient_ids = [path.split('\\')[-1].split("_")[0] for path in Guerbet23_nii_gz_paths]

t2w_df = pd.DataFrame({'patient_id': t2w_patient_ids, 'T2w_path': t2w_paths})
Guerbet23_df = pd.DataFrame({'Nii_gz_path': Guerbet23_nii_gz_paths, 'patient_id': Guerbet23_patient_ids})

df_path_merged = pd.merge(t2w_df, Guerbet23_df, on='patient_id')
df_path_merged


Unnamed: 0,patient_id,T2w_path,Nii_gz_path
0,10000,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
1,10003,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
2,10006,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
3,10022,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
4,10023,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
...,...,...,...
924,11462,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
925,11463,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
926,11464,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
927,11470,Data\picai_public_images\picai_public_images_f...,Data\picai_public_images\anatomical_delineatio...
