In [1]:
import os
import pandas as pd

# Define the base directory for the D2_TCPW dataset
base_dir = '../data/UT-EndoMRI/D2_TCPW/'

# List all patient directories
try:
    patient_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d)) and d.startswith('D2-')]
except FileNotFoundError:
    print(f"Error: The directory '{base_dir}' was not found. Please check your data folder structure.")
    patient_dirs = []

# A list to hold the data for each patient
audit_data = []

# Define the file types we are interested in checking for
file_suffixes = {
    'mri_t2fs': '_T2FS.nii.gz',
    'mri_t1': '_T1.nii.gz',
    'mri_t1fs': '_T1FS.nii.gz',
    'mri_t2': '_T2.nii.gz',
    'mask_ut': '_ut.nii.gz',
    'mask_ov': '_ov.nii.gz',
    'mask_cy': '_cy.nii.gz'
}

# Loop through each patient directory and check for files
for patient_id in sorted(patient_dirs):
    patient_path = os.path.join(base_dir, patient_id)
    
    # This dictionary will store the presence (True/False) of each file for the current patient
    patient_info = {'patient_id': patient_id}
    
    # Store the full path for the T2FS MRI and ovary mask if they exist
    t2fs_path = os.path.join(patient_path, f"{patient_id}{file_suffixes['mri_t2fs']}")
    ov_mask_path = os.path.join(patient_path, f"{patient_id}{file_suffixes['mask_ov']}")
    
    patient_info['mri_path'] = t2fs_path if os.path.exists(t2fs_path) else None
    patient_info['mask_path'] = ov_mask_path if os.path.exists(ov_mask_path) else None

    # Check for the existence of each file type
    for key, suffix in file_suffixes.items():
        file_path = os.path.join(patient_path, f"{patient_id}{suffix}")
        patient_info[f'has_{key}'] = os.path.exists(file_path)
        
    audit_data.append(patient_info)

# Create a DataFrame from the audit data
if audit_data:
    audit_df = pd.DataFrame(audit_data)
    
    # Save the DataFrame to a CSV file
    audit_df.to_csv('../data/d2_data_audit.csv', index=False)
    
    print("Data audit complete. Overview saved to '../data/d2_data_audit.csv'")
    
    # --- Summary ---
    print("\n--- Data Audit Summary ---")
    
    # Count patients with both T2FS and Ovary mask (as per the paper's criteria)
    eligible_patients = audit_df[(audit_df['has_mri_t2fs'] == True) & (audit_df['has_mask_ov'] == True)]
    num_eligible = len(eligible_patients)
    total_patients = len(audit_df)
    
    print(f"Total patients in D2_TCPW: {total_patients}")
    print(f"Patients with an ovary mask ('_ov.nii.gz'): {audit_df['has_mask_ov'].sum()}")
    print(f"Patients with a T2FS MRI ('_T2FS.nii.gz'): {audit_df['has_mri_t2fs'].sum()}")
    print(f"\nPatients eligible for the pipeline (have BOTH T2FS and ovary mask): {num_eligible}")
    
    print("\nFirst 5 rows of the audit file:")
    print(audit_df.head())
else:
    print("No patient directories found to audit.")

Data audit complete. Overview saved to '../data/d2_data_audit.csv'

--- Data Audit Summary ---
Total patients in D2_TCPW: 73
Patients with an ovary mask ('_ov.nii.gz'): 58
Patients with a T2FS MRI ('_T2FS.nii.gz'): 70

Patients eligible for the pipeline (have BOTH T2FS and ovary mask): 56

First 5 rows of the audit file:
  patient_id                                           mri_path  \
0     D2-000  ../data/UT-EndoMRI/D2_TCPW/D2-000\D2-000_T2FS....   
1     D2-001  ../data/UT-EndoMRI/D2_TCPW/D2-001\D2-001_T2FS....   
2     D2-002  ../data/UT-EndoMRI/D2_TCPW/D2-002\D2-002_T2FS....   
3     D2-003  ../data/UT-EndoMRI/D2_TCPW/D2-003\D2-003_T2FS....   
4     D2-004  ../data/UT-EndoMRI/D2_TCPW/D2-004\D2-004_T2FS....   

                                           mask_path  has_mri_t2fs  \
0  ../data/UT-EndoMRI/D2_TCPW/D2-000\D2-000_ov.ni...          True   
1  ../data/UT-EndoMRI/D2_TCPW/D2-001\D2-001_ov.ni...          True   
2                                               None          Tr