In [2]:
import SimpleITK as sitk
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

## Find Path

In [3]:
# Entries of the dataset folder, sorted by name; the cell displays how many there are.
f = os.listdir('picai_public_images_fold0')
f.sort()
len(f)

295

In [4]:
# Relative path of every file found inside each sub-folder listed in `f`,
# visiting sub-folders and their files in sorted order.
name_list = [
    f'picai_public_images_fold0/{folder}/{file_name}'
    for folder in f
    for file_name in sorted(os.listdir(f'picai_public_images_fold0/{folder}'))
]
len(name_list)

1500

In [5]:
# Display every collected file path.
name_list

['picai_public_images_fold0/10000/10000_1000000_adc.mha',
 'picai_public_images_fold0/10000/10000_1000000_cor.mha',
 'picai_public_images_fold0/10000/10000_1000000_hbv.mha',
 'picai_public_images_fold0/10000/10000_1000000_sag.mha',
 'picai_public_images_fold0/10000/10000_1000000_t2w.mha',
 'picai_public_images_fold0/10001/10001_1000001_adc.mha',
 'picai_public_images_fold0/10001/10001_1000001_cor.mha',
 'picai_public_images_fold0/10001/10001_1000001_hbv.mha',
 'picai_public_images_fold0/10001/10001_1000001_sag.mha',
 'picai_public_images_fold0/10001/10001_1000001_t2w.mha',
 'picai_public_images_fold0/10003/10003_1000003_adc.mha',
 'picai_public_images_fold0/10003/10003_1000003_cor.mha',
 'picai_public_images_fold0/10003/10003_1000003_hbv.mha',
 'picai_public_images_fold0/10003/10003_1000003_sag.mha',
 'picai_public_images_fold0/10003/10003_1000003_t2w.mha',
 'picai_public_images_fold0/10006/10006_1000006_adc.mha',
 'picai_public_images_fold0/10006/10006_1000006_cor.mha',
 'picai_public

In [6]:
# Group the paths by the token between the last underscore and the first
# following dot (e.g. '..._adc.mha' -> 'adc').
grouped_files = {}

for path in name_list:
    file_tail = path.rsplit('_', 1)[-1]
    suffix = file_tail.split('.')[0]
    grouped_files.setdefault(suffix, []).append(path)

grouped_files

{'adc': ['picai_public_images_fold0/10000/10000_1000000_adc.mha',
  'picai_public_images_fold0/10001/10001_1000001_adc.mha',
  'picai_public_images_fold0/10003/10003_1000003_adc.mha',
  'picai_public_images_fold0/10006/10006_1000006_adc.mha',
  'picai_public_images_fold0/10017/10017_1000017_adc.mha',
  'picai_public_images_fold0/10020/10020_1000020_adc.mha',
  'picai_public_images_fold0/10022/10022_1000022_adc.mha',
  'picai_public_images_fold0/10023/10023_1000023_adc.mha',
  'picai_public_images_fold0/10027/10027_1000027_adc.mha',
  'picai_public_images_fold0/10029/10029_1000029_adc.mha',
  'picai_public_images_fold0/10032/10032_1000032_adc.mha',
  'picai_public_images_fold0/10033/10033_1000033_adc.mha',
  'picai_public_images_fold0/10035/10035_1000035_adc.mha',
  'picai_public_images_fold0/10038/10038_1000038_adc.mha',
  'picai_public_images_fold0/10040/10040_1000040_adc.mha',
  'picai_public_images_fold0/10041/10041_1000041_adc.mha',
  'picai_public_images_fold0/10045/10045_1000045_

In [7]:
# Show the suffixes that ended up as group names.
grouped_files.keys()

dict_keys(['adc', 'cor', 'hbv', 'sag', 't2w'])

## Show metadata

In [None]:
def show_metadata(g_file):
    """Print every metadata entry of each image file in ``g_file``.

    Each file is read with ``sitk.ReadImage``; one ``key: value`` line is
    printed per metadata key, followed by a 70-dash separator line per file.

    Parameters
    ----------
    g_file : iterable of str
        Paths of the image files to inspect.
    """
    separator = '-' * 70

    for file_path in g_file:
        volume = sitk.ReadImage(file_path)

        for tag in volume.GetMetaDataKeys():
            value = volume.GetMetaData(tag)
            print(f"{tag}: {value}")

        print(separator)

In [None]:
# Print the metadata of the first five files in the 'cor' group.
show_metadata(grouped_files['cor'][:5])

In [None]:
# Print the metadata of the first five files in the 'hbv' group.
show_metadata(grouped_files['hbv'][:5])

In [None]:
# Print the metadata of the first five files in the 'sag' group.
show_metadata(grouped_files['sag'][:5])

In [None]:
# Print the metadata of the first five files in the 't2w' group.
show_metadata(grouped_files['t2w'][:5])

In [None]:
# Print the metadata of the first five files in the 'adc' group.
show_metadata(grouped_files['adc'][:5])

## Separate DataFrame by filetype

In [None]:
def df_builder(g_file, column):
    """Collect the metadata of several image files into one DataFrame.

    Each file is read with ``sitk.ReadImage``. Its metadata values are paired
    *positionally* with the labels in ``column`` (first metadata key -> first
    label, and so on), so ``column`` must list its labels in the same order
    in which ``GetMetaDataKeys()`` returns the keys. Surplus keys or surplus
    labels are ignored by the pairing.

    Parameters
    ----------
    g_file : iterable of str
        Paths of the image files to read.
    column : list of str
        Column labels of the resulting frame.

    Returns
    -------
    pandas.DataFrame
        One row per file, in input order, with exactly the columns in
        ``column``. A label that has no metadata key to pair with is left
        missing (NaN) for that row. An empty ``g_file`` yields an empty frame
        that still carries the requested columns.
    """
    records = []

    for path in g_file:
        image = sitk.ReadImage(path)
        keys = image.GetMetaDataKeys()

        # Build a fresh record for every file: sharing one dict across files
        # would let a file with fewer keys inherit the previous file's values.
        records.append(
            {label: image.GetMetaData(key) for key, label in zip(keys, column)}
        )

    # Construct the frame once instead of concatenating row by row, which
    # copies the whole frame on every iteration.
    return pd.DataFrame(records, columns=column)

In [None]:
# Column labels for the frames built by df_builder, which pairs them
# positionally with the metadata keys of each image, so the order matters.
k = ['Study Date', 'Modality', 'Manufacturer', "Manufacturer's Model Name",
     'Patient ID', "Patient's Sex", "Patient's Age", "Patient Identity Removed",
     "Study Instance UID", 'ANONYMISATION_SCRIPT', 'ITK_InputFilterName', 'ITK_original_direction',
     'ITK_original_spacing', 'Modality (Repeated)', 'PROSTATE_VOLUME_REPORT', 'PSAD_REPORT', 'PSA_REPORT']

# Labels for the 'hbv' files: one more entry than `k` (the diffusion factor).
# NOTE(review): these labels use underscores where `k` uses spaces, and two of
# them ("Study Instance_UID", "Diffusion_sensitization_factor sec/mm") mix both.
# Later cells address columns by these exact strings, so rename with care.
k_hbv = ['Study_Date', 'Modality', 'Manufacturer', "Manufacturer's_Model_Name",
     'Patient_ID', "Patient's_Sex", "Patient's_Age", "Patient_Identity_Removed", "Diffusion_sensitization_factor sec/mm",
     "Study Instance_UID", 'ANONYMISATION_SCRIPT', 'ITK_InputFilterName', 'ITK_original_direction',
     'ITK_original_spacing', 'Modality_(Repeated)', 'PROSTATE_VOLUME_REPORT', 'PSAD_REPORT', 'PSA_REPORT']

# Build the metadata frame for the 'adc' files and display it.
df_adc = df_builder(grouped_files['adc'], column=k)
df_adc

In [None]:
# Build the metadata frame for the 'cor' files and display it.
df_cor = df_builder(grouped_files['cor'], column=k)
df_cor

In [None]:
# Build the metadata frame for the 'sag' files and display it.
df_sag = df_builder(grouped_files['sag'], column=k)
df_sag

In [None]:
# Build the metadata frame for the 't2w' files and display it.
df_t2w = df_builder(grouped_files['t2w'], column=k)
df_t2w

In [None]:
# Build the metadata frame for the 'hbv' files (labelled with k_hbv, not k) and display it.
df_hbv = df_builder(grouped_files['hbv'], column=k_hbv)
df_hbv

## EDA DataFrame Metadata

In [None]:
# Number of fully duplicated rows in each frame, printed in the order
# adc, cor, hbv, sag, t2w (hbv is checked against its own label list).
for frame, labels in (
    (df_adc, k),
    (df_cor, k),
    (df_hbv, k_hbv),
    (df_sag, k),
    (df_t2w, k),
):
    duplicate_rows = frame.duplicated(labels)
    print(duplicate_rows.sum())

In [None]:
# Summarise df_adc (column dtypes and non-null counts).
df_adc.info()

In [None]:
# Parse the study-date column of every frame into datetimes, in place.
# The hbv frame names that column with an underscore.
for frame, date_col in (
    (df_adc, 'Study Date'),
    (df_cor, 'Study Date'),
    (df_sag, 'Study Date'),
    (df_t2w, 'Study Date'),
    (df_hbv, 'Study_Date'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [None]:
def patient_age(df, column):
    """Convert a column of age strings to integers, in place.

    The last character of every entry is dropped (presumably a unit suffix,
    e.g. ``'073Y'`` -> ``73``; confirm against the data) and the remainder is
    parsed with ``int``. If the column is already numeric the frame is
    returned untouched, so re-running the calling cell is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the age column; it is modified in place.
    column : str
        Label of the age column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If an entry minus its last character is not a valid integer.
    """
    ages = df[column]

    # Already converted by an earlier run: slicing an int would raise TypeError.
    if pd.api.types.is_numeric_dtype(ages):
        return df

    df[column] = [int(item[:-1]) for item in ages]
    return df

In [None]:
# Convert the age column of every frame to integers (done in place by
# patient_age). The hbv frame names that column with an underscore.
for frame, age_col in (
    (df_adc, "Patient's Age"),
    (df_cor, "Patient's Age"),
    (df_sag, "Patient's Age"),
    (df_t2w, "Patient's Age"),
    (df_hbv, "Patient's_Age"),
):
    patient_age(frame, age_col)
print('done')

In [None]:
def df_value_counts(df, columns):
    """Print the value counts of each listed column of ``df``.

    For every label in ``columns`` the result of ``value_counts()`` on that
    column is printed, followed by a 70-dash separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    columns : iterable of str
        Labels of the columns to report, in print order.
    """
    rule = "-" * 70
    for label in columns:
        counts = df[label].value_counts()
        print(counts)
        print(rule)

In [None]:
# Print the value counts of every column of df_adc listed in k.
df_value_counts(df_adc, k)

In [None]:
# Turn the literal string 'nan' into a real missing value in every frame.
# Each name is rebound to the new frame returned by replace().
df_adc, df_cor, df_hbv, df_sag, df_t2w = (
    frame.replace('nan', np.nan)
    for frame in (df_adc, df_cor, df_hbv, df_sag, df_t2w)
)

In [None]:
# Summarise df_adc again (column dtypes and non-null counts).
df_adc.info()

In [None]:
# Count the missing values in each column of df_adc.
df_adc.isna().sum()

In [None]:
# Count the missing values in each column of df_cor.
df_cor.isna().sum()

In [None]:
# Count the missing values in each column of df_t2w.
df_t2w.isna().sum()

In [None]:
# Count the missing values in each column of df_hbv.
df_hbv.isna().sum()

In [None]:
# Count the missing values in each column of df_sag.
df_sag.isna().sum()

In [None]:
# Display df_hbv.
df_hbv