In [1]:
import numpy as np
import nibabel
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

In [2]:
root = Path('/anvar/public_datasets/preproc_study/bgpd/')

In [35]:
def get_pixel_arrays(root, folder):
    img = dict()

#     img['BRAIN'] = nibabel.load(root / '5_ss_shared' / folder.name / 'FLAIR_mask.nii.gz').get_fdata() # BGPD
#     for modality in ['CT1', 'FLAIR', 'T1', 'T2', 'mask_GTV_FLAIR']: # BGPD
    img['BRAIN'] = nibabel.load(root / '5_ss_shared' / folder.name / 'CT1_mask.nii.gz').get_fdata() # GBM
    for modality in ['CT1', 'FLAIR', 'T1', 'T2', 'CT1_SEG']: # GBM
        img[modality] = nibabel.load(folder / f'{modality}.nii.gz').get_fdata()

    img_healthy = dict()
    img_tumour = dict()
    for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
        img_healthy[modality] = img[modality][img['BRAIN'].astype(bool)^img['CT1_SEG'].astype(bool)]
        img_tumour[modality] = img[modality][img['CT1_SEG'].astype(bool)]

    return img_healthy, img_tumour

In [82]:
def get_iqr(x):
    return np.percentile(x, 75) - np.percentile(x, 25)

def get_n_bins_freedman_diaconis(x):
    n = len(x)
    iqr = get_iqr(x)
    h = 2 * iqr / np.power(n, 1/3)
    return int(np.ceil((np.max(x) - np.min(x)) / h))

def compute_histograms(x, y, n=100):
    size, bins = np.histogram(list(x)+list(y), n)
    x_h, _ = np.histogram(x, bins=bins)
    y_h, _ = np.histogram(y, bins=bins)
    
    return {
        'bins': bins,
        'healthy_h': x_h,
        'tumour_h': y_h
    }

def save_hist(x, y, dst_folder, patient, modality):
    dst_root = dst_folder / patient
    if not dst_root.exists():
        dst_root.mkdir(parents=True)

    # 100
    histogram_data = compute_histograms(x, y, 100)
    with open(dst_root / f'{modality}_bins_100.pkl', 'wb') as f:
        pickle.dump(histogram_data, f)
    
    # Freedman–Diaconis rule
    n = get_n_bins_freedman_diaconis(list(x)+list(y))
    histogram_data = compute_histograms(x, y, n)
    with open(dst_root / f'{modality}_bins_freedman_diaconis.pkl', 'wb') as f:
        pickle.dump(histogram_data, f)
    
    # Sturges' formula
    n = int(np.ceil(np.log2(len(list(x)+list(y)))))+1
    histogram_data = compute_histograms(x, y, n)
    with open(dst_root / f'{modality}_bins_sturges.pkl', 'wb') as f:
        pickle.dump(histogram_data, f)
    
def compute_kl(x_h, y_h):

    x = x_h/x_h.sum()
    y = y_h/y_h.sum()
    
    x = np.clip(x, 10e-9, 1)
    y = np.clip(y, 10e-9, 1)
            
    return (x*np.log2(x/y)).sum(), x, y

def compute_l2(x_h, y_h):
    x = x_h/x_h.sum()
    y = y_h/y_h.sum()
    
    return np.sqrt(((x-y)**2).sum())

def compute_hellinger(x_h, y_h):
    x = x_h/x_h.sum()
    y = y_h/y_h.sum()
    return 1 / np.sqrt(2) * np.sqrt(((np.sqrt(x)-np.sqrt(y))**2).sum())

# Save histograms BGPD

In [5]:
for folder in [
    root / '4a_resamp',
    root / '4b_n4', 
    root / '4d_susan', 
    root / '6_hist' / '6_hist_fold_0']:
    for patient in tqdm(folder.glob('*')):
        healthy, tumour = get_pixel_arrays(root, patient)
        for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
            save_hist(
                healthy[modality],
                tumour[modality],
                Path('/anvar/public_datasets/preproc_study/histograms/bgpd/') / folder.name,
                patient.name,
                modality
            )

180it [21:44,  7.25s/it]
180it [24:14,  8.08s/it]
180it [20:30,  6.84s/it]
180it [21:17,  7.10s/it]


# Save histograms GBM

In [36]:
root = Path('/anvar/public_datasets/preproc_study/gbm/')

In [37]:
for folder in [
    root / '4a_resamp',
    root / '4b_n4', 
    root / '4d_susan', 
    root / '6_hist' / '6_hist_fold_0']:
    for patient in tqdm(folder.glob('*')):
        healthy, tumour = get_pixel_arrays(root, patient)
        for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
            save_hist(
                healthy[modality],
                tumour[modality],
                Path('/anvar/public_datasets/preproc_study/histograms/gbm/') / folder.name,
                patient.name,
                modality
            )

102it [11:43,  6.90s/it]
102it [13:30,  7.95s/it]
102it [11:21,  6.68s/it]
102it [12:49,  7.54s/it]


# Save histograms LGG

In [40]:
root = Path('/anvar/public_datasets/preproc_study/lgg/')

In [41]:
for folder in [
    root / '4a_resamp',
    root / '4b_n4', 
    root / '4d_susan', 
    root / '6_hist' / '6_hist_fold_0']:
    for patient in tqdm(folder.glob('*')):
        healthy, tumour = get_pixel_arrays(root, patient)
        for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
            save_hist(
                healthy[modality],
                tumour[modality],
                Path('/anvar/public_datasets/preproc_study/histograms/lgg/') / folder.name,
                patient.name,
                modality
            )

38it [04:15,  6.71s/it]
38it [05:19,  8.41s/it]
38it [04:11,  6.62s/it]
38it [04:44,  7.50s/it]


# Compute distances BGPD, LGG, GBM

In [57]:
import pandas as pd

In [85]:
root = Path('/anvar/public_datasets/preproc_study/histograms/bgpd')
dfs = []
for experiment in root.glob('*'):
    df = pd.DataFrame()
    exps = []
    kl_distances = []
    js_distances = []
    patients = []
    modalities = []
    bins_list = []
    l2_dist = []
    hellinger_dist = []
    for patient in tqdm(experiment.glob('*')):
        for bins in ['100', 'freedman_diaconis', 'sturges']:
            for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
                with open(patient / f'{modality}_bins_{bins}.pkl', 'rb') as f:
                    histograms = pickle.load(f)
                    healthy_h = histograms['healthy_h']
                    tumour_h = histograms['tumour_h']
                    d, _, _ = compute_kl(healthy_h, tumour_h)
                    d2, _, _ = compute_kl(tumour_h, healthy_h)
                    l2 = compute_l2(healthy_h, tumour_h)
                    hd = compute_hellinger(healthy_h, tumour_h) 
                    
                    exps.append(experiment.name)
                    kl_distances.append(d)
                    js_distances.append(np.sqrt((d+d2)/2))
                    patients.append(patient.name)
                    modalities.append(modality)
                    bins_list.append(bins)
                    l2_dist.append(l2)
                    hellinger_dist.append(hd)
    df['exp_name'] = exps
    df['KL'] = kl_distances
    df['JS'] = js_distances
    df['L2'] = l2_dist
    df['Hellinger'] = hellinger_dist
    df['patient_name'] = patients
    df['modality'] = modalities
    df['bins_choice'] = bins_list
    dfs.append(df)
df = pd.concat(dfs)
df.to_csv('~/hist_stats_bgpd.csv')

180it [00:00, 420.80it/s]
180it [00:00, 424.30it/s]
180it [00:00, 435.61it/s]
180it [00:00, 423.55it/s]


In [86]:
root = Path('/anvar/public_datasets/preproc_study/histograms/lgg')
dfs = []
for experiment in root.glob('*'):
    df = pd.DataFrame()
    exps = []
    kl_distances = []
    js_distances = []
    patients = []
    modalities = []
    bins_list = []
    l2_dist = []
    hellinger_dist = []
    for patient in tqdm(experiment.glob('*')):
        for bins in ['100', 'freedman_diaconis', 'sturges']:
            for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
                with open(patient / f'{modality}_bins_{bins}.pkl', 'rb') as f:
                    histograms = pickle.load(f)
                    healthy_h = histograms['healthy_h']
                    tumour_h = histograms['tumour_h']
                    d, _, _ = compute_kl(healthy_h, tumour_h)
                    d2, _, _ = compute_kl(tumour_h, healthy_h)
                    l2 = compute_l2(healthy_h, tumour_h)
                    hd = compute_hellinger(healthy_h, tumour_h) 
                    
                    exps.append(experiment.name)
                    kl_distances.append(d)
                    js_distances.append(np.sqrt((d+d2)/2))
                    patients.append(patient.name)
                    modalities.append(modality)
                    bins_list.append(bins)
                    l2_dist.append(l2)
                    hellinger_dist.append(hd)
    df['exp_name'] = exps
    df['KL'] = kl_distances
    df['JS'] = js_distances
    df['L2'] = l2_dist
    df['Hellinger'] = hellinger_dist
    df['patient_name'] = patients
    df['modality'] = modalities
    df['bins_choice'] = bins_list
    dfs.append(df)
df = pd.concat(dfs)
df.to_csv('~/hist_stats_lgg.csv')

38it [00:00, 424.92it/s]
38it [00:00, 426.61it/s]
38it [00:00, 433.35it/s]
38it [00:00, 422.04it/s]


In [87]:
root = Path('/anvar/public_datasets/preproc_study/histograms/gbm')
dfs = []
for experiment in root.glob('*'):
    df = pd.DataFrame()
    exps = []
    kl_distances = []
    js_distances = []
    patients = []
    modalities = []
    bins_list = []
    l2_dist = []
    hellinger_dist = []
    for patient in tqdm(experiment.glob('*')):
        for bins in ['100', 'freedman_diaconis', 'sturges']:
            for modality in ['CT1', 'FLAIR', 'T1', 'T2']:
                with open(patient / f'{modality}_bins_{bins}.pkl', 'rb') as f:
                    histograms = pickle.load(f)
                    healthy_h = histograms['healthy_h']
                    tumour_h = histograms['tumour_h']
                    d, _, _ = compute_kl(healthy_h, tumour_h)
                    d2, _, _ = compute_kl(tumour_h, healthy_h)
                    l2 = compute_l2(healthy_h, tumour_h)
                    hd = compute_hellinger(healthy_h, tumour_h) 
                    
                    exps.append(experiment.name)
                    kl_distances.append(d)
                    js_distances.append(np.sqrt((d+d2)/2))
                    patients.append(patient.name)
                    modalities.append(modality)
                    bins_list.append(bins)
                    l2_dist.append(l2)
                    hellinger_dist.append(hd)
    df['exp_name'] = exps
    df['KL'] = kl_distances
    df['JS'] = js_distances
    df['L2'] = l2_dist
    df['Hellinger'] = hellinger_dist
    df['patient_name'] = patients
    df['modality'] = modalities
    df['bins_choice'] = bins_list
    dfs.append(df)
df = pd.concat(dfs)
df.to_csv('~/hist_stats_gbm.csv')

102it [00:00, 421.18it/s]
102it [00:00, 418.01it/s]
102it [00:00, 436.48it/s]
102it [00:00, 416.30it/s]


In [88]:
df_gbm = pd.read_csv('~/hist_stats_gbm.csv', index_col=0)
df_bgpd = pd.read_csv('~/hist_stats_bgpd.csv', index_col=0)
df_lgg = pd.read_csv('~/hist_stats_lgg.csv', index_col=0)

## KL

In [89]:
# df_gbm.query('bins_choice=="100"').groupby(['exp_name', 'modality'])['KL'].agg(['mean', 'median', 'std']).round(2)

In [90]:
df_gbm.query('bins_choice=="100"').groupby(['exp_name'])['KL'].agg(['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,3.28,2.22,2.94
4b_n4,2.25,1.61,2.04
4d_susan,3.5,2.3,3.19
6_hist_fold_0,3.21,2.05,2.99


In [91]:
df_bgpd.query('bins_choice=="100"').groupby(['exp_name'])['KL'].agg(['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,1.2,0.81,1.11
4b_n4,0.87,0.66,0.73
4d_susan,1.26,0.85,1.16
6_hist_fold_0,1.2,0.8,1.13


In [92]:
df_lgg.query('bins_choice=="100"').groupby(['exp_name'])['KL'].agg(['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,2.52,1.28,2.91
4b_n4,1.72,0.96,2.14
4d_susan,2.62,1.32,3.04
6_hist_fold_0,2.53,1.3,2.94


## JS

In [93]:
df_gbm.query('bins_choice=="100"').groupby(['exp_name'])['JS'].agg(['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,1.48,1.31,0.66
4b_n4,1.26,1.13,0.54
4d_susan,1.51,1.32,0.68
6_hist_fold_0,1.45,1.27,0.67


In [94]:
df_bgpd.query('bins_choice=="100"').groupby(['exp_name'])['JS'].agg(['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,1.05,0.93,0.51
4b_n4,0.92,0.84,0.43
4d_susan,1.07,0.95,0.51
6_hist_fold_0,1.05,0.93,0.51


In [95]:
df_lgg.query('bins_choice=="100"').groupby(['exp_name'])['JS'].agg(['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,1.27,1.08,0.67
4b_n4,1.06,0.91,0.57
4d_susan,1.29,1.11,0.68
6_hist_fold_0,1.26,1.09,0.67


# L2

In [96]:
df_gbm.query('bins_choice=="100"').groupby(['exp_name'])['L2'].agg(['mean', 'median', 'std']).round(3)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,0.184,0.183,0.067
4b_n4,0.167,0.159,0.065
4d_susan,0.187,0.185,0.068
6_hist_fold_0,0.178,0.18,0.069


In [97]:
df_bgpd.query('bins_choice=="100"').groupby(['exp_name'])['L2'].agg(['mean', 'median', 'std']).round(3)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,0.137,0.129,0.065
4b_n4,0.123,0.114,0.058
4d_susan,0.14,0.131,0.066
6_hist_fold_0,0.134,0.124,0.063


In [98]:
df_lgg.query('bins_choice=="100"').groupby(['exp_name'])['L2'].agg(['mean', 'median', 'std']).round(3)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,0.178,0.181,0.08
4b_n4,0.157,0.147,0.08
4d_susan,0.18,0.185,0.08
6_hist_fold_0,0.174,0.173,0.08


## Hellinger

In [99]:
df_gbm.query('bins_choice=="100"').groupby(['exp_name'])['Hellinger'].agg(['mean', 'median', 'std']).round(3)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,0.502,0.454,0.2
4b_n4,0.437,0.394,0.175
4d_susan,0.509,0.461,0.201
6_hist_fold_0,0.495,0.448,0.203


In [100]:
df_bgpd.query('bins_choice=="100"').groupby(['exp_name'])['Hellinger'].agg(['mean', 'median', 'std']).round(3)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,0.381,0.35,0.163
4b_n4,0.339,0.319,0.14
4d_susan,0.386,0.353,0.164
6_hist_fold_0,0.38,0.35,0.163


In [101]:
df_lgg.query('bins_choice=="100"').groupby(['exp_name'])['Hellinger'].agg(['mean', 'median', 'std']).round(3)

Unnamed: 0_level_0,mean,median,std
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4a_resamp,0.444,0.409,0.201
4b_n4,0.377,0.337,0.181
4d_susan,0.45,0.411,0.203
6_hist_fold_0,0.444,0.409,0.201


In [81]:
0.03*np.log2(0.03/10e-6)

0.34652240356149727