# Compute 95% CI (Confidence Intervals)
#### 1. Classification examples

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
from utils import *

# Table with a ground-truth column ('gt') and a prediction column ('predict').
df = pd.read_csv('results/method1.csv')

# sampling_dataset (from utils) returns a mapping: metric name -> sequence of
# metric values.
# NOTE(review): presumably each value comes from one resampling round of the
# dataset; if so, this t-interval describes the mean over rounds and narrows as
# the number of rounds grows -- confirm against utils that this is intended.
metrics = sampling_dataset(df['gt'], df['predict'])

for metric_name in list(metrics.keys()):
    samples = metrics[metric_name]
    metric_mean = np.mean(samples)  # computed once, reused for `loc` and the report
    # The confidence level is passed positionally: the keyword was renamed from
    # `alpha` to `confidence` (SciPy 1.9) and `alpha=` was later removed, so the
    # positional form is the only spelling that works on old and new SciPy.
    lower, upper = st.t.interval(0.95, df=len(samples)-1,
                                 loc=metric_mean,
                                 scale=st.sem(samples))
    # Values are scaled by 100 so they are reported as percentages.
    print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                       metric_mean*100,
                                                       lower*100,
                                                       upper*100,
                                                      ))

100%|██████████| 100/100 [00:00<00:00, 322.69it/s]

roc_auc_score = 79.2 [95% CI: 77.6-80.8]
precision_score = 90.9 [95% CI: 88.8-93.1]
f1_score = 74.0 [95% CI: 71.5-76.4]
sensitivity_score = 63.6 [95% CI: 60.7-66.5]
specificity_score = 94.8 [95% CI: 93.7-96.0]





#### 2. Segmentation examples (e.g., Dice)

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as st
from utils import *

# Table whose 'dice' column holds the scores to summarise.
df = pd.read_csv('results/dice1.csv')

metric_name = 'dice'
# sampling_metrics (from utils) returns a sized sequence of metric values.
# NOTE(review): presumably one value per resampling round; if so, this
# t-interval describes the mean over rounds and narrows as the number of
# rounds grows -- confirm against utils that this is intended.
metrics = sampling_metrics(df[metric_name])

metric_mean = np.mean(metrics)  # computed once, reused for `loc` and the report
# The confidence level is passed positionally: the keyword was renamed from
# `alpha` to `confidence` (SciPy 1.9) and `alpha=` was later removed, so the
# positional form is the only spelling that works on old and new SciPy.
lower, upper = st.t.interval(0.95, df=len(metrics)-1,
                             loc=metric_mean,
                             scale=st.sem(metrics))
# Values are scaled by 100 so they are reported as percentages.
print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                   metric_mean*100,
                                                   lower*100,
                                                   upper*100,
                                                  ))

100%|██████████| 100/100 [00:00<00:00, 3005.08it/s]

dice = 80.6 [95% CI: 80.3-80.8]





# Perform t-test between two methods
#### 1. Classification examples

In [3]:
from scipy import stats
from utils import *

# Load the result tables of the two methods being compared (method 1 first).
df1, df2 = (pd.read_csv(csv_path)
            for csv_path in ('results/method1.csv', 'results/method2.csv'))
# Both tables must contain the same number of rows.
assert len(df1) == len(df2)

# Metric-name -> metric-values mapping for each method, built from the
# ground-truth ('gt') and prediction ('predict') columns.
metrics1, metrics2 = (sampling_dataset(frame['gt'], frame['predict'])
                      for frame in (df1, df2))

# Independent two-sample t-test per metric; only the p-value is reported.
for metric_name in list(metrics1.keys()):
    p = stats.ttest_ind(metrics1[metric_name], metrics2[metric_name]).pvalue
    print('{}: p-value = {}'.format(metric_name, p))

100%|██████████| 100/100 [00:00<00:00, 327.99it/s]
100%|██████████| 100/100 [00:00<00:00, 332.76it/s]

roc_auc_score: p-value = 3.666751085859525e-46
precision_score: p-value = 0.009914021800709827
f1_score: p-value = 1.7812730328180964e-43
sensitivity_score: p-value = 1.124461272966562e-56
specificity_score: p-value = 0.9991139085712636





#### 2. Segmentation examples (e.g., Dice)

In [4]:
from scipy import stats
from utils import *

# Load the score tables of the two methods being compared (method 1 first).
df1, df2 = (pd.read_csv(csv_path)
            for csv_path in ('results/dice1.csv', 'results/dice2.csv'))

metric_name = 'dice'
# Metric values derived from each table's 'dice' column via utils.sampling_metrics.
metrics1, metrics2 = (sampling_metrics(frame[metric_name])
                      for frame in (df1, df2))

# Independent two-sample t-test between the two sets of values; report the p-value.
p = stats.ttest_ind(metrics1, metrics2).pvalue
print('{}: p-value = {}'.format(metric_name, p))

100%|██████████| 100/100 [00:00<00:00, 3015.68it/s]
100%|██████████| 100/100 [00:00<00:00, 2968.07it/s]

dice: p-value = 1.2958748749359716e-22



