# Compute 95% CI (Confidence Intervals)
#### 1. Classification examples

In [1]:
# Explicit imports: previously `pd` and `np` were only available because
# `from utils import *` happened to re-export them.
import numpy as np
import pandas as pd
import scipy.stats as st

from utils import sampling_dataset

# Report each classification metric as "mean [95% CI: lower-upper]".
# Reads the 'gt' and 'predict' columns of method1.csv.
df = pd.read_csv('method1.csv')

# `sampling_dataset` is defined in utils (not visible here); presumably it
# returns a mapping of metric name -> sequence of per-resample scores.
# TODO confirm against utils.
metrics = sampling_dataset(df['gt'], df['predict'])

# NOTE(review): the interval below is a t-interval around the mean of the
# sampled scores using their standard error, so it narrows as the number of
# samples grows. If the samples are bootstrap replicates, a percentile
# interval may be what is intended -- confirm.
for metric_name in metrics.keys():
    scores = metrics[metric_name]
    mean_score = np.mean(scores)
    # The confidence level is passed positionally: the `alpha=` keyword was
    # removed from SciPy (renamed to `confidence`), while the positional form
    # works on both old and new releases.
    lower, upper = st.t.interval(0.95,
                                 df=len(scores) - 1,
                                 loc=mean_score,
                                 scale=st.sem(scores))
    # Values are multiplied by 100 so the report reads as percentages.
    print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                       mean_score * 100,
                                                       lower * 100,
                                                       upper * 100,
                                                       ))

100%|██████████| 100/100 [00:00<00:00, 333.52it/s]

roc_auc_score = 78.3 [95% CI: 76.7-79.9]
precision_score = 90.8 [95% CI: 88.9-92.8]
f1_score = 73.0 [95% CI: 70.7-75.2]
sensitivity_score = 62.2 [95% CI: 59.4-65.0]
specificity_score = 94.4 [95% CI: 93.2-95.5]





#### 2. Segmentation examples (e.g., Dice)

In [2]:
# Explicit imports: previously `pd` and `np` were only available because
# `from utils import *` happened to re-export them.
import numpy as np
import pandas as pd
import scipy.stats as st

from utils import sampling_metrics

# Report the Dice score as "mean [95% CI: lower-upper]".
# Reads the 'dice' column of dice1.csv.
df = pd.read_csv('dice1.csv')

metric_name = 'dice'
# `sampling_metrics` is defined in utils (not visible here); presumably it
# returns a sequence of per-resample scores. TODO confirm against utils.
metrics = sampling_metrics(df[metric_name])

# NOTE(review): this is a t-interval around the mean of the sampled scores
# using their standard error, so it narrows as the number of samples grows.
# If the samples are bootstrap replicates, a percentile interval may be what
# is intended -- confirm.
mean_score = np.mean(metrics)
# The confidence level is passed positionally: the `alpha=` keyword was
# removed from SciPy (renamed to `confidence`), while the positional form
# works on both old and new releases.
lower, upper = st.t.interval(0.95,
                             df=len(metrics) - 1,
                             loc=mean_score,
                             scale=st.sem(metrics))
# Values are multiplied by 100 so the report reads as percentages.
print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                   mean_score * 100,
                                                   lower * 100,
                                                   upper * 100,
                                                   ))

100%|██████████| 100/100 [00:00<00:00, 2969.48it/s]

dice = 80.4 [95% CI: 80.2-80.7]





# Perform t-test between two methods
#### 1. Classification examples

In [3]:
# Explicit imports: previously `pd` was only available because
# `from utils import *` happened to re-export it.
import pandas as pd
from scipy import stats

from utils import sampling_dataset

# Compare two classification methods metric by metric with an independent
# two-sample t-test. Reads the 'gt' and 'predict' columns of both CSV files.
df1 = pd.read_csv('method1.csv')
df2 = pd.read_csv('method2.csv')
# Both prediction files must contain the same number of rows.
assert len(df1) == len(df2), (
    'method1.csv and method2.csv differ in length: {} vs {}'.format(len(df1), len(df2))
)

# `sampling_dataset` is defined in utils (not visible here); presumably it
# returns a mapping of metric name -> sequence of per-resample scores.
# TODO confirm against utils.
metrics1 = sampling_dataset(df1['gt'], df1['predict'])
metrics2 = sampling_dataset(df2['gt'], df2['predict'])

# NOTE(review): the test is run on the sampled scores, so the p-value depends
# on how many samples the helper draws -- confirm this is the intended
# comparison.
for metric_name in metrics1.keys():
    # Only the p-value is reported; the t statistic is discarded.
    _, p_value = stats.ttest_ind(metrics1[metric_name], metrics2[metric_name])
    print('{}: p-value = {}'.format(metric_name, p_value))

100%|██████████| 100/100 [00:00<00:00, 334.55it/s]
100%|██████████| 100/100 [00:00<00:00, 339.11it/s]

roc_auc_score: p-value = 6.567026023563492e-44
precision_score: p-value = 0.2840818117666316
f1_score: p-value = 1.4447008471523314e-40
sensitivity_score: p-value = 1.578719095647867e-51
specificity_score: p-value = 0.26311045790490867





#### 2. Segmentation examples (e.g., Dice)

In [4]:
# Explicit imports: previously `pd` was only available because
# `from utils import *` happened to re-export it.
import pandas as pd
from scipy import stats

from utils import sampling_metrics

# Compare the Dice scores of two methods with an independent two-sample
# t-test. Reads the 'dice' column of dice1.csv and dice2.csv.
df1 = pd.read_csv('dice1.csv')
df2 = pd.read_csv('dice2.csv')

metric_name = 'dice'
# `sampling_metrics` is defined in utils (not visible here); presumably it
# returns a sequence of per-resample scores. TODO confirm against utils.
metrics1 = sampling_metrics(df1[metric_name])
metrics2 = sampling_metrics(df2[metric_name])

# NOTE(review): the test is run on the sampled scores, so the p-value depends
# on how many samples the helper draws -- confirm this is the intended
# comparison.
# Only the p-value is reported; the t statistic is discarded.
_, p_value = stats.ttest_ind(metrics1, metrics2)
print('{}: p-value = {}'.format(metric_name, p_value))

100%|██████████| 100/100 [00:00<00:00, 2789.66it/s]
100%|██████████| 100/100 [00:00<00:00, 3003.20it/s]

dice: p-value = 1.0054090814406298e-25



