# Compute 95% CI (Confidence Intervals)
#### 1. Classification examples

In [1]:
import scipy.stats as st
from utils import *

# Ground-truth labels ('gt') and predictions ('predict') for the evaluated method.
df = pd.read_csv('results/method1.csv')

# sampling_dataset is provided by utils. It is used below as a mapping from
# metric name to a sequence of scores (presumably one score per resampled
# dataset -- TODO confirm against utils).
metrics = sampling_dataset(df['gt'], df['predict'])

for metric_name in metrics.keys():
    scores = metrics[metric_name]
    mean_score = np.mean(scores)  # computed once; reused as CI centre and in the report
    # Student-t interval around the mean with n-1 degrees of freedom.
    # The confidence level is passed positionally: SciPy 1.9 renamed the
    # keyword from `alpha` to `confidence` and later releases reject `alpha=`,
    # whereas the positional form works on both old and new versions.
    lower, upper = st.t.interval(0.95,
                                 df=len(scores)-1,
                                 loc=mean_score,
                                 scale=st.sem(scores))
    # Values are multiplied by 100 so they are printed as percentages.
    print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                       mean_score*100,
                                                       lower*100,
                                                       upper*100,
                                                      ))

100%|██████████| 100/100 [00:00<00:00, 207.35it/s]

roc_auc_score = 78.3 [95% CI: 76.8-79.9]
precision_score = 90.5 [95% CI: 88.6-92.5]
f1_score = 73.2 [95% CI: 71.0-75.3]
sensitivity_score = 62.4 [95% CI: 59.7-65.0]
specificity_score = 94.3 [95% CI: 93.2-95.4]





#### 2. Segmentation examples (e.g., Dice)

In [2]:
import scipy.stats as st
from utils import *

# One row per evaluated case; the 'dice' column holds the score for that case.
df = pd.read_csv('results/dice1.csv')

metric_name = 'dice'

scores = df[metric_name]
mean_score = np.mean(scores)  # computed once; reused as CI centre and in the report

# Student-t interval around the mean with n-1 degrees of freedom.
# The confidence level is passed positionally: SciPy 1.9 renamed the keyword
# from `alpha` to `confidence` and later releases reject `alpha=`, whereas the
# positional form works on both old and new versions.
lower, upper = st.t.interval(0.95,
                             df=len(scores)-1,
                             loc=mean_score,
                             scale=st.sem(scores))
# Values are multiplied by 100 so they are printed as percentages.
print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                   mean_score*100,
                                                   lower*100,
                                                   upper*100,
                                                  ))

dice = 80.5 [95% CI: 77.8-83.1]


# Perform t-test between two methods
#### 1. Classification examples

In [3]:
from scipy import stats
from utils import *

# Predictions of the two methods being compared; both files must have the
# same number of rows.
df1 = pd.read_csv('results/method1.csv')
df2 = pd.read_csv('results/method2.csv')
assert len(df1) == len(df2)

# sampling_dataset (from utils) is used as a mapping: metric name -> scores.
# Method 1 is sampled before method 2, as in the original ordering.
metrics1 = sampling_dataset(df1['gt'], df1['predict'])
metrics2 = sampling_dataset(df2['gt'], df2['predict'])

# NOTE(review): if these scores are resampled replicates, the p-value also
# depends on how many replicates sampling_dataset draws -- confirm in utils.
report = '{}: p-value = {}'
for metric_name in list(metrics1.keys()):
    scores_a = metrics1[metric_name]
    scores_b = metrics2[metric_name]
    # Independent two-sample t-test; only the p-value is reported.
    p = stats.ttest_ind(scores_a, scores_b).pvalue
    print(report.format(metric_name, p))

100%|██████████| 100/100 [00:00<00:00, 226.07it/s]
100%|██████████| 100/100 [00:00<00:00, 218.29it/s]

roc_auc_score: p-value = 3.2980039782030604e-52
precision_score: p-value = 0.12109742497671155
f1_score: p-value = 3.1316104740099924e-47
sensitivity_score: p-value = 7.157016138570799e-63
specificity_score: p-value = 0.27745889593818807





#### 2. Segmentation examples (e.g., Dice)

In [4]:
from scipy import stats
from utils import *

metric_name = 'dice'

# Score tables for the two methods being compared; each must provide a
# 'dice' column.
df1 = pd.read_csv('results/dice1.csv')
df2 = pd.read_csv('results/dice2.csv')

# Independent two-sample t-test on the two score columns; only the p-value
# is reported.
# NOTE(review): if both files score the same cases in the same order, a paired
# test may be more appropriate -- confirm with the data owner.
p = stats.ttest_ind(df1[metric_name], df2[metric_name]).pvalue
report = '{}: p-value = {}'
print(report.format(metric_name, p))

dice: p-value = 0.2248721032236578
