# Compute 95% CI (Confidence Intervals)
#### 1. Classification examples

In [1]:
import scipy.stats as st
from utils import *

# Load the ground-truth labels ('gt') and predictions ('predict') of one method.
df = pd.read_csv('results/method1.csv')

# sampling_dataset is provided by utils; it is consumed below as a mapping
# from metric name to a sequence of scores.
# NOTE(review): presumably one score per resampling round -- confirm in utils.
metrics = sampling_dataset(df['gt'], df['predict'])

for metric_name, scores in metrics.items():
    mean_score = np.mean(scores)
    # The confidence level is passed positionally on purpose: SciPy 1.9
    # renamed the `alpha` keyword to `confidence` and SciPy 1.11 removed
    # `alpha`, so the positional form is the only spelling that works on
    # both old and new SciPy releases.
    lower, upper = st.t.interval(0.95,
                                 df=len(scores) - 1,
                                 loc=mean_score,
                                 scale=st.sem(scores))
    # Values are scaled by 100 so they are reported as percentages.
    print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                       mean_score * 100,
                                                       lower * 100,
                                                       upper * 100,
                                                       ))

100%|██████████| 100/100 [00:00<00:00, 322.89it/s]

roc_auc_score = 76.8 [95% CI: 75.3-78.3]
precision_score = 89.7 [95% CI: 87.4-91.9]
f1_score = 70.5 [95% CI: 68.1-72.9]
sensitivity_score = 59.3 [95% CI: 56.5-62.0]
specificity_score = 94.4 [95% CI: 93.3-95.5]





#### 2. Segmentation examples (e.g., Dice)

In [7]:
import scipy.stats as st
from utils import *

# Load the segmentation results of one method.
df = pd.read_csv('results/dice1.csv')

# Name of the CSV column holding the scores to summarise.
# NOTE(review): presumably one Dice score per test case -- confirm against the CSV.
metric_name = 'dice'

scores = df[metric_name]
mean_score = np.mean(scores)

# The confidence level is passed positionally on purpose: SciPy 1.9 renamed
# the `alpha` keyword to `confidence` and SciPy 1.11 removed `alpha`, so the
# positional form is the only spelling that works on both old and new SciPy.
lower, upper = st.t.interval(0.95,
                             df=len(scores) - 1,
                             loc=mean_score,
                             scale=st.sem(scores))
# Values are scaled by 100 so they are reported as percentages.
print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                   mean_score * 100,
                                                   lower * 100,
                                                   upper * 100,
                                                   ))

dice = 68.2 [95% CI: 66.1-70.4]


# Perform a t-test between two methods
#### 1. Classification examples

In [3]:
from scipy import stats
from utils import *

# Read the results of the two methods being compared; both files must
# contain the same number of rows.
df1 = pd.read_csv('results/method1.csv')
df2 = pd.read_csv('results/method2.csv')
assert df1.shape[0] == df2.shape[0]

# sampling_dataset (from utils) is used here as a mapping from metric name to
# a sequence of scores; method 1 is processed before method 2.
metrics1 = sampling_dataset(df1['gt'], df1['predict'])
metrics2 = sampling_dataset(df2['gt'], df2['predict'])

# Independent two-sample t-test per metric, using the metric names of method 1.
for metric_name in metrics1:
    scores1 = metrics1[metric_name]
    scores2 = metrics2[metric_name]
    p = stats.ttest_ind(scores1, scores2).pvalue
    report = '{}: p-value = {}'.format(metric_name, p)
    print(report)

100%|██████████| 100/100 [00:00<00:00, 331.61it/s]
100%|██████████| 100/100 [00:00<00:00, 334.77it/s]

roc_auc_score: p-value = 1.6314290780040034e-46
precision_score: p-value = 0.002488510548498789
f1_score: p-value = 7.608229090120039e-44
sensitivity_score: p-value = 1.74825890056941e-57
specificity_score: p-value = 0.8388475014623618





#### 2. Segmentation examples (e.g., Dice)

In [8]:
from scipy import stats
from utils import *

# Read the segmentation results of the two methods being compared.
df1 = pd.read_csv('results/dice1.csv')
df2 = pd.read_csv('results/dice2.csv')

# Column that holds the scores to compare.
metric_name = 'dice'

# Independent two-sample t-test on that column of both result files.
scores1 = df1[metric_name]
scores2 = df2[metric_name]
p = stats.ttest_ind(scores1, scores2).pvalue
report = '{}: p-value = {}'.format(metric_name, p)
print(report)

dice: p-value = 0.12907317702410417
