# Compute 95% CI (Confidence Intervals)
#### 1. Classification examples

In [1]:
import scipy.stats as st
from utils import *

# Load the ground-truth labels ('gt') and predictions ('predict') of one method.
df = pd.read_csv('results/method1.csv')

# `sampling_dataset` is provided by utils. It is used below as a mapping from
# metric name to a sequence of scores -- presumably one score per resample of
# the dataset; confirm against utils.
metrics = sampling_dataset(df['gt'], df['predict'])

for metric_name, scores in metrics.items():
    mean_score = np.mean(scores)
    # The confidence level is passed positionally on purpose: the keyword was
    # called `alpha` in older SciPy, was renamed to `confidence` in SciPy 1.9,
    # and `alpha` was removed in 1.11. The positional form works on all versions.
    lower, upper = st.t.interval(0.95,
                                 df=len(scores) - 1,
                                 loc=mean_score,
                                 scale=st.sem(scores))
    # Scores are multiplied by 100 so they print as percentages.
    print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                       mean_score * 100,
                                                       lower * 100,
                                                       upper * 100,
                                                      ))

100%|██████████| 100/100 [00:00<00:00, 264.10it/s]

roc_auc_score = 78.6 [95% CI: 77.0-80.2]
precision_score = 89.5 [95% CI: 87.5-91.4]
f1_score = 73.5 [95% CI: 71.3-75.7]
jaccard_score = 59.3 [95% CI: 56.5-62.0]
sensitivity_score = 63.4 [95% CI: 60.7-66.2]
specificity_score = 93.8 [95% CI: 92.6-94.9]





#### 2. Segmentation examples (e.g., Dice)

In [2]:
import scipy.stats as st
from utils import *

# Load the results of one method; the 'dice' column is read below.
df = pd.read_csv('results/dice1.csv')

metric_name = 'dice'
scores = df[metric_name]
mean_score = np.mean(scores)

# The confidence level is passed positionally on purpose: the keyword was
# called `alpha` in older SciPy, was renamed to `confidence` in SciPy 1.9,
# and `alpha` was removed in 1.11. The positional form works on all versions.
lower, upper = st.t.interval(0.95,
                             df=len(scores) - 1,
                             loc=mean_score,
                             scale=st.sem(scores))
# Scores are multiplied by 100 so they print as percentages.
print('{} = {:.1f} [95% CI: {:.1f}-{:.1f}]'.format(metric_name,
                                                   mean_score * 100,
                                                   lower * 100,
                                                   upper * 100,
                                                  ))

dice = 68.2 [95% CI: 66.1-70.4]


# Perform a t-test between two methods
#### 1. Classification examples

In [22]:
from scipy import stats
from utils import *

# Load the results of the two methods being compared.
result_files = ('results/method1.csv', 'results/method2.csv')
df1, df2 = [pd.read_csv(path) for path in result_files]
# Both result files must contain the same number of rows.
assert len(df1) == len(df2)

# `sampling_dataset` is provided by utils; it is used here as a mapping from
# metric name to a sequence of scores for each method.
metrics1, metrics2 = [sampling_dataset(frame['gt'], frame['predict'])
                      for frame in (df1, df2)]

# NOTE(review): `ttest_ind` treats the two score sequences as independent
# samples. Whether that holds depends on how `sampling_dataset` draws its
# samples -- confirm in utils.
line_template = '{}: p-value = {}'
for metric_name, scores1 in metrics1.items():
    test_result = stats.ttest_ind(scores1, metrics2[metric_name])
    p = test_result.pvalue
    print(line_template.format(metric_name, p))

100%|██████████| 100/100 [00:00<00:00, 249.85it/s]
100%|██████████| 100/100 [00:00<00:00, 265.92it/s]

roc_auc_score: p-value = 1.823849757561316e-41
precision_score: p-value = 0.015439797565549524
f1_score: p-value = 9.47542289998599e-37
jaccard_score: p-value = 1.3485343834260122e-43
sensitivity_score: p-value = 8.01543071986062e-49
specificity_score: p-value = 0.7880714547717828





#### 2. Segmentation examples (e.g., Dice)

In [23]:
from scipy import stats
from utils import *

metric_name = 'dice'

# Load the results of the two methods being compared.
df1 = pd.read_csv('results/dice1.csv')
df2 = pd.read_csv('results/dice2.csv')

# NOTE(review): `ttest_ind` is the independent two-sample t-test. If the rows
# of the two files correspond to the same cases in the same order, a paired
# test may be more appropriate -- confirm how the CSVs were produced.
scores1, scores2 = df1[metric_name], df2[metric_name]
test_result = stats.ttest_ind(scores1, scores2)
p = test_result.pvalue
print('{}: p-value = {}'.format(metric_name, p))

dice: p-value = 0.12907317702410417
