In [1]:
import numpy as np 
import pandas as pd 
from glob import glob
from scipy import stats
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import ttest_ind


In [2]:
inceptionResNetV2_cm=np.array([[36,16],[55,127]])
efficientNetB2_cm=np.array([[43,9],[43,139]])
efficientNetB3_cm=np.array([[37,15],[37,145]])
y_test=np.ones([52])
y_test=np.concatenate([y_test,np.zeros([182])])
inceptionResNetV2_pred=np.zeros([len(y_test)])
inceptionResNetV2_pred[:inceptionResNetV2_cm[0,0]]=1
inceptionResNetV2_pred[52:52+inceptionResNetV2_cm[1,0]]=1
efficientNetB2_pred=np.zeros([len(y_test)])
efficientNetB2_pred[:efficientNetB2_cm[0,0]]=1
efficientNetB2_pred[52:52+efficientNetB2_cm[1,0]]=1
efficientNetB3_pred=np.zeros([len(y_test)])
efficientNetB3_pred[:efficientNetB3_cm[0,0]]=1
efficientNetB3_pred[52:52+efficientNetB3_cm[1,0]]=1

In [3]:
import pandas as pd
from scipy.stats import t, ttest_ind, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 가상의 예제 데이터 생성
data_model1 = inceptionResNetV2_pred
data_model2 = efficientNetB2_pred
data_model3 = efficientNetB3_pred

# 이진분류 모델 3개의 결과를 각각 저장
results_model1 = data_model1 > 0
results_model2 = data_model2 > 0
results_model3 = data_model3 > 0

true_labels = y_test

# 성능 지표 계산 함수
def calculate_performance_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = f1_score(y_true, y_pred)
    return accuracy, sensitivity, specificity, f1

alpha_values = [0.2, 0.15, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001]

# 결과를 저장할 DataFrame 생성
result_df = pd.DataFrame(columns=['Model', 'Metric', 'Value','Alpha', 'Confidence Interval', 'p-value', 'ANOVA with bonferroni collection p-value'])

# 각 모델에 대한 신뢰구간, p-value, ANOVA p-value 계산 및 DataFrame에 저장
for model, results in zip(['Model 1', 'Model 2', 'Model 3'], [results_model1, results_model2, results_model3]):
    accuracy, sensitivity, specificity, f1 = calculate_performance_metrics(true_labels, results)
    
    # Calculate p-value using t-test
    t_statistic, p_value_ttest = ttest_ind(results, true_labels)
    
    # Perform ANOVA
    _, p_value_anova = f_oneway(results_model1, results_model2, results_model3)
    
    # Apply Bonferroni correction to ANOVA p-value
    _, p_value_anova_corrected, _, _ = multipletests([p_value_anova], method='bonferroni')
    
    for metric_name, metric_value in zip(['Accuracy', 'Sensitivity', 'Specificity', 'F1 Score'], [accuracy, sensitivity, specificity, f1]):
        for alpha in alpha_values:
            # t.interval 함수를 사용하여 신뢰구간 계산
            df = len(results) - 1
            confidence_interval = t.interval(1 - alpha, df, loc=metric_value, scale=np.std(results) / np.sqrt(len(results)))
            
            # 결과를 DataFrame에 추가
            result_df = pd.concat([result_df, pd.DataFrame({
                'Model': [model],
                'Metric': [metric_name],
                'Value': [metric_value],
                'Alpha':[alpha],
                'Confidence Interval': [confidence_interval],
                'p-value': [p_value_ttest],
                'ANOVA with bonferroni collection p-value': [p_value_anova_corrected]
            })])

# 결과 DataFrame 출력
print(result_df)
# 결과 DataFrame을 CSV 파일로 저장 (필요에 따라 주석 처리)
result_df.to_csv('model_comparison_results.csv', index=False)



      Model    Metric     Value  Alpha  \
0   Model 1  Accuracy  0.696581  0.200   
0   Model 1  Accuracy  0.696581  0.150   
0   Model 1  Accuracy  0.696581  0.100   
0   Model 1  Accuracy  0.696581  0.050   
0   Model 1  Accuracy  0.696581  0.025   
..      ...       ...       ...    ...   
0   Model 3  F1 Score  0.587302  0.050   
0   Model 3  F1 Score  0.587302  0.025   
0   Model 3  F1 Score  0.587302  0.010   
0   Model 3  F1 Score  0.587302  0.005   
0   Model 3  F1 Score  0.587302  0.001   

                         Confidence Interval   p-value  \
0   (0.6556236315641998, 0.7375387615981933)  0.000083   
0   (0.6505534040262204, 0.7426089891361727)  0.000083   
0   (0.6439525289011624, 0.7492098642612307)  0.000083   
0   (0.6337934763835406, 0.7593689167788525)  0.000083   
0   (0.6246859388785894, 0.7684764542838038)  0.000083   
..                                       ...       ...   
0    (0.527410547862424, 0.6471926267407504)  0.021846   
0   (0.5187231821452809, 0.6558

  result_df = pd.concat([result_df, pd.DataFrame({


In [31]:
import pandas as pd
from scipy.stats import t, ttest_ind, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_array
from scipy.stats import norm
# DeLong method for comparing ROC curves
def delong_roc_test(y_true, y_scores1, y_scores2):
    fpr1, tpr1, _ = roc_curve(y_true, y_scores1)
    fpr2, tpr2, _ = roc_curve(y_true, y_scores2)

    auc1 = auc(fpr1, tpr1)
    auc2 = auc(fpr2, tpr2)
    roc_distances = pairwise_distances(
        y_true.reshape(-1, 1), y_scores1.reshape(-1, 1), metric="manhattan"
    ) - pairwise_distances(y_true.reshape(-1, 1), y_scores2.reshape(-1, 1), metric="manhattan")

    delong_variance = np.var(roc_distances, ddof=2) / 4.0

    delong_se = np.sqrt(delong_variance)

    z_score = (auc1 - auc2) / delong_se

    p_value = 2 * (1.0 - norm.cdf(np.abs(z_score)))

    return z_score, p_value

# 가상의 예제 데이터 생성
data_model1 = inceptionResNetV2_pred
data_model2 = efficientNetB2_pred
data_model3 = efficientNetB3_pred

# 이진분류 모델 3개의 결과를 각각 저장
results_model1 = data_model1 > 0
results_model2 = data_model2 > 0
results_model3 = data_model3 > 0

true_labels = y_test
result_list = []
# 성능 지표 계산 함수
def calculate_performance_metrics(y_true, y_pred, y_scores):
    accuracy = accuracy_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = f1_score(y_true, y_pred)
    auc_value = roc_auc_score(y_true, y_scores)
    return accuracy, sensitivity, specificity, f1, auc_value

alpha_values = [0.2, 0.15, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001]

# 결과를 저장할 DataFrame 생성
result_df = pd.DataFrame(columns=['Model', 'Metric', 'Value', 'Alpha', 'Confidence Interval', 'p-value', 'ANOVA with bonferroni collection p-value', 'DeLong p-value'])

# 각 모델에 대한 신뢰구간, p-value, ANOVA p-value, DeLong p-value 계산 및 리스트에 저장
for model, results, data_model in zip(['Model 1', 'Model 2', 'Model 3'], [results_model1, results_model2, results_model3], [data_model1, data_model2, data_model3]):
    accuracy, sensitivity, specificity, f1, auc_value = calculate_performance_metrics(true_labels, results, data_model)
    
    # Calculate p-value using t-test
    t_statistic, p_value_ttest = ttest_ind(results, true_labels)
    
    # Perform ANOVA
    _, p_value_anova = f_oneway(results_model1, results_model2, results_model3)
    
    # Apply Bonferroni correction to ANOVA p-value
    _, p_value_anova_corrected, _, _ = multipletests([p_value_anova], method='bonferroni')
    
    for metric_name, metric_value in zip(['Accuracy', 'Sensitivity', 'Specificity', 'F1 Score', 'AUC'], [accuracy, sensitivity, specificity, f1, auc_value]):
        for alpha in alpha_values:
            # t.interval 함수를 사용하여 신뢰구간 계산
            df = len(results) - 1
            confidence_interval = t.interval(1 - alpha, df, loc=metric_value, scale=np.std(results) / np.sqrt(len(results)))
            
            # 결과를 리스트에 추가
            result_list.append({
                'Model': model,
                'Metric': metric_name,
                'Value': metric_value,
                'Alpha': alpha,
                'Confidence Interval': confidence_interval,
                'p-value': p_value_ttest,
                'ANOVA with bonferroni collection p-value': p_value_anova_corrected,
                'DeLong p-value': np.nan  # Placeholder for DeLong p-value
            })

# Calculate DeLong p-value for pairwise comparisons
for i in range(len(result_list)):
    for j in range(i + 1, len(result_list)):
        if result_list[i]['Metric'] == result_list[j]['Metric'] and result_list[i]['Alpha'] == result_list[j]['Alpha']:
            metric_name = result_list[i]['Metric']
            alpha = result_list[i]['Alpha']
            model1 = result_list[i]['Model']
            model2 = result_list[j]['Model']
            y_scores1 = data_model1 if model1 == 'Model 1' else (data_model2 if model1 == 'Model 2' else data_model3)
            y_scores2 = data_model1 if model2 == 'Model 1' else (data_model2 if model2 == 'Model 2' else data_model3)
            _, delong_p_value = delong_roc_test(true_labels, y_scores1, y_scores2)
            result_list[i]['DeLong p-value'] = delong_p_value
            result_list[j]['DeLong p-value'] = delong_p_value

# 결과 리스트로부터 DataFrame 생성
result_df = pd.DataFrame(result_list)

# 결과 DataFrame 출력
print(result_df)
# 결과 DataFrame을 CSV 파일로 저장 (필요에 따라 주석 처리)
result_df.to_csv('model_comparison_results.csv', index=False)


       Model    Metric     Value  Alpha  \
0    Model 1  Accuracy  0.696581  0.200   
1    Model 1  Accuracy  0.696581  0.150   
2    Model 1  Accuracy  0.696581  0.100   
3    Model 1  Accuracy  0.696581  0.050   
4    Model 1  Accuracy  0.696581  0.025   
..       ...       ...       ...    ...   
115  Model 3       AUC  0.754121  0.050   
116  Model 3       AUC  0.754121  0.025   
117  Model 3       AUC  0.754121  0.010   
118  Model 3       AUC  0.754121  0.005   
119  Model 3       AUC  0.754121  0.001   

                          Confidence Interval   p-value  \
0    (0.6556236315641998, 0.7375387615981933)  0.000083   
1    (0.6505534040262204, 0.7426089891361727)  0.000083   
2    (0.6439525289011624, 0.7492098642612307)  0.000083   
3    (0.6337934763835406, 0.7593689167788525)  0.000083   
4    (0.6246859388785894, 0.7684764542838038)  0.000083   
..                                        ...       ...   
115   (0.694229839681716, 0.8140119185600424)  0.021846   
116  (0.685

In [27]:
result_df.loc[0, 'Metric']

0    Accuracy
0    Accuracy
0    Accuracy
0    Accuracy
0    Accuracy
       ...   
0         AUC
0         AUC
0         AUC
0         AUC
0         AUC
Name: Metric, Length: 120, dtype: object

In [78]:
import numpy as np
from scipy.stats import t, ttest_ind
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 가상의 예제 데이터 생성
np.random.seed(42)
data_model1 = np.random.normal(loc=0, scale=1, size=100)
data_model2 = np.random.normal(loc=0.5, scale=1, size=100)
data_model3 = np.random.normal(loc=1, scale=1, size=100)

# 이진분류 모델 3개의 결과를 각각 저장
results_model1 = (data_model1 > 0).astype(int)
results_model2 = (data_model2 > 0).astype(int)
results_model3 = (data_model3 > 0).astype(int)

# 실제 라벨 (예시로 모두 1로 설정)
true_labels = np.ones_like(data_model1)

# 성능 지표 계산 함수
def calculate_performance_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, precision, recall, f1

# 각 모델에 대한 t-test 및 p-value 계산 및 출력
for model, results in zip(['Model 1', 'Model 2', 'Model 3'], [results_model1, results_model2, results_model3]):
    accuracy, precision, recall, f1 = calculate_performance_metrics(true_labels, results)
    
    print(f"\nT-test and p-value for {model}:")
    for metric_name, metric_value in zip(['Accuracy', 'Precision', 'Recall', 'F1 Score'], [accuracy, precision, recall, f1]):
        print(f"  {metric_name}: {metric_value:.2%}")
        t_statistic, p_value = ttest_ind(results, true_labels)
        print(f"    t-statistic: {t_statistic}, p-value: {p_value}")



T-test and p-value for Model 1:
  Accuracy: 46.00%
    t-statistic: -10.780417028313323, p-value: 1.2866965296964952e-21
  Precision: 100.00%
    t-statistic: -10.780417028313323, p-value: 1.2866965296964952e-21
  Recall: 46.00%
    t-statistic: -10.780417028313323, p-value: 1.2866965296964952e-21
  F1 Score: 63.01%
    t-statistic: -10.780417028313323, p-value: 1.2866965296964952e-21

T-test and p-value for Model 2:
  Accuracy: 68.00%
    t-statistic: -6.825557507934252, p-value: 1.0449223161892874e-10
  Precision: 100.00%
    t-statistic: -6.825557507934252, p-value: 1.0449223161892874e-10
  Recall: 68.00%
    t-statistic: -6.825557507934252, p-value: 1.0449223161892874e-10
  F1 Score: 80.95%
    t-statistic: -6.825557507934252, p-value: 1.0449223161892874e-10

T-test and p-value for Model 3:
  Accuracy: 85.00%
    t-statistic: -4.179783276115416, p-value: 4.377094840750872e-05
  Precision: 100.00%
    t-statistic: -4.179783276115416, p-value: 4.377094840750872e-05
  Recall: 85.00%


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
