In [3]:
import time
import argparse
import numpy as np
import pandas as pd
from dataset import load_data
from Baseline1.tester import Tester as Tester1
from Baseline2.tester import Tester as Tester2
from SupCon.tester import Tester as Tester3
from utils import *
#from pyDeLong import delong_roc_variance
from compare_auc_delong_xu import *
import torch

# Create a class to simulate command-line arguments
class ArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        super(ArgumentParser, self).__init__(*args, **kwargs)
        self.args = None

    def parse_args(self, args=None, namespace=None):
        if self.args is None:
            return super(ArgumentParser, self).parse_args(args=args, namespace=namespace)
        else:
            return self.parse_known_args(self.args, namespace)
        

def parse_arguments_from_list(args_list):
    parser = ArgumentParser(description='PyTorch Model Training')
    parser.add_argument('--data_dir', type=str, default='../data/', help='Path to the data directory')
    return parser.parse_args(args_list)


from scipy.stats import norm
def delong_test(y_true, y_pred_model1, y_pred_model2):
    """
    Perform DeLong test to compare the AUCs of two models.
    
    Args:
        y_true (array-like): True labels (0 or 1).
        y_pred_model1 (array-like): Predicted probabilities by model 1.
        y_pred_model2 (array-like): Predicted probabilities by model 2.
        
    Returns:
        float: Test statistic (z-score).
        float: Two-sided p-value.
    """
    n = len(y_true)
    y1 = y_pred_model1
    y2 = y_pred_model2
    # Calculate the differences (d) between the predicted probabilities of the two models
    d = y1 - y2
    # Compute the mean difference (c) of the d values
    # The mean difference provides an estimate of the average discrepancy between the models.
    c = np.mean(d)
    # Calculate the variance of the differences (var)
    # The variance measures the spread or variability of the differences between the models. 
    var = np.var(d, ddof=1) # (np.mean(d**2) - np.mean(d)**2) / n
    # Calculate the covariance to quantify the relationship between the predicted probabilities of the two models.
    cov_y1_y2 = np.cov(y1, y2, rowvar=False, ddof=1)[0, 1]
    cov_y1_y_true = np.cov(y1, y_true, rowvar=False, ddof=1)[0, 1]
    cov_y2_y_true = np.cov(y2, y_true, rowvar=False, ddof=1)[0, 1]
    
    cov = cov_y1_y2 - cov_y1_y_true * cov_y2_y_true / np.var(y_true, ddof=1)
    
    # Calculate the test statistic (z)
    # The test statistic measures the number of standard deviations the mean difference is away from zero.
    z = c / np.sqrt(var)
    
    # Compute the p-value using the cumulative distribution function (CDF) of the standard normal distribution
    p_value = 2 * (1 - norm.cdf(np.abs(z)))
    
    return z, p_value


def main(args):
    
    # Load the dataframe from a file and return the loaded dataframe
    [df1_trn, df1_vld, df1_a, df1_b], [df2_trn, df2_vld, df2_a, df2_b], [df3_trn, df3_vld, df3_a, df3_b] = load_data(args)
    
    tester1_D2D3 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D2D3") # Instantiate the tester 
    tester2_D2D3 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D2D3") # Instantiate the tester 
    tester3_D2D3 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D2D3") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D2D3.test(tester1_D2D3.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D2D3.test(tester2_D2D3.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D2D3.test(tester3_D2D3.ldr_tstb)
    
    #for i in range(len(lbllist1)) :
    #    print(lbllist1[i], " ", lbllist2[i], " ", lbllist3[i])
    
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist2.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and Baseline2.')
    
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon.')
    
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon.')
    

    #tester1_D1D3 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D1D3") # Instantiate the tester 
    #tester2_D1D3 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D1D3") # Instantiate the tester 
    #tester3_D1D3 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D1D3") # Instantiate the tester 
    #lbllist1, outlist1, predlist1 = tester1_D1D3.test(tester1_D1D3.ldr_tstb)
    #lbllist2, outlist2, predlist2 = tester2_D1D3.test(tester2_D1D3.ldr_tstb)
    #lbllist3, outlist3, predlist3 = tester3_D1D3.test(tester3_D1D3.ldr_tstb)
    
    #tester1_D1D2 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D1D2") # Instantiate the tester 
    #tester2_D1D2 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D1D2") # Instantiate the tester 
    #tester3_D1D2 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D1D2") # Instantiate the tester 
    #lbllist1, outlist1, predlist1 = tester1_D1D2.test(tester1_D1D2.ldr_tstb)
    #lbllist2, outlist2, predlist2 = tester2_D1D2.test(tester2_D1D2.ldr_tstb)
    #lbllist3, outlist3, predlist3 = tester3_D1D2.test(tester3_D1D2.ldr_tstb)
    
if __name__ == '__main__':
    # Define your desired argument values
    args_list = ['--data_dir', '../data/']
    
    args = parse_arguments_from_list(args_list)
    
    start = time.time()
    main(args)
    end = time.time()
    days, hours, minutes, seconds = getTime(end-start)
    print(f"\n{int(days)} day(s) {int(hours)} hour(s) {int(minutes)} minute(s) {int(seconds)} second(s)")



loading the data ...

Set1 nfiles: 6000 ,  Set2 nfiles: 3140 ,  Set3 nfiles: 3804

done ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

The p-value of 0.5444492461429207 is between Baseline1 and Baseline2.
The p-value of 0.9405679594159173 is between Baseline1 and SupCon.
The p-value of 0.7365170938916059 is between Baseline2 and SupCon.

0 day(s) 0 hour(s) 1 minute(s) 19 second(s)
