In [1]:
import time
import argparse
import numpy as np
import pandas as pd
import scipy.stats
from dataset import load_data
from Baseline1.tester import Tester as Tester1
from Baseline2.tester import Tester as Tester2
from SupCon.tester import Tester as Tester3
from utils import *
#from pyDeLong import delong_roc_variance
#from compare_auc_delong_xu import *
import torch

# Create a class to simulate command-line arguments
class ArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        super(ArgumentParser, self).__init__(*args, **kwargs)
        self.args = None

    def parse_args(self, args=None, namespace=None):
        if self.args is None:
            return super(ArgumentParser, self).parse_args(args=args, namespace=namespace)
        else:
            return self.parse_known_args(self.args, namespace)
        

def parse_arguments_from_list(args_list):
    parser = ArgumentParser(description='PyTorch Model Training')
    parser.add_argument('--data_dir', type=str, default='../data/', help='Path to the data directory')
    return parser.parse_args(args_list)

In [2]:
#https://github.com/yandexdataschool/roc_comparison
# AUC comparison adapted from https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=float)
    # +1 is due to Python using 0-based indexing instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    pvalue = 2 * (1 - scipy.stats.norm.cdf(z, 0, 1))
    return z, np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    return order, label_1_count


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)

In [3]:
def main(args):
    
    # Load the dataframe from a file and return the loaded dataframe
    [df1_trn, df1_vld, df1_a, df1_b], [df2_trn, df2_vld, df2_a, df2_b], [df3_trn, df3_vld, df3_a, df3_b] = load_data(args)
    
    backup = "/backup02"
    
    tester1_D2D3 = Tester1(df1_b, df1_a, args.data_dir, backup, setup = "Baseline1", exp = "D2D3") # Instantiate the tester 
    tester2_D2D3 = Tester2(df1_b, df1_a, args.data_dir, backup, setup = "Baseline2", exp = "D2D3") # Instantiate the tester 
    tester3_D2D3 = Tester3(df1_b, df1_a, args.data_dir, backup, setup = "SupCon", exp = "D2D3") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D2D3.test(tester1_D2D3.ldr_tsta)
    lbllist2, outlist2, predlist2 = tester2_D2D3.test(tester2_D2D3.ldr_tsta)
    lbllist3, outlist3, predlist3 = tester3_D2D3.test(tester3_D2D3.ldr_tsta)
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D1a).')
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D1a).')
    
    lbllist1, outlist1, predlist1 = tester1_D2D3.test(tester1_D2D3.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D2D3.test(tester2_D2D3.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D2D3.test(tester3_D2D3.ldr_tstb)
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D1b).')
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D1b).')
    
    #for i in range(len(lbllist1)) :
    #    print(lbllist1[i], " ", lbllist2[i], " ", lbllist3[i])
    
    tester1_D1D3 = Tester1(df1_b, df1_a, args.data_dir, backup, setup = "Baseline1", exp = "D1D3") # Instantiate the tester 
    tester2_D1D3 = Tester2(df1_b, df1_a, args.data_dir, backup, setup = "Baseline2", exp = "D1D3") # Instantiate the tester 
    tester3_D1D3 = Tester3(df1_b, df1_a, args.data_dir, backup, setup = "SupCon", exp = "D1D3") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D1D3.test(tester1_D1D3.ldr_tsta)
    lbllist2, outlist2, predlist2 = tester2_D1D3.test(tester2_D1D3.ldr_tsta)
    lbllist3, outlist3, predlist3 = tester3_D1D3.test(tester3_D1D3.ldr_tsta)
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D2a).')
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D2a).')
    lbllist1, outlist1, predlist1 = tester1_D1D3.test(tester1_D1D3.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D1D3.test(tester2_D1D3.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D1D3.test(tester3_D1D3.ldr_tstb)
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D2b).')
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D2b).')
    
    
    tester1_D1D2 = Tester1(df1_b, df1_a, args.data_dir, backup, setup = "Baseline1", exp = "D1D2") # Instantiate the tester 
    tester2_D1D2 = Tester2(df1_b, df1_a, args.data_dir, backup, setup = "Baseline2", exp = "D1D2") # Instantiate the tester 
    tester3_D1D2 = Tester3(df1_b, df1_a, args.data_dir, backup, setup = "SupCon", exp = "D1D2") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D1D2.test(tester1_D1D2.ldr_tsta)
    lbllist2, outlist2, predlist2 = tester2_D1D2.test(tester2_D1D2.ldr_tsta)
    lbllist3, outlist3, predlist3 = tester3_D1D2.test(tester3_D1D2.ldr_tsta)
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D3a).')
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D3a).')
    lbllist1, outlist1, predlist1 = tester1_D1D2.test(tester1_D1D2.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D1D2.test(tester2_D1D2.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D1D2.test(tester3_D1D2.ldr_tstb)
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D3b).')
    z, p_value = delong_roc_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D3b).')
    
    
    
if __name__ == '__main__':
    # Define your desired argument values
    args_list = ['--data_dir', '../data/']
    
    args = parse_arguments_from_list(args_list)
    
    start = time.time()
    main(args)
    end = time.time()
    days, hours, minutes, seconds = getTime(end-start)
    print(f"\n{int(days)} day(s) {int(hours)} hour(s) {int(minutes)} minute(s) {int(seconds)} second(s)")



loading the data ...

Set1 nfiles: 6000 ,  Set2 nfiles: 3140 ,  Set3 nfiles: 3804

done ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

The p-value of [[-29.21482022]] is between Baseline1 and SupCon (Test Set = D1a).
The p-value of [[-52.13225215]] is between Baseline2 and SupCon (Test Set = D1a).
The p-value of [[-30.95712695]] is between Baseline1 and SupCon (Test Set = D1b).
The p-value of [[-51.6125446]] is between Baseline2 and SupCon (Test Set = D1b).
Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

The p-value of [[-1.73024239]] is between Baseline1 and SupCon (Test Set = D2a).
The p-value of [[-1.16300532]] is between Baseline2 and SupCon (Test Set = D2a).
The p-value of [[-0.24429802]] is between Baseline1 and SupCon (Test Set = D2b).
The p-value of [[-1.16412486]] is between Baseline2 and SupCon (Test Set = D2b).
Device:  cuda
loading the model .

In [1]:
from scipy.stats import norm
def delong_test(y_true, y_pred_model1, y_pred_model2):
    """
    Perform DeLong test to compare the AUCs of two models.
    
    Args:
        y_true (array-like): True labels (0 or 1).
        y_pred_model1 (array-like): Predicted probabilities by model 1.
        y_pred_model2 (array-like): Predicted probabilities by model 2.
        
    Returns:
        float: Test statistic (z-score).
        float: Two-sided p-value.
    """
    n = len(y_true)
    y1 = y_pred_model1
    y2 = y_pred_model2
    # Calculate the differences (d) between the predicted probabilities of the two models
    d = y1 - y2
    # Compute the mean difference (c) of the d values
    # The mean difference provides an estimate of the average discrepancy between the models.
    c = np.mean(d)
    # Calculate the variance of the differences (var)
    # The variance measures the spread or variability of the differences between the models. 
    var = np.var(d, ddof=1) # (np.mean(d**2) - np.mean(d)**2) / n
    # Calculate the covariance to quantify the relationship between the predicted probabilities of the two models.
    cov_y1_y2 = np.cov(y1, y2, rowvar=False, ddof=1)[0, 1]
    cov_y1_y_true = np.cov(y1, y_true, rowvar=False, ddof=1)[0, 1]
    cov_y2_y_true = np.cov(y2, y_true, rowvar=False, ddof=1)[0, 1]
    
    cov = cov_y1_y2 - cov_y1_y_true * cov_y2_y_true / np.var(y_true, ddof=1)
    
    # Calculate the test statistic (z)
    # The test statistic measures the number of standard deviations the mean difference is away from zero.
    z = c / np.sqrt(var)
    
    # Compute the p-value using the cumulative distribution function (CDF) of the standard normal distribution
    p_value = 2 * (1 - norm.cdf(np.abs(z)))
    
    return z, p_value


def main(args):
    
    # Load the dataframe from a file and return the loaded dataframe
    [df1_trn, df1_vld, df1_a, df1_b], [df2_trn, df2_vld, df2_a, df2_b], [df3_trn, df3_vld, df3_a, df3_b] = load_data(args)
    
    tester1_D2D3 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D2D3") # Instantiate the tester 
    tester2_D2D3 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D2D3") # Instantiate the tester 
    tester3_D2D3 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D2D3") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D2D3.test(tester1_D2D3.ldr_tsta)
    lbllist2, outlist2, predlist2 = tester2_D2D3.test(tester2_D2D3.ldr_tsta)
    lbllist3, outlist3, predlist3 = tester3_D2D3.test(tester3_D2D3.ldr_tsta)
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D1a).')
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D1a).')
    
    lbllist1, outlist1, predlist1 = tester1_D2D3.test(tester1_D2D3.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D2D3.test(tester2_D2D3.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D2D3.test(tester3_D2D3.ldr_tstb)
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D1b).')
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D1b).')
    
    #for i in range(len(lbllist1)) :
    #    print(lbllist1[i], " ", lbllist2[i], " ", lbllist3[i])
    
    tester1_D1D3 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D1D3") # Instantiate the tester 
    tester2_D1D3 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D1D3") # Instantiate the tester 
    tester3_D1D3 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D1D3") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D1D3.test(tester1_D1D3.ldr_tsta)
    lbllist2, outlist2, predlist2 = tester2_D1D3.test(tester2_D1D3.ldr_tsta)
    lbllist3, outlist3, predlist3 = tester3_D1D3.test(tester3_D1D3.ldr_tsta)
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D2a).')
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D2a).')
    lbllist1, outlist1, predlist1 = tester1_D1D3.test(tester1_D1D3.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D1D3.test(tester2_D1D3.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D1D3.test(tester3_D1D3.ldr_tstb)
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D2b).')
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D2b).')
    
    
    tester1_D1D2 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D1D2") # Instantiate the tester 
    tester2_D1D2 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D1D2") # Instantiate the tester 
    tester3_D1D2 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D1D2") # Instantiate the tester 
    lbllist1, outlist1, predlist1 = tester1_D1D2.test(tester1_D1D2.ldr_tsta)
    lbllist2, outlist2, predlist2 = tester2_D1D2.test(tester2_D1D2.ldr_tsta)
    lbllist3, outlist3, predlist3 = tester3_D1D2.test(tester3_D1D2.ldr_tsta)
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D3a).')
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D3a).')
    lbllist1, outlist1, predlist1 = tester1_D1D2.test(tester1_D1D2.ldr_tstb)
    lbllist2, outlist2, predlist2 = tester2_D1D2.test(tester2_D1D2.ldr_tstb)
    lbllist3, outlist3, predlist3 = tester3_D1D2.test(tester3_D1D2.ldr_tstb)
    z, p_value = delong_test(lbllist1.numpy(), outlist1.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline1 and SupCon (Test Set = D3b).')
    z, p_value = delong_test(lbllist1.numpy(), outlist2.numpy(), outlist3.numpy())
    print(f'The p-value of {p_value} is between Baseline2 and SupCon (Test Set = D3b).')
    
    
    
if __name__ == '__main__':
    # Define your desired argument values
    args_list = ['--data_dir', '../data/']
    
    args = parse_arguments_from_list(args_list)
    
    start = time.time()
    main(args)
    end = time.time()
    days, hours, minutes, seconds = getTime(end-start)
    print(f"\n{int(days)} day(s) {int(hours)} hour(s) {int(minutes)} minute(s) {int(seconds)} second(s)")



loading the data ...

Set1 nfiles: 6000 ,  Set2 nfiles: 3140 ,  Set3 nfiles: 3804

done ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

The p-value of 0.9390213328968655 is between Baseline1 and SupCon (Test Set = D1a).
The p-value of 0.7401724286359563 is between Baseline2 and SupCon (Test Set = D1a).
The p-value of 0.9405679594159173 is between Baseline1 and SupCon (Test Set = D1b).
The p-value of 0.7365170938916059 is between Baseline2 and SupCon (Test Set = D1b).
Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

The p-value of 0.9195823127288063 is between Baseline1 and SupCon (Test Set = D2a).
The p-value of 0.5876662225043936 is between Baseline2 and SupCon (Test Set = D2a).
The p-value of 0.9319718000559429 is between Baseline1 and SupCon (Test Set = D2b).
The p-value of 0.5860705537889579 is between Baseline2 and SupCon (Test Set = D2b).
Device:  cud

In [2]:
# Define your desired argument values
args_list = ['--data_dir', '../data/']
args = parse_arguments_from_list(args_list)
# Load the dataframe from a file and return the loaded dataframe
[df1_trn, df1_vld, df1_a, df1_b], [df2_trn, df2_vld, df2_a, df2_b], [df3_trn, df3_vld, df3_a, df3_b] = load_data(args)

tester1_D2D3 = Tester1(df1_b, df1_a, args.data_dir, setup = "Baseline1", exp = "D2D3") # Instantiate the tester 
tester2_D2D3 = Tester2(df1_b, df1_a, args.data_dir, setup = "Baseline2", exp = "D2D3") # Instantiate the tester 
tester3_D2D3 = Tester3(df1_b, df1_a, args.data_dir, setup = "SupCon", exp = "D2D3") # Instantiate the tester 
lbllist1, outlist1, predlist1 = tester1_D2D3.test(tester1_D2D3.ldr_tsta)
lbllist2, outlist2, predlist2 = tester2_D2D3.test(tester2_D2D3.ldr_tsta)
lbllist3, outlist3, predlist3 = tester3_D2D3.test(tester3_D2D3.ldr_tsta)


loading the data ...

Set1 nfiles: 6000 ,  Set2 nfiles: 3140 ,  Set3 nfiles: 3804

done ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...

Device:  cuda
loading the model ...



In [6]:
outlist1

tensor([ 23.1852,  -3.6657,  22.4458,  ..., -14.8234, -11.8361, -12.9829])

In [7]:
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import norm

# Step 1: Obtain predicted probabilities for each model
model1_probs = outlist1# predicted probabilities for model 1
model2_probs = outlist3# predicted probabilities for model 2

# Step 2: Extract true labels
true_labels = lbllist1# true labels from test dataset

# Step 3: Compute AUCs
auc_model1 = roc_auc_score(true_labels, model1_probs)
auc_model2 = roc_auc_score(true_labels, model2_probs)

# Step 4: Perform DeLong test
# Calculate the differences in predicted probabilities
diff_probs = model1_probs - model2_probs

# Calculate the variance of the differences
var_diff_probs = np.var(diff_probs)

# Calculate the AUC covariance
cov_auc = np.cov(true_labels, diff_probs)[0, 1]

# Calculate the Z-score
z_score = cov_auc / np.sqrt(var_diff_probs)

# Calculate the p-value
p_value = 2 * (1 - norm.cdf(np.abs(z_score)))

# Print the AUCs and p-value
print("AUC Model 1:", auc_model1)
print("AUC Model 2:", auc_model2)
print("p-value:", p_value)

TypeError: var() received an invalid combination of arguments - got (ddof=int, dtype=NoneType, out=NoneType, axis=NoneType, ), but expected one of:
 * (tuple of ints dim, bool unbiased, bool keepdim)
 * (tuple of ints dim, *, int correction, bool keepdim)
      didn't match because some of the keywords were incorrect: ddof, dtype, out, axis
 * (bool unbiased)
 * (tuple of names dim, bool unbiased, bool keepdim)
 * (tuple of names dim, *, int correction, bool keepdim)
      didn't match because some of the keywords were incorrect: ddof, dtype, out, axis
