In [21]:
import pandas as pd
import scipy.stats
from sklearn.metrics import roc_curve, auc
import numpy as np
import compare_auc_delong_xu
import random
import matplotlib.pyplot as plt
from matplotlib.transforms import blended_transform_factory
from scipy.stats import beta
from ipywidgets import interact, FloatSlider, Layout

In [22]:
#!pipreqsnb .

In [23]:

def delong_p_value(y, y_hat_1, y_hat_2):
    """Return the p-value of the DeLong test comparing two sets of predictions.

    The value produced by ``compare_auc_delong_xu.delong_roc_test`` is used
    here as a base-10 exponent, i.e. it is treated as log10(p-value).

    Parameters
    ----------
    y : labels passed as the first argument of ``delong_roc_test``.
    y_hat_1, y_hat_2 : the two prediction vectors being compared.
    """
    log10_result = compare_auc_delong_xu.delong_roc_test(y, y_hat_1, y_hat_2)
    # Pull the single number out of the returned array before exponentiating.
    exponent = log10_result[[0]].item()
    return 10 ** exponent

def roc_measures(Y, Y_hat):
    """Compute the ROC curve for scores ``Y_hat`` against labels ``Y``.

    Returns
    -------
    tuple
        ``(fpr, tpr, thresholds, roc_auc)``: the three outputs of
        ``roc_curve`` followed by the area under that curve.
    """
    curve = roc_curve(Y, Y_hat)
    false_pos_rate, true_pos_rate = curve[0], curve[1]
    area = auc(false_pos_rate, true_pos_rate)
    return (*curve, area)

In [24]:
def beta_to_predata(a1, b1, a0, b0, n_sim = 1000):
    """Simulate predicted probabilities for cases and controls.

    Parameters
    ----------
    a1, b1 : Beta parameters used to draw the predictions of the cases.
    a0, b0 : Beta parameters used to draw the predictions of the controls.
    n_sim : number of cases and also number of controls (cast to ``int``).

    Returns
    -------
    numpy.ndarray of shape ``(2 * n_sim, 2)``
        Column 0 is the label (1 for the first ``n_sim`` rows, 0 for the
        rest); column 1 is the simulated prediction.
    """
    n_per_group = int(n_sim)

    # Cases are drawn before controls so the global NumPy random stream is
    # consumed in the same order on every call.
    case_scores = np.random.beta(a1, b1, size=n_per_group)
    control_scores = np.random.beta(a0, b0, size=n_per_group)

    labels = np.repeat([1, 0], n_per_group)
    scores = np.concatenate([case_scores, control_scores])

    return np.column_stack((labels, scores))

def predata_to_data(predataA, predataB):
    """Append the second column of ``predataB`` to ``predataA``.

    The result keeps every column of ``predataA`` and gains one extra,
    right-most column holding ``predataB[:, 1]``.
    """
    extra_column = predataB[:, 1]
    return np.column_stack((predataA, extra_column))

# test
#predataA = beta_to_predata(1, 1, 1, 1, 1000)
#predataB = beta_to_predata(1, 1, 1, 1, 1000)
#data = predata_to_data(predataA, predataB)
#data

In [25]:
def predata_to_auc(predata):
    """Return the AUC obtained by scoring column 1 of ``predata`` against column 0."""
    labels, scores = predata[:, 0], predata[:, 1]
    # roc_measures returns (fpr, tpr, thresholds, roc_auc); only the last is needed.
    return roc_measures(labels, scores)[-1]

In [26]:
def data_to_pvals(data, n_sim, sample_sizes):
    """Resample ``data`` repeatedly and compute a DeLong p-value per sample size.

    Parameters
    ----------
    data : 2-D array; column 0 is passed to ``delong_p_value`` as ``y`` and
        columns 1 and 2 as the two prediction vectors.
    n_sim : number of resampling rounds.
    sample_sizes : iterable of sample sizes; each round draws one resample
        (row indices chosen with replacement) per size.

    Returns
    -------
    numpy.ndarray
        One row per round and one column per entry of ``sample_sizes``.
    """
    row_pool = range(len(data))

    def one_round():
        # Draw every resample of the round first, then evaluate them.
        draws = [random.choices(row_pool, k=size) for size in sample_sizes]
        return [
            delong_p_value(data[rows, 0], data[rows, 1], data[rows, 2])
            for rows in draws
        ]

    return np.array([one_round() for _ in range(n_sim)])

#data_to_pvals(data, 1000, np.array([100, 200, 300]))

In [27]:
def plot_beta(a1, b1, a2, b2, n_sim):
    """Plot the case and control Beta densities of a single model (Model A).

    NOTE: a later cell of this notebook defines another ``plot_beta`` with a
    different signature; once that cell has run, it replaces this function.

    Parameters
    ----------
    a1, b1 : Beta parameters of the curve labelled 'Cases'.
    a2, b2 : Beta parameters of the curve labelled 'Controls'.
    n_sim : number of simulated cases (and controls) used to estimate the
        AUC reported in the title.
    """
    grid = np.linspace(0, 1, 1000)

    plt.figure(figsize=(6, 6))
    plt.plot(grid, beta.pdf(grid, a1, b1), lw=2, label='Cases')
    plt.plot(grid, beta.pdf(grid, a2, b2), lw=2, label='Controls')
    plt.xlabel('Predicted probability')
    plt.ylabel('Density')
    plt.grid(True)

    pre_dataA = beta_to_predata(a1, b1, a2, b2, n_sim)
    # The original call was data_to_auc(pre_dataA, mod='A'), a name that is not
    # defined anywhere in the visible notebook (NameError). predata_to_auc is
    # the helper that turns this array into an AUC.
    auc_res = predata_to_auc(pre_dataA)

    plt.legend()
    plt.title(f'Beta distributions with ($\\alpha_1$={a1}, $\\beta_1$={b1}), and ($\\alpha_2$={a2}, $\\beta_2$={b2})\nThe AUC is {auc_res:.2f} (based on {n_sim} simulations)')
    # set a subtitle
    plt.suptitle('Model A')
    plt.show()
    
#plot_beta(2,3,4,5, 1000)

In [28]:
def _draw_beta_panel(ax, a0, b0, a1, b1, n_sim):
    """Draw the control/case Beta densities on ``ax`` and simulate one model.

    Returns ``(predata, auc)``: the array produced by ``beta_to_predata`` for
    these parameters and the AUC computed from it by ``predata_to_auc``.
    """
    grid = np.linspace(0, 1, 1000)
    ax.plot(grid, beta.pdf(grid, a0, b0), lw=2, label='Controls')
    ax.plot(grid, beta.pdf(grid, a1, b1), lw=2, label='Cases')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('Density')
    ax.grid(True)
    ax.legend()

    # beta_to_predata expects the case parameters first, then the controls.
    predata = beta_to_predata(a1, b1, a0, b0, n_sim)
    return predata, predata_to_auc(predata)


def plot_beta(n_sim, alpha_t,
              ss,
              a0A, b0A, a1A, b1A,
              a0B, b0B, a1B, b1B,
              ):
    """Plot both models' Beta densities and the simulated power of the DeLong test.

    Parameters
    ----------
    n_sim : number of simulated cases/controls per model and number of
        resampling rounds (cast to ``int``).
    alpha_t : significance threshold; a round counts as a detection when its
        p-value is below this value.
    ss : central sample size; power is evaluated at 0.5*ss, ss and 1.5*ss.
    a0A, b0A, a1A, b1A : Beta parameters of controls (0) and cases (1), Model A.
    a0B, b0B, a1B, b1B : Beta parameters of controls (0) and cases (1), Model B.
    """
    n_sim = int(n_sim)

    plt.figure(figsize=(12, 12))

    # Model A (simulated before Model B so the random stream order is stable)
    ax1 = plt.subplot2grid((3, 2), (0, 0))
    pre_dataA, auc_resA = _draw_beta_panel(ax1, a0A, b0A, a1A, b1A, n_sim)
    ax1.set_title(f'Beta distributions for Model A\nControls: ($\\alpha_0$={a0A:.1f}, $\\beta_0$={b0A:.1f}); Cases: ($\\alpha_1$={a1A:.1f}, $\\beta_1$={b1A:.1f})\nBased on {n_sim} simulations the AUC is {auc_resA:.2f}')

    # Model B
    ax2 = plt.subplot2grid((3, 2), (0, 1))
    pre_dataB, auc_resB = _draw_beta_panel(ax2, a0B, b0B, a1B, b1B, n_sim)
    ax2.set_title(f'Beta distributions for Model B\nControls: ($\\alpha_0$={a0B:.1f}, $\\beta_0$={b0B:.1f}); Cases: ($\\alpha_1$={a1B:.1f}, $\\beta_1$={b1B:.1f})\n Based on {n_sim} simulations the AUC is {auc_resB:.2f} ')

    # Simulation plot for power
    ax3 = plt.subplot2grid((3, 2), (1, 0), colspan=2)

    data = predata_to_data(pre_dataA, pre_dataB)
    sample_sizes = np.array([int(.5*ss), int(ss), int(1.5*ss)])
    sim_res = data_to_pvals(data, n_sim, sample_sizes)
    mean_pvals = sim_res.mean(axis=0)
    powers = (sim_res < alpha_t).mean(axis=0)

    # This is the log of the mean p-value (not the mean of the log p-values);
    # the legend labels below are worded accordingly.
    log_mean_pvals = np.log(mean_pvals)

    # One x value per simulated p-value, grouped by sample size
    x = np.repeat(sample_sizes, n_sim)

    # Add some jitter to the x values
    size_step = sample_sizes[1] - sample_sizes[0]
    jit = np.random.uniform(-size_step*.1, size_step*.1, len(x))
    x = x + jit

    # We'll show the p-values on a log scale; column-major flattening keeps the
    # p-values in the same sample-size grouping as x.
    y = np.log(sim_res.flatten(order='F'))

    ax3.scatter(x, y, alpha=0.025)

    # add the log of the mean p-values as dots
    ax3.scatter(sample_sizes, log_mean_pvals, s=50, label='log(mean p-value)')

    # add a regression line between sample_sizes and the log of the mean p-values
    m, b = np.polyfit(sample_sizes, log_mean_pvals, 1)
    ax3.plot(sample_sizes, m*sample_sizes + b, color='red', label='Line fitted on log(mean p-value)')

    # Add a title
    ax3.text(0.5, 1.2, f'Power to detect a difference in discrimination (based on {n_sim} simulations)', transform=ax3.transAxes, ha='center', va='bottom', fontsize=14)

    # add power as text: x in data coordinates, y in axes coordinates
    transform = blended_transform_factory(ax3.transData, ax3.transAxes)
    for size, power in zip(sample_sizes, powers):
        ax3.text(size, 1.02, f'At n={size}\nPower is {power*100:.1f}%', transform=transform, ha='center', va='bottom')

    # add a horizontal line at p-val=alpha_t
    ax3.axhline(np.log(alpha_t), color='black', linestyle='--', label=f'{alpha_t:.2f} alpha threshold')

    # add a horizontal line at p-val=1
    ax3.axhline(np.log(1), color='black', linestyle='-', lw=2)

    # add a legend at the bottom left
    ax3.legend(loc='lower left')

    # set limits for the x axis
    ax3.set_xlim(sample_sizes[0]-size_step*.5, sample_sizes[-1] + size_step*.5)

    # add a label to the x-axis
    ax3.set_xlabel('Sample size (n)')

    # add a label to the y-axis
    ax3.set_ylabel('log(p-value)')

    # NOTE(review): tight_layout recomputes the subplot spacing, so the hspace
    # set on the line before it is probably overridden - confirm before removing.
    plt.subplots_adjust(hspace=-.25)
    plt.tight_layout()
    plt.show()

#plot_beta(1000, .06, 100, 2, 3, 4, 5, 6, 7, 8, 9)

In [29]:
def _full_width_slider(label, lo, hi, step, start, fit_label=True):
    """Build a full-width FloatSlider.

    When ``fit_label`` is true the slider also gets
    ``style={'description_width': 'initial'}``.
    """
    options = dict(min=lo, max=hi, step=step, value=start, description=label,
                   layout=Layout(width='100%'))
    if fit_label:
        options['style'] = {'description_width': 'initial'}
    return FloatSlider(**options)


# One slider per plot_beta argument; insertion order is the display order.
plot_beta_controls = {
    'n_sim': _full_width_slider('Number of simulations', 100, 10000, 100, 100),
    'alpha_t': _full_width_slider('Alpha threshold', 0.01, 1, 0.01, 0.05),
    # The sample-size slider is the only one created without the style option.
    'ss': _full_width_slider('Sample size', 50, 10000, 10, 200, fit_label=False),
    'a0A': _full_width_slider('Controls Model A (Alpha)', .1, 20, 0.1, 2),
    'b0A': _full_width_slider('Controls Model A (Beta)', .1, 20, 0.1, 3),
    'a1A': _full_width_slider('Cases Model A (Alpha)', .1, 20, 0.1, 4),
    'b1A': _full_width_slider('Cases Model A (Beta)', .1, 20, 0.1, 5),
    'a0B': _full_width_slider('Controls Model B (Alpha)', .1, 20, 0.1, 1.7),
    'b0B': _full_width_slider('Controls Model B (Beta)', .1, 20, 0.1, 3.3),
    'a1B': _full_width_slider('Cases Model B (Alpha)', .1, 20, 0.1, 4.3),
    'b1B': _full_width_slider('Cases Model B (Beta)', .1, 20, 0.1, 4.7),
}

interact(plot_beta, **plot_beta_controls);

interactive(children=(FloatSlider(value=100.0, description='Number of simulations', layout=Layout(width='100%'…

Contact: François Grolleau; grolleau [ a t ] stanford [ d o t ] edu <br>
© P3D (Power to Detect Differences in Discrimination). All Rights Reserved.