This notebook reproduces the synthetic results of the paper. First, we create a synthetic population in which groups differ in how the disease is expressed. Then, we enforce missingness following three scenarios of clinical presence, i.e. the interaction between patients and the healthcare system:
- (Mis)-informed collection
- Limited access to quality care
- Confirmation bias

In [None]:
from utils import *
import numpy as np

In [None]:
# Number of random repetitions (passed as `k` to k_experiment in the cells below)
k = 100

# Data distribution
points = 100000 # Number of points for the majority group
ratio = 0.01 # Proportion of minority
class_balance = 0.66 # Class balance between positives and negatives (for paper, if generate_data_linear_shift then 0.66) [0.1, 0.5] <- presumably alternative values; TODO confirm

generate = generate_data_linear_shift # How to generate the data: generate_data_linear_shift or generate_data_same or generate_data_linear_corr_shift
frontier = 0.5 # Feature threshold above which points may go missing; read by `misinformed` and `confirmation` below (for paper, if generate_data_linear_shift then 0.5)


### Generate data

First, we create the data from three Gaussians: one for the positives and two for the negatives (one for the minority and one for the majority). The same function is then called at each of the $k$ iterations.

In [None]:
# Draw one synthetic dataset with the configured generator and plot it.
# Note that `class_balance` is not passed in this call, only the size and ratio.
data, labels, protected_binarized, protected = generate(majority_size = points, ratio = ratio)
display_data(data, labels, protected, distribution = True, legend = True)

In [None]:
# Imputation strategies to compare: display name -> keyword arguments that are
# forwarded to k_experiment (via **args) in the experiment loops.
imputations = {
    # Mean-based strategies
    'Population Mean': {'strategy': 'Mean'},
    'Group Mean': {'strategy': 'Group Mean'},
    # MICE-based strategies
    'MICE': {'strategy': 'MICE'},
    'Group MICE': {'strategy': 'Group MICE'},
    'Group MICE Missing': {'strategy': 'Group MICE', 'add_missing': True},
}

# One transparency value per entry of `imputations`, in the same order
# (passed as `alphas` to display_result).
alphas = [0.35, 1, 0.35, 1, 1]

### Limited access to quality care

Care is more limited in the marginalised group. Missingness is therefore concentrated in this group.

In [None]:
def limited_access(data, labels, protected, seed = 42):
    """Remove feature 0 for half of the rows flagged as "Minority".

    Parameters
    ----------
    data : DataFrame whose column labelled 0 receives the missing values.
    labels : unused here; kept so that all removal functions share a signature.
    protected : group membership aligned with `data`; rows equal to
        "Minority" are the only candidates for removal.
    seed : random state used to draw the rows to blank out.

    Returns
    -------
    A copy of `data` in which column 0 is NaN for the sampled rows. The input
    frame is left untouched.
    """
    # Indicator (1.0 / 0.0) of minority membership, used as sampling weight
    is_minority = (protected == "Minority").astype(float)
    n_minority = is_minority.sum()

    # Draw 50 % of the minority rows; majority rows have weight zero
    n_removed = int(n_minority * 0.5)
    dropped = data.sample(n_removed, replace = False, weights = is_minority / n_minority, random_state = seed).index

    result = data.copy()
    result.loc[dropped, 0] = np.nan
    return result

In [None]:
# Result containers, keyed by imputation name
performance_lim, reconstruction_lim = {}, {}

for name, args in imputations.items():
    print("Computing: ", name)
    ## Modelling
    # MICE-based strategies are run with 10 imputations, the others with one
    n_draws = 10 if 'MICE' in name else 1
    outcome = k_experiment(majority_size = points, ratio = ratio, class_balance = class_balance,
            generate = generate, removal = limited_access, k = k, n_imputation = n_draws, **args)

    # Unpack: performance, coefficients, a 5-tuple of summaries, imputed data.
    # The *_lim summaries are overwritten at every iteration, so only those of
    # the last imputation strategy remain available afterwards.
    performance_lim[name], coefs, summary, imputed = outcome
    reconstruction_lim[name], mean_observed_lim, obs_rate_lim, corr_lim, corr_cov = summary
    ## Display
    # data, imputed, labels, protected_binarized, protected = imputed
    # display_data(imputed.Mean, labels, protected, distribution = True, legend = False)
    # plt.scatter([], [], alpha = 0, label = ' ')
    # plt.axline((0, coefs[0]), slope = coefs[1], c = 'black', ls = '-.', label = 'Decision boundary')
    # if name == 'Group MICE':
    #     plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.show()

In [None]:
# Plot the limited-access results using display_result's default metric
display_result(performance_lim, alphas = alphas)

In [None]:
# Same results for the 'Brier Score' metric, without a legend
display_result(performance_lim, 'Brier Score', legend = False, alphas = alphas)

### Mis-informed collection

Missingness is informed by the standard guidelines. We propose that the first dimension is observed only if the second is in a given range.

In [None]:
def misinformed(data, labels, groups, seed = 42): # Must respect this signature
    """Remove feature 0 for half of the rows whose second feature exceeds `frontier`.

    Parameters
    ----------
    data : DataFrame; the second column (by position) drives the missingness
        and the column labelled 0 receives the missing values.
    labels, groups : unused here; kept so that all removal functions share a
        signature.
    seed : random state used to draw the rows to blank out.

    Returns
    -------
    A copy of `data` in which column 0 is NaN for the sampled rows.
    """
    # Indicator (1.0 / 0.0) of rows above the module-level threshold `frontier`
    above = (data.iloc[:, 1] > frontier).astype(float)
    n_above = above.sum()

    # Draw 50 % of the candidate rows; all other rows have weight zero
    n_removed = int(n_above * 0.5)
    dropped = data.sample(n_removed, replace = False, weights = above / n_above, random_state = seed).index

    result = data.copy()
    result.loc[dropped, 0] = np.nan
    return result

In [None]:
# Result containers, keyed by imputation name
performance_mis, reconstruction_mis = {}, {}

for name, args in imputations.items():
    print("Computing: ", name)
    ## Modelling
    # MICE-based strategies are run with 10 imputations, the others with one
    n_draws = 10 if 'MICE' in name else 1
    outcome = k_experiment(majority_size = points, ratio = ratio, class_balance = class_balance,
            generate = generate, removal = misinformed, k = k, n_imputation = n_draws, **args)

    # Unpack: performance, coefficients, a 5-tuple of summaries, imputed data.
    # The *_mis summaries are overwritten at every iteration, so only those of
    # the last imputation strategy remain available afterwards.
    performance_mis[name], coefs, summary, imputed = outcome
    reconstruction_mis[name], mean_observed_mis, obs_rate_mis, corr_mis, corr_cov = summary

    ## Display
    # data, imputed, labels, protected_binarized, protected = imputed
    # display_data(imputed.Mean, labels, protected, distribution = True, legend = False)
    # plt.scatter([], [], alpha = 0, label = ' ')
    # plt.axline((0, coefs[0]), slope = coefs[1], c = 'black', ls = '-.', label = 'Decision boundary')
    # if name == 'Group MICE':
    #     plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.show()

In [None]:
# Plot the mis-informed collection results using display_result's default metric
display_result(performance_mis, alphas = alphas)

In [None]:
# Same results for the 'Brier Score' metric, without a legend
display_result(performance_mis, 'Brier Score', legend = False, alphas = alphas)

### Confirmation bias

The test is performed only when the outcome is expected.

In [None]:
def confirmation(data, labels, protected, seed = 42):
    """Remove feature 0 for half of the rows whose first feature exceeds `frontier`.

    Parameters
    ----------
    data : DataFrame; the first column (by position) drives the missingness
        and the column labelled 0 receives the missing values.
    labels, protected : unused here; kept so that all removal functions share
        a signature.
    seed : random state used to draw the rows to blank out.

    Returns
    -------
    A copy of `data` in which column 0 is NaN for the sampled rows.
    """
    # Indicator (1.0 / 0.0) of rows above the module-level threshold `frontier`
    above = (data.iloc[:, 0] > frontier).astype(float)
    n_above = above.sum()

    # Draw 50 % of the candidate rows; all other rows have weight zero
    n_removed = int(n_above * 0.5)
    dropped = data.sample(n_removed, replace = False, weights = above / n_above, random_state = seed).index

    result = data.copy()
    result.loc[dropped, 0] = np.nan
    return result

In [None]:
# Result containers, keyed by imputation name
performance_conf, reconstruction_conf = {}, {}

for name, args in imputations.items():
    print("Computing: ", name)
    ## Modelling
    # MICE-based strategies are run with 10 imputations, the others with one
    n_draws = 10 if 'MICE' in name else 1
    outcome = k_experiment(majority_size = points, ratio = ratio, class_balance = class_balance,
            generate = generate, removal = confirmation, k = k, n_imputation = n_draws, **args)

    # Unpack: performance, coefficients, a 5-tuple of summaries, imputed data.
    # The *_conf summaries are overwritten at every iteration, so only those of
    # the last imputation strategy remain available afterwards.
    performance_conf[name], coefs, summary, imputed = outcome
    reconstruction_conf[name], mean_observed_conf, obs_rate_conf, corr_conf, corr_cov = summary

    ## Display
    # data, imputed, labels, protected_binarized, protected = imputed
    # display_data(imputed.Mean, labels, protected, distribution = True, legend = False)
    # plt.scatter([], [], alpha = 0, label = ' ')
    # plt.axline((0, coefs[0]), slope = coefs[1], c = 'black', ls = '-.', label = 'Decision boundary')
    # if name == 'Group MICE':
    #     plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.show()

In [None]:
# Plot the confirmation-bias results using display_result's default metric
display_result(performance_conf, alphas = alphas)

In [None]:
# Same results for the 'Brier Score' metric (unlike the two other scenarios, `legend` is left at its default here)
display_result(performance_conf, 'Brier Score', alphas = alphas)

----------

# Comparison minority groups

The following cells reproduce the tables and plots presented in the paper.

In [None]:
def _stack_scenarios(group):
    """For each imputation, gather the `group` entry of the three scenarios side by side."""
    return {
        m: pd.concat({
            "Confirmation \n bias (S3)": performance_conf[m][group],
            "(Mis)-Informed \n collection (S2)": performance_mis[m][group],
            "Limited access \n to quality care (S1)": performance_lim[m][group],
        }, axis = 1)
        for m in performance_lim
    }

# One dictionary per population: imputation name -> frame with one column per scenario
performances_minority = _stack_scenarios('Minority')
performances_majority = _stack_scenarios('Majority')
performances_overall = _stack_scenarios('Overall')

In [None]:
metric = 'AUC'

def _minority_majority_gap(imput):
    """Mean and std of (minority - majority) for `metric`, for one imputation strategy."""
    minority = performances_minority[imput]
    majority = performances_majority[imput]
    # Rows whose second index level equals `metric`. The mask is built from the
    # minority frame and reused for the majority one, which assumes both frames
    # share the same row layout.
    rows = minority.index.get_level_values(1) == metric
    gap = minority[rows] - majority[rows]
    return pd.concat({'Mean': gap.mean(), 'Std': gap.std()}, axis = 1)

difference = pd.concat({imput: _minority_majority_gap(imput) for imput in performances_overall}, axis = 1)
# Put the Mean / Std level first so that difference['Mean'] selects all imputations
difference = difference.swaplevel(0, axis = 1)
print_pandas_latex(difference['Mean'], difference['Std'])

In [None]:
def _per_scenario(conf, mis, lim):
    """Concatenate the three scenario results column-wise under their display names."""
    return pd.concat({
        "Confirmation \n bias (S3)": conf,
        "(Mis)-Informed \n collection (S2)": mis,
        "Limited access \n to quality care (S1)": lim,
    }, axis = 1)

# Summaries returned by the last k_experiment call of each scenario
means = _per_scenario(mean_observed_conf, mean_observed_mis, mean_observed_lim)
obs = _per_scenario(obs_rate_conf, obs_rate_mis, obs_rate_lim)
corr = _per_scenario(corr_conf, corr_mis, corr_lim)

# Reconstruction results: one per-scenario frame per imputation, stacked row-wise
error = pd.concat({
    imputation: _per_scenario(reconstruction_conf[imputation],
                              reconstruction_mis[imputation],
                              reconstruction_lim[imputation])
    for imputation in imputations
})
# Swap the two outer row levels, then transpose so they become column levels
error = error.swaplevel(0, 1, axis = 0).T

In [None]:
# LaTeX table of the reconstruction results (Mean and Std passed separately)
print_pandas_latex(error['Mean'], error['Std'])

In [None]:
# LaTeX table built from the `mean_observed_*` summaries, transposed
print_pandas_latex(means.loc['Mean'].T, means.loc['Std'].T)

In [None]:
# LaTeX table built from the `obs_rate_*` summaries, transposed
print_pandas_latex(obs.loc['Mean'].T, obs.loc['Std'].T)

In [None]:
# LaTeX table built from the `corr_*` summaries, transposed
print_pandas_latex(corr.loc['Mean'].T, corr.loc['Std'].T)

In [None]:
# Display delta performance: horizontal bars of the 'Difference' entries of
# `error`, with error bars of 1.96 * Std / sqrt(k).
difference_error = error.loc[:, error.columns.get_level_values(2) == 'Difference'].droplevel(2, 'columns')
ax1 = difference_error.Mean.plot.barh(xerr = 1.96 * difference_error.Std / np.sqrt(k), width = 0.7, figsize = (6.4, 4.8))
hatches = ['', 'ooo', 'xx', '//', '||', '***', '++'] * 2
for i, thisbar in enumerate(ax1.patches):
    # `i // len(difference_error)` is used as the index of the plotted column
    # the bar belongs to (assumes ax1.patches lists bars column after column).
    c = list(plt_colors.to_rgba('tab:blue'))
    c[3] = 0.35 if i // len(difference_error) in [0,2] else 1
    thisbar.set(edgecolor = '#eaeaf2', facecolor = c, linewidth = 1, hatch = hatches[i // len(difference_error)])

# Keep one bar per plotted column (reversed) as legend handles; they are
# reused by the next cell.
patches = [ax1.patches[i * len(difference_error)] for i in range(len(difference_error.Mean.columns))][::-1]
labels = difference_error.Mean.columns.tolist()[::-1]

# Hide the legend on this figure
ax1.legend([], [], framealpha = 0)
plt.axvline(0, ls = '--', alpha = 0.5, c = 'k')
# Raw string: '\D' is an invalid escape sequence in a regular string literal
plt.xlabel(r'$\Delta$ Reconstruction')
plt.show()

In [None]:
# Reconstruction error per group, drawn as markers on a shared axes.
# Relies on `patches`, `labels` and `ax1` created by the previous cell.
ax, last = None, 0
# Invisible handle used as a spacer in the legend (np.nan: the np.NaN alias
# was removed in NumPy 2.0)
patches += ax1.plot(np.nan, np.nan, '-', color='none')
labels += [' ']
for (color, group, name) in [('tab:orange', 'Majority', 'Majority'), ('tab:blue', 'Minority', 'Marginalised'), ('tab:gray', 'Overall', 'Overall')]:
    mean = error.loc[:, error.columns.get_level_values(2) == group].droplevel(2, 'columns')
    # Error bars come from this group's own Std over the k repetitions
    # (previously the Std of the AUC gap, divided by a hard-coded sqrt(100))
    ax = mean.Mean.plot.barh(ax = ax, legend = False, xerr = 1.96 * mean.Std / np.sqrt(k), width = 0.7, ecolor = color, error_kw = {"alpha": 0.25, 'elinewidth': 3}, figsize = (3.2, 4.8))

    # Remove bar and replace with dot. The axes is shared between iterations,
    # so ax.patches also holds the bars of previous groups: `last` counts the
    # bars already converted and only the new ones are processed.
    for i, thisbar in enumerate(ax.patches):
        if i >= last:
            thisbar.set(alpha = 0)
            dot = ax.scatter(thisbar.get_width(), thisbar.get_y() + thisbar.get_height() / 2, alpha = 0.7,
                          marker = ('|' if name != 'Overall' else 'x'), s = 125, color = color, linewidths=3)
            last += 1

    # One marker per group serves as its legend handle
    patches += [dot]
    labels += [name]

ax.set_yticklabels([' ', ' ', ' '])
ax.legend(patches, labels, loc='upper left', bbox_to_anchor=(1.3, 1.04), frameon = False,
        handletextpad = 0.5, handlelength = 1.0, columnspacing = -0.5,)
ax.set_xlabel('Reconstruction Error')

In [None]:
# Horizontal bars of the minority - majority gap stored in `difference`,
# with error bars of 1.96 * Std / sqrt(k).
ax1 = difference.Mean.plot.barh(xerr = 1.96 * difference.Std / np.sqrt(k), width = 0.7, figsize = (6.4, 4.8))
hatches = ['', 'ooo', 'xx', '//', '||', '***', '++'] * 2
for i, thisbar in enumerate(ax1.patches):
    # `i // len(difference)` is used as the index of the plotted column the
    # bar belongs to (assumes ax1.patches lists bars column after column).
    c = list(plt_colors.to_rgba('tab:blue'))
    c[3] = 0.35 if i // len(difference) in [0,2] else 1
    thisbar.set(edgecolor = '#eaeaf2', facecolor = c, linewidth = 1, hatch = hatches[i // len(difference)])

# Destroy legend but keep for next
patches = [ax1.patches[i * len(difference)] for i in range(len(difference.Mean.columns))][::-1]
labels = difference.Mean.columns.tolist()[::-1]
ax1.legend([], [], framealpha = 0)
plt.xlim(-0.6, 0.02)
plt.axvline(0, ls = '--', alpha = 0.5, c = 'k')
# Raw string: '\D' is an invalid escape sequence in a regular string literal
plt.xlabel(r'$\Delta$ AUC')
plt.show()

In [None]:
# Group-specific value of `metric`, drawn as markers on a shared axes.
# Relies on `patches`, `labels` and `ax1` created by the previous cell.
ax, last = None, 0
# Invisible handle used as a spacer in the legend (np.nan: the np.NaN alias
# was removed in NumPy 2.0)
patches += ax1.plot(np.nan, np.nan, '-', color='none')
labels += [' ']
for (group, color, name) in [(performances_majority, 'tab:orange', 'Majority'), (performances_minority, 'tab:blue', 'Marginalised'), (performances_overall, 'tab:gray', 'Overall')]:
    # Mean and std of the rows matching `metric`, per imputation
    mean = {
        imput: pd.concat({'Mean': group[imput][group[imput].index.get_level_values(1) == metric].mean(),
            'Std': group[imput][group[imput].index.get_level_values(1) == metric].std()}, axis = 1)
        for imput in performances_overall
    }
    mean = pd.concat(mean, axis = 1).swaplevel(0, axis = 1)
    # Error bars come from this group's own Std over the k repetitions
    # (previously the Std of the minority - majority gap, divided by a
    # hard-coded sqrt(100), which left the Std computed above unused)
    ax = mean.Mean.plot.barh(ax = ax, legend = False, xerr = 1.96 * mean.Std / np.sqrt(k), width = 0.7, ecolor = color, error_kw = {"alpha": 0.25, 'elinewidth': 3}, figsize = (3.2, 4.8))

    # Remove bar and replace with dot. The axes is shared between iterations,
    # so ax.patches also holds the bars of previous groups: `last` counts the
    # bars already converted and only the new ones are processed.
    for i, thisbar in enumerate(ax.patches):
        if i >= last:
            thisbar.set(alpha = 0)
            dot = ax.scatter(thisbar.get_width(), thisbar.get_y() + thisbar.get_height() / 2, alpha = 0.7,
                          marker = ('|' if name != 'Overall' else 'x'), s = 125, color = color, linewidths=3)
            last += 1

    # One marker per group serves as its legend handle
    patches += [dot]
    labels += [name]

ax.set_yticklabels([' ', ' ', ' '])
ax.set_xlim(0.35, 1.05)
ax.legend(patches, labels, loc='upper left', bbox_to_anchor=(1.3, 1.04), frameon=False,
        handletextpad = 0.5, handlelength = 1.0, columnspacing = -0.5,)
ax.set_xlabel('Group-specific AUC')
