In [None]:
# Load libraries and functions
%load_ext autoreload
%autoreload 2
%matplotlib inline
RANDOM_STATE = 42  # Pseudo-random state

from utils import *
sns.set_palette("tab10") # Default seaborn theme

# Extra libraries for this notebook
import cmprsk
from cmprsk import utils
from cmprsk.cmprsk import cuminc
import scikit_posthocs as sph
from statannot import add_stat_annotation

In [None]:
# Load dataset: pick the matching pickle with the latest ctime
fn_vae_data = glob.glob('./Updated*.pkl')
if not fn_vae_data:
    # Without this guard max() fails with an opaque "max() arg is an empty sequence"
    raise FileNotFoundError("No dataset matching './Updated*.pkl' found in the working directory")
latest_fn_vae_data = max(fn_vae_data, key=os.path.getctime)

print("Loading... ",latest_fn_vae_data)
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted snapshots.
with open(latest_fn_vae_data, "rb") as f:
    vae_data_main = pickle.load(f)
print("Done")

# Define functions

In [None]:
######### Posthoc analysis for multiple groups by chi-square test

def get_asterisks_for_pval(p_val):
    """Map a p-value to a significance marker string.

    Returns "ns" when p_val > 0.05 or p_val is NaN, '****' when p_val < 1e-4,
    '***' when p_val < 1e-3, '**' when p_val < 1e-2 and '*' otherwise
    (i.e. 1e-2 <= p_val <= 0.05).
    """
    from math import isnan

    # NaN compares False against every threshold below, so without this guard
    # an undefined p-value would fall through to the final branch and get '*'.
    if isnan(p_val):
        return "ns"

    if p_val > 0.05:
        p_text = "ns"  # above threshold => not significant
    elif p_val < 1e-4:
        p_text = '****'
    elif p_val < 1e-3:
        p_text = '***'
    elif p_val < 1e-2:
        p_text = '**'
    else:
        p_text = '*'

    return p_text

def chisq_and_posthoc_corrected(df): #df is a contingency table
    """Run a chi2 test on a contingency table, then pairwise post-hoc tests.

    Prints the chi2 statistic and p-value of the whole table, then, for every
    pair of rows, the raw p-value, the p-value adjusted with
    ``multipletests(..., method='fdr_bh')`` and the significance stars of the
    adjusted p-value. Returns None.

    `chi` is not defined in this notebook; it is unpacked here as
    (statistic, p-value, dof, expected) -- presumably
    scipy.stats.chi2_contingency re-exported by `utils`; confirm.
    """
    # start by running chi2 test on the matrix
    chi2, p, dof, ex = chi(df, correction=True)
    print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

    # post-hoc: one chi2 test per pair of rows
    all_combinations = list(combinations(df.index, 2))
    print("Significance results:")
    p_vals = []
    for comb in all_combinations:
        new_df = df[(df.index == comb[0]) | (df.index == comb[1])]
        p_vals.append(chi(new_df, correction=True)[1])

    if not p_vals:
        # Fewer than two rows: there is no pair to compare or correct
        return

    # correction for multiple testing
    corrected_p_vals = multipletests(p_vals, method='fdr_bh')[1]
    for p_val, corr_p_val, comb in zip(p_vals, corrected_p_vals, all_combinations):
        # Stars must reflect the corrected p-value they are printed next to
        # (the previous version derived them from the uncorrected one).
        print(f"{comb}: p_value: {p_val:5f}; corrected: {corr_p_val:5f} ({get_asterisks_for_pval(corr_p_val)})")
        
        

# Hospital LOS

In [None]:
# Select outcome data for ICU admissions and individuals
# Group attribution is selected by hierarchy
# Aggregations are given as strings: passing the builtins max/min to .agg() is
# deprecated in recent pandas and is slated to stop dispatching to the
# NaN-aware groupby reductions.
df_admissions = vae_data_main[['los', 'day_in_icu_max', 'ID_subid', 'ID', 'outcome_death', 'date', 'group']]
df_admissions = df_admissions.groupby('ID_subid').agg({'los': 'max', 'day_in_icu_max': 'max', 'group': 'max',
                                 'date': 'min', 'ID': 'max', 'outcome_death': 'max',})
# Keep only the calendar year of the earliest date of each admission
df_admissions['date'] = df_admissions['date'].dt.year

# Collapse admissions to one row per individual (indexed by ID)
df_individuals = df_admissions.groupby('ID').agg({'los': 'max', 'day_in_icu_max': 'max', 'group': 'max',
                                 'date': 'min', 'outcome_death': 'max',})

#Drop Dual HARTI data - not included in the analysis due to small sample size
df_admissions = df_admissions.loc[~(df_admissions.group == "Dual HARTI")]
df_individuals = df_individuals.loc[~(df_individuals.group == "Dual HARTI")]


In [None]:
# Descriptive statistics of hospital LOS per group (statistics as rows, groups as columns)
df_individuals.groupby('group')[['los']].describe().T

In [None]:
# One-way ANOVA of hospital LOS across groups (normal distribution assumption)
lm = smf.ols('los ~ group', data=df_individuals).fit()
anova = sm.stats.anova_lm(lm)
print(anova)

# Kruskal test (non-parametric): one array of LOS values per group
data = [
    df_individuals.loc[member_ids, 'los'].values
    for member_ids in df_individuals.groupby('group').groups.values()
]
H, p = stats.kruskal(*data)
print('\nKruskal test p-value: ', p)

# Pairwise comparison of groups (non-parametric Conover test, Holm adjustment)
sph.posthoc_conover(
    df_individuals,
    val_col='los',
    group_col='group',
    p_adjust='holm',
)

### Dynamics of hospital LOS by groups and years

In [None]:
# Median hospital LOS per group (rows) for each year 2011-2020 (columns 0..9)
medians = {
    group: [
        df_individuals.loc[(df_individuals.group == group) & (df_individuals.date == year), 'los'].median()
        for year in range(2011, 2021)
    ]
    for group in df_individuals.group.unique()
}
los = pd.DataFrame.from_dict(medians).T

# Significance of the yearly trend per group: linregress receives the yearly
# medians as x and the year position 0..9 as y; only the p-value is kept
pvals = [
    linregress(yearly_medians, np.arange(len(yearly_medians))).pvalue
    for _, yearly_medians in los.iterrows()
]
los = los.assign(pvalues=pvals)
los

In [None]:
# Legend labels: group name followed by its yearly-trend p-value from `los`
def get_p_suffix(x):
    """Return `x` with its p-value appended when `los.pvalues` holds a non-NaN one, else `x`."""
    known_pvalues = los.pvalues.dropna().to_dict()
    pval = known_pvalues.get(x)
    if pval is None:
        return x
    return f'{x} ($p={pval:.03f}$)'

data = df_individuals.assign(group=df_individuals.group.apply(get_p_suffix))


# Boxplots of hospital LOS by year, one box per group
colors_sns = ['medium blue', 'orange', 'light purple', 'light red']
sns.set_palette(sns.xkcd_palette(colors_sns))
fig, ax = plt.subplots(1, figsize=(15, 7))
# NOTE(review): hue_order reorders the groups by their position of first
# appearance in `data.group`, so it depends on row order - confirm it is stable
sns.boxplot(
    data=data,
    x='date',
    y='los',
    hue='group',
    hue_order=data.group.unique()[[2,3,0,1]],
    showfliers=False,
    ax=ax,
)

ax.set(
    title='Length of hospital stay in 4 groups by years',
    ylabel='Length of hospital stay, days',
    xlabel='',
)
ax.minorticks_on()
ax.grid(linestyle='dotted', which='both', axis='y')

plt.tight_layout()
plt.savefig('pictures/los_years.pdf', bbox_inches="tight", dpi=600)

# ICU LOS

In [None]:
# Descriptive statistics of ICU LOS per group (statistics as rows, groups as columns)
df_admissions.groupby('group')[['day_in_icu_max']].describe().T

In [None]:
# One-way ANOVA of ICU LOS across groups (normal distribution assumption)
lm = smf.ols('day_in_icu_max ~ group', data=df_admissions).fit()
anova = sm.stats.anova_lm(lm)
print(anova)

# Kruskal test (non-parametric): one array of ICU LOS values per group
data = [
    df_admissions.loc[member_ids, 'day_in_icu_max'].values
    for member_ids in df_admissions.groupby('group').groups.values()
]
H, p = stats.kruskal(*data)
print('\nKruskal test p-value: ', p)

# Pairwise comparison of groups (non-parametric Conover test, Holm adjustment)
sph.posthoc_conover(
    df_admissions,
    val_col='day_in_icu_max',
    group_col='group',
    p_adjust='holm',
)

### Dynamics of ICU LOS by groups and years

In [None]:
# Median ICU LOS per group (rows) for each year 2011-2020 (columns 0..9)
medians = {
    group: [
        df_admissions.loc[(df_admissions.group == group) & (df_admissions.date == year), 'day_in_icu_max'].median()
        for year in range(2011, 2021)
    ]
    for group in df_admissions.group.unique()
}
losicu = pd.DataFrame.from_dict(medians).T

# Significance of the yearly trend per group: linregress receives the yearly
# medians as x and the year position 0..9 as y; only the p-value is kept
pvals = [
    linregress(yearly_medians, np.arange(len(yearly_medians))).pvalue
    for _, yearly_medians in losicu.iterrows()
]
losicu = losicu.assign(pvalues=pvals)
losicu

In [None]:
# Legend labels: group name followed by its yearly-trend p-value from `losicu`
def get_p_suffix(x):
    """Return `x` with its p-value appended when `losicu.pvalues` holds a non-NaN one, else `x`."""
    known_pvalues = losicu.pvalues.dropna().to_dict()
    pval = known_pvalues.get(x)
    if pval is None:
        return x
    return f'{x} ($p={pval:.03f}$)'

data = df_admissions.assign(group=df_admissions.group.apply(get_p_suffix))

# Boxplots of ICU LOS by year, one box per group
colors_sns = ['medium blue', 'orange', 'light purple', 'light red']
sns.set_palette(sns.xkcd_palette(colors_sns))
fig, ax = plt.subplots(1, figsize=(15, 7))
# NOTE(review): hue_order reorders the groups by their position of first
# appearance in `data.group`, so it depends on row order - confirm it is stable
sns.boxplot(
    data=data,
    x='date',
    y='day_in_icu_max',
    hue='group',
    hue_order=data.group.unique()[[2,3,0,1]],
    showfliers=False,
    ax=ax,
)
ax.set(
    title='Length of ICU stay in 4 groups by years',
    ylabel='Length of ICU stay, days',
    xlabel='',
)
ax.minorticks_on()
ax.grid(linestyle='dotted', which='both', axis='y')

plt.tight_layout()
plt.savefig('pictures/los_icu_years.pdf', bbox_inches="tight", dpi=600)

### Plot hospital LOS, ICU LOS and mortality by groups

In [None]:
# Palette, panel layout, group order and the pairs of groups to annotate
colors_sns = ['medium blue', 'orange', 'light purple', 'light red']
sns.set_palette(sns.xkcd_palette(colors_sns))

fig, [ax, ax1, ax2] = plt.subplots(1,3, figsize=(17.5, 7))

order = ['VA-HARTI', 'NVA-HARTI', 'Other HAI', 'No HAI']
boxpairs=[('VA-HARTI', 'NVA-HARTI'), ('VA-HARTI', 'Other HAI'), ('VA-HARTI', 'No HAI'),
          ('NVA-HARTI', 'No HAI'), ('NVA-HARTI', 'Other HAI')]

# --- Panel 1: hospital LOS (per individual) ---
sns.boxplot(data=df_individuals, x='group', y='los', order=order, showfliers=False, ax=ax)
# Annotate with the Holm-adjusted Conover p-value of each pair
pvals_los_all = sph.posthoc_conover(df_individuals, val_col='los', group_col='group', p_adjust ='holm')
pvalues_los = [pvals_los_all.loc[pair] for pair in boxpairs]

add_stat_annotation(ax=ax, data=df_individuals, x='group', y='los', order=order, box_pairs=boxpairs,
                    perform_stat_test=False, pvalues=pvalues_los,
                    test=None, text_format='star',
                    loc='outside', verbose=0, text_offset=1)
ax.minorticks_on()
ax.grid(linestyle='dotted', which='both', axis='y')
ax.set_xlabel('')
ax.set_xticklabels(order)
ax.set_ylabel('Length of hospital stay, days')

# --- Panel 2: ICU LOS (per admission) ---
sns.boxplot(data=df_admissions, x='group', y='day_in_icu_max', order=order, showfliers=False, ax=ax1)
# Annotate with the Holm-adjusted Conover p-value of each pair
pvals_iculos_all = sph.posthoc_conover(df_admissions, val_col='day_in_icu_max', group_col='group', p_adjust ='holm')
pvalues_iculos = [pvals_iculos_all.loc[pair] for pair in boxpairs]

add_stat_annotation(ax=ax1, data=df_admissions, x='group', y='day_in_icu_max', order=order, box_pairs=boxpairs,
                    perform_stat_test=False, pvalues=pvalues_iculos,
                    test=None, text_format='star',
                    loc='outside', verbose=0, text_offset=1)
ax1.minorticks_on()
ax1.grid(linestyle='dotted', which='both', axis='y')
ax1.set_xlabel('')
ax1.set_ylabel('Length of ICU stay, days')

# --- Panel 3: mortality rate (per individual) ---
sns.pointplot(data=df_individuals, x='group', y="outcome_death", order=order,
              join=False, capsize=.2, ax=ax2)
# NOTE(review): the annotated p-value below is a hard-coded literal, not a value
# computed in this cell - confirm it matches the chi-square post-hoc result
add_stat_annotation(ax=ax2, data=df_individuals, x='group', y='outcome_death', order=order,
                    box_pairs=[('No HAI', 'VA-HARTI')],
                    perform_stat_test=False,
                    pvalues= [0.000001],
                    test=None, text_format='star',
                    line_offset_to_box=1.6,
                    loc='outside',
                    verbose=0, text_offset=2
                   )
ax2.minorticks_on()
ax2.grid(linestyle='dotted', which='both', axis='y')
ax2.set_xlabel('')
ax2.set_xticklabels(order)
ax2.set_ylabel('Crude in-hospital mortality')
plt.tight_layout()
plt.savefig('./pictures/outcomes_all.pdf', dpi=600)

# Mortality

In [None]:
# Overall crude mortality: the mean of outcome_death, plus the interval
# returned by ci(deaths, individuals)
n_deaths_all = df_individuals.outcome_death.sum()
n_individuals_all = len(df_individuals)
print('All patients mortality rate: ', df_individuals.outcome_death.mean())
cil, cir = ci(n_deaths_all, n_individuals_all)
print("All patients mortality 95% CI: ", cil, cir)

In [None]:
# Plot proportion dead with 95% CI
plt.rcParams.update({'ytick.right': False, 'ytick.labelright': False})
fig, ax = plt.subplots(1, figsize=(7,7))
sns.pointplot(data=df_individuals, x='date', y="outcome_death", ax=ax,
              markers='o', linestyles='dotted', join=True,
              scale=1, errwidth=1.7, capsize=.03)

# Crude mortality per year 2011-2020 and the p-value of its linear trend
# (linregress receives the yearly rates as x and the year position as y)
m = [
    df_individuals.loc[df_individuals.date == year, 'outcome_death'].mean()
    for year in range(2011, 2021)
]
pval = linregress(m, np.arange(len(m))).pvalue
ax.text(0,0.03, 'p-value = '+ "%.4f" % pval, fontsize=14)
ax.legend(['Mortality'], fontsize=14)

# Axes cosmetics
ax.minorticks_on()
ax.grid(linestyle='dotted', which='both', axis='y')
ax.tick_params(axis='y', which='both', right=False, left=True)
ax.set_title('Mortality by years, full study population')
ax.set_ylabel('Crude in-hospital mortality', fontsize=12)
ax.set_xlabel('')
ax.set_ylim(0,0.28)
print(m)

plt.tight_layout()
plt.savefig('./pictures/outcome_mortality_summary.pdf', dpi=600)

In [None]:
# Mortality per group: number of deaths ('n'), death rate ('mortality') and the
# interval bounds returned by ci() ('cil', 'cir'); groups end up as columns

mortality_by_group = {}
for group in df_individuals.group.unique():
    deaths = df_individuals.loc[df_individuals.group == group, 'outcome_death']
    cil, cir = ci(deaths.sum(), len(deaths))
    mortality_by_group[group] = {
        'n': deaths.sum(),
        'mortality': deaths.mean(),
        'cil': cil,
        'cir': cir,
    }
mortality = pd.DataFrame.from_dict(mortality_by_group)
mortality

In [None]:
# test difference in groups
# Move ID from the index into a column only once. The previous in-place
# reset_index was not idempotent: every re-run of the cell inserted another
# index column and eventually raised.
if 'ID' not in df_individuals.columns:
    df_individuals = df_individuals.reset_index(level=0)
# Contingency table: group (rows) x outcome_death (columns), one entry per ID
contigency= pd.crosstab(df_individuals[['ID', 'group']].groupby('ID').max()['group'],
                            df_individuals[['ID', 'outcome_death']].groupby('ID').max()['outcome_death'])

# Compare mortality in groups by chi-sq test. Pairwise comparison
chisq_and_posthoc_corrected(contigency)

### Dynamics of mortality by groups and years

In [None]:
 # Calculate numbers for mortality by years
# Despite its name, `medians` holds the yearly death rate (deaths / individuals)
# of each group for 2011-2020
medians = {}
for group in df_individuals.group.unique():
    in_group = df_individuals[df_individuals.group == group]
    yearly_rates = []
    for year in range(2011, 2021):
        cohort = in_group[in_group.date == year]
        yearly_rates.append(cohort.outcome_death.sum() / len(cohort))
    medians[group] = yearly_rates
mortality_years = pd.DataFrame.from_dict(medians).T

# Significance of the yearly trend per group: linregress receives the yearly
# rates as x and the year position 0..9 as y; only the p-value is kept
pvals = [
    linregress(yearly_rates, np.arange(len(yearly_rates))).pvalue
    for _, yearly_rates in mortality_years.iterrows()
]
mortality_years = mortality_years.assign(pvalues=pvals)
mortality_years

In [None]:
# Define data; add p-value to legend items
def get_p_suffix(x, g_dict=None):
    """Return `x` with its p-value appended when `mortality_years.pvalues` holds a non-NaN one, else `x`.

    `g_dict` is accepted but not used.
    """
    known_pvalues = mortality_years.pvalues.dropna().to_dict()
    pval = known_pvalues.get(x)
    if pval is None:
        return x
    return f'{x} ($p={pval:.03f}$)'


# If the table is not indexed by group names, translate its index through the
# inverse of `groups_dict` (defined outside this notebook section)
if 'No HAI' not in mortality_years.index:
    inverse_groups = dict((v, k) for k, v in groups_dict.items())
    mortality_years.index = mortality_years.index.map(inverse_groups)

data = df_individuals.assign(group=df_individuals.group.apply(get_p_suffix))

# Plot proportion dead with 95% CI
fig, ax = plt.subplots(1, figsize=(15,7))
# NOTE(review): hue_order reorders the groups by their position of first
# appearance in `data.group`, so it depends on row order - confirm it is stable
sns.pointplot(
    data=data,
    x='date',
    y="outcome_death",
    hue='group',
    hue_order=data.group.unique()[[2,3,0,1]],
    join=False,
    dodge=0.3,
    scale=1.3,
    errwidth=1.7,
    capsize=.03,
    ax=ax,
)

ax.legend(fontsize=14)
ax.minorticks_on()
ax.grid(linestyle='dotted', which='both', axis='y')
ax.set_xlabel('')
ax.set_ylabel('Crude in-hospital mortality')

plt.tight_layout()
plt.savefig('./pictures/outcome_mortality.pdf', dpi=600)

________