In [None]:
# Load libraries and functions
%load_ext autoreload
%autoreload 2
%matplotlib inline
RANDOM_STATE = 42  # Pseudo-random state (NOTE(review): not referenced in the visible cells, which pass random_state=0 explicitly)

# Star import: presumably supplies the shared names the cells below use without an
# explicit import (e.g. sns, plt, pd, np, glob, os, pickle, json, sm, stats) -- TODO confirm against utils.py
from utils import *
sns.set_palette("tab10") # Default seaborn theme

# Extra libraries for this notebook
import cmprsk
from cmprsk import utils
from cmprsk.cmprsk import cuminc
import scikit_posthocs as sph

In [None]:
# Load the most recently created "Updated*.pkl" snapshot from the working directory
fn_vae_data = glob.glob('./Updated*.pkl')
if not fn_vae_data:
    # max() on an empty list raises an opaque ValueError; fail with an actionable message instead
    raise FileNotFoundError("No dataset matching './Updated*.pkl' was found in the working directory")
latest_fn_vae_data = max(fn_vae_data, key=os.path.getctime)

print("Loading... ",latest_fn_vae_data)
# NOTE: unpickling executes code embedded in the file; only load snapshots from a trusted source
with open(latest_fn_vae_data, "rb") as f:
    vae_data_main = pickle.load(f)
print("Done")

# Time of death

In [None]:
# One row per patient (per-ID maxima), restricted to patients with outcome_death == 1
T = vae_data_main[['ID', 'outcome_death', 'group', 'los']].groupby('ID').max()
T = T[T.outcome_death == 1]

# Plot death time distribution.
# sns.displot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that call only left an extra, empty figure behind).
seaborn_grid = sns.displot(T, x="los", hue="group", element="step")
seaborn_grid.fig.set_figwidth(7)
# The pyplot calls below act on the current figure, i.e. the one displot just created
plt.grid(linestyle='dotted')
plt.ylabel("# of deaths")
plt.xlabel('Day in the ICU')
plt.tight_layout()
plt.savefig('./pictures/deathtime_hist.pdf', dpi=600)

In [None]:
# Per-group summary statistics of `los` for the rows currently held in T
# (the "50%" row of the transposed table is the median)
los_and_group = T.loc[:, ['los', 'group']]
los_and_group.groupby('group').describe().transpose()

# Addressing immortal-time bias with a time-varying Cox model

### Time-varying Cox model for NVA-HARTI

In [None]:
# Baseline table: per-patient maxima of follow-up, death flag and fixed covariates,
# limited to the groups compared in this model and renamed to the generic
# id / duration / event columns before conversion to long (start, stop] format
baseline_per_patient = vae_data_main[[
    'ID', 'day_in_icu_max', 'outcome_death',
    'age', 'gender_M', 'disease_type_trauma', 'disease_type_tumor',
    'disease_type_vascular', 'charlson', 'group',
]].groupby('ID').max()
base_df = (
    baseline_per_patient[baseline_per_patient.group.isin(['No HAI', 'NVA-HARTI', 'Dual HARTI'])]
    .drop(columns='group')
    .reset_index()
    .rename(columns={'ID': 'id', 'day_in_icu_max': 'duration', 'outcome_death': 'event'})
)
base_df = to_long_format(base_df, duration_col="duration")

# Event table: one row per infected patient holding the per-patient maximum of
# day_in_icu_bid (presumably the day the infection was recorded -- TODO confirm)
harti_per_patient = vae_data_main[['ID', 'day_in_icu_bid', 'group']].groupby('ID').max()
event_df = (
    harti_per_patient[harti_per_patient.group.isin(['NVA-HARTI', 'Dual HARTI'])]
    .drop(columns='group')
    .reset_index()
    .rename(columns={'ID': 'id', 'day_in_icu_bid': 'nva-harti'})
)

# Turn the event table into a time-varying covariate
cv = covariates_from_event_matrix(event_df, id_col="id")

# Merge the covariate into the baseline timeline; rows without it get 0
base_df = add_covariate_to_timeline(
    base_df, cv, duration_col="duration", id_col="id", event_col="event"
).fillna(0)

In [None]:
# Fit time varying Cox model (penalizer=0.1) on the long-format timeline in base_df,
# then print the coefficient summary and plot the estimates
ctv = CoxTimeVaryingFitter(penalizer=0.1).fit(
    base_df,
    id_col="id",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=True,
)
ctv.print_summary()
ctv.plot()

### Time-varying Cox model for VA-HARTI

In [None]:
# Baseline table: per-patient maxima of follow-up, death flag and fixed covariates,
# limited to the groups compared in this model and renamed to the generic
# id / duration / event columns before conversion to long (start, stop] format
baseline_per_patient = vae_data_main[[
    'ID', 'day_in_icu_max', 'outcome_death',
    'age', 'gender_M', 'disease_type_trauma', 'disease_type_tumor',
    'disease_type_vascular', 'charlson', 'group',
]].groupby('ID').max()
base_df = (
    baseline_per_patient[baseline_per_patient.group.isin(['No HAI', 'VA-HARTI', 'Dual HARTI'])]
    .drop(columns='group')
    .reset_index()
    .rename(columns={'ID': 'id', 'day_in_icu_max': 'duration', 'outcome_death': 'event'})
)
base_df = to_long_format(base_df, duration_col="duration")

# Event table: one row per infected patient holding the per-patient maximum of
# day_in_icu_bid (presumably the day the infection was recorded -- TODO confirm)
harti_per_patient = vae_data_main[['ID', 'day_in_icu_bid', 'group']].groupby('ID').max()
event_df = (
    harti_per_patient[harti_per_patient.group.isin(['VA-HARTI', 'Dual HARTI'])]
    .drop(columns='group')
    .reset_index()
    .rename(columns={'ID': 'id', 'day_in_icu_bid': 'vap'})
)

# Turn the event table into a time-varying covariate
cv = covariates_from_event_matrix(event_df, id_col="id")

# Merge the covariate into the baseline timeline; rows without it get 0
base_df = add_covariate_to_timeline(
    base_df, cv, duration_col="duration", id_col="id", event_col="event"
).fillna(0)

In [None]:
# Fit time varying Cox model (penalizer=0.1) on the long-format timeline in base_df,
# then print the coefficient summary and plot the estimates
ctv = CoxTimeVaryingFitter(penalizer=0.1).fit(
    base_df,
    id_col="id",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=True,
)
ctv.print_summary()
ctv.plot()

# Survival plot. Competing risk model

In [None]:
# Select outcome data for ICU admissions and individuals
# Group attribution is selected by hierarchy
# Aggregations are named by string ('max'/'min'): passing the Python builtins to
# groupby.agg is deprecated in recent pandas and is slated to change behaviour.
df_admissions = vae_data_main[['los', 'day_in_icu_max', 'ID_subid', 'ID', 'outcome_death', 'date', 'group']]
df_admissions = df_admissions.groupby('ID_subid').agg({'los': 'max', 'day_in_icu_max': 'max', 'group': 'max',
                                                       'date': 'min', 'ID': 'max', 'outcome_death': 'max'})
# Keep only the calendar year of the earliest date of each admission
df_admissions['date'] = df_admissions['date'].dt.year

# Collapse admissions to one row per patient (groupby returns a new frame, no copy needed)
df_individuals = df_admissions.groupby('ID').agg({'los': 'max', 'day_in_icu_max': 'max', 'group': 'max',
                                                  'date': 'min', 'outcome_death': 'max'})

#Drop Dual HARTI data - not included in the analysis due to small sample size
df_admissions = df_admissions.loc[df_admissions.group != "Dual HARTI"]
df_individuals = df_individuals.loc[df_individuals.group != "Dual HARTI"]

# Select IDs for all groups: {group label -> list of patient IDs}
group_ids = {
    group: df_individuals[df_individuals.group == group].index.unique().tolist()
    for group in df_individuals.group.unique()
}


In [None]:
# Define data: per-patient `los` (time) and outcome_death (status)
LIM=228
T = vae_data_main[['ID', 'los']].groupby('ID').max()
E = vae_data_main[['ID', 'outcome_death']].groupby('ID').max()
T_vap = T.loc[T.index.isin(group_ids['VA-HARTI'])]
T_nva = T.loc[T.index.isin(group_ids['NVA-HARTI'])]
T_nohai = T.loc[T.index.isin(group_ids['No HAI'])]
# Recode status 0 -> 2 so that it reaches cuminc as a second event type
# (presumably discharge, per the plot title) instead of as 0.
# NOTE: E keeps this 1/2 coding after the cell has run.
E = E.replace(0, 2)
E_vap = E.loc[E.index.isin(group_ids['VA-HARTI'])]
E_nva = E.loc[E.index.isin(group_ids['NVA-HARTI'])]
E_nohai = E.loc[E.index.isin(group_ids['No HAI'])]

# Plot competing event function
colors_sns = ['medium blue', 'orange', 'light red']
sns.set_palette(sns.xkcd_palette(colors_sns))
fig, ax = plt.subplots(1, 1, figsize=(7,6))

# One curve per patient group. Each line is labelled when it is drawn, so the legend
# no longer depends on the order in which artists were added to the axes.
cuminc_results = {}
for curve_label, durations, statuses in (('VA-HARTI', T_vap, E_vap),
                                         ('NVA-HARTI', T_nva, E_nva),
                                         ('No HAI', T_nohai, E_nohai)):
    cuminc_res = cuminc(durations.values.reshape(-1), statuses.values.reshape(-1).astype('float64'))
    cuminc_results[curve_label] = cuminc_res
    # Only the first entry returned by cuminc is drawn (presumably event type 1 -- TODO confirm);
    # 1 - estimate turns the cumulative incidence into a "proportion alive" curve
    for name, curve in list(cuminc_res.groups.items())[:1]:
        ax.plot(curve.time, 1-curve.est, label=curve_label)
        ax.fill_between(curve.time, 1-curve.low_ci, 1-curve.high_ci, alpha=0.2)

# Keep the per-group result names available to later cells
cuminc_res_vap = cuminc_results['VA-HARTI']
cuminc_res_nva = cuminc_results['NVA-HARTI']
cuminc_res_nohai = cuminc_results['No HAI']

# Only the two limits are passed: the former third positional argument landed on
# `emit`, which current matplotlib accepts as a keyword only.
ax.set_xlim(0, LIM)
ax.grid(linestyle='dotted', which='both', axis='both')
ax.minorticks_on()
ax.legend()
ax.set_ylabel('Proportion alive')
ax.set_xlabel('Day in the hospital')
ax.set_title('Survival probability, discharge is treated as a competing event')
plt.tight_layout()
plt.savefig('./pictures/survival_cmprsk_va-nohai.pdf', dpi=600)

In [None]:
# Calculate survival difference at fixed point in time
tpoint = 52

# Rebuild duration and death indicator from the raw data instead of reusing the
# notebook-level T / E: the competing-risk cell recodes E from 0/1 to 2/1, and a
# status of 2 would presumably be read as an observed event here, counting every
# discharge as a death.
los_by_id = vae_data_main[['ID', 'los']].groupby('ID').max()
death_by_id = vae_data_main[['ID', 'outcome_death']].groupby('ID').max()
# Both frames come from the same groupby key, so one mask per group serves both
in_nohai = los_by_id.index.isin(group_ids['No HAI'])
in_vap = los_by_id.index.isin(group_ids['VA-HARTI'])

# NOTE(review): this raw-durations call signature matches older lifelines releases;
# newer ones appear to expect fitted estimators -- confirm against the installed version.
results = survival_difference_at_fixed_point_in_time_test(tpoint, los_by_id.loc[in_nohai],
                                                          los_by_id.loc[in_vap],
                                                          event_observed_A=death_by_id.loc[in_nohai],
                                                          event_observed_B=death_by_id.loc[in_vap])
results.print_summary()

# Logistic regression for mortality
### VA-HARTI & Mortality

In [None]:
##### Select all related factors by data type
# Each list is assembled from per-category pieces; the resulting order is the same
# as before and is kept because later cells iterate over these lists in order.

# Numerical factors
FACTORS_numeric = (
    # condition on admission
    ['age', 'charlson']
    # surgeries
    + ['st_all_sum', 'st_craniotomy_len_sum', 'st_device_len_sum', 'st_endonasal_len_sum',
       'st_endovascular_len_sum', 'st_other_len_sum', 'st_spinal_len_sum', 'st_all_len_sum']
    # severity of patients condition
    + ['gcs']
    # ICU care
    + ['mech_vent_days', 'antibiotics_total_binary_days', 'central_line_days',
       'feeding_tube_days', 'arterial_line_days', 'evd_days', 'icpm_days',
       'urinary_catheter_days', 'hypothermia_days', 'hemodialysis_days',
       'total_parenteral_feeding_days', 'sedation_days', 'anxiolytics_days',
       'vasopressors_days', 'endotracheal_tube_1_days', 'endotracheal_tube_2_days',
       'endotracheal_tube_3_days']
    # complications
    + ['intestinal_dysfunction_days', 'infection_bloodstream_days', 'infection_other_days',
       'infection_urinary_days', 'infection_cns_days', 'infection_ssi_days']
)

# Binary factors
FACTORS_binary = (
    # condition on admission
    ['gender_M', 'disease_type_trauma', 'disease_type_tumor', 'disease_type_vascular',
     'disease_type_other']
    # severity of patients condition
    + ['convulsions', 'aphasia', 'vegetative_state']
    # surgeries
    + ['st_device_count', 'st_other_count', 'st_craniotomy_count', 'st_endovascular_count',
       'st_endonasal_count', 'st_spinal_count']
    + ['csfl_ne', 'csfl_ss']
)


In [None]:
# Replace NaNs in len_ cols by 0
len_cols = ['st_craniotomy_len_sum', 'st_device_len_sum', 'st_endonasal_len_sum',
            'st_endovascular_len_sum', 'st_other_len_sum', 'st_spinal_len_sum',
            'st_all_len_sum', 'st_craniotomy_len_sum_bid', 'st_device_len_sum_bid',
            'st_endonasal_len_sum_bid', 'st_endovascular_len_sum_bid', 'st_other_len_sum_bid',
            'st_spinal_len_sum_bid', 'st_all_len_sum_bid'
           ]

# Assign the result back explicitly. The previous per-column
# `vae_data_main[col].replace(..., inplace=True)` is a chained in-place update on a
# column selection, which does not write through to the frame under pandas Copy-on-Write.
vae_data_main[len_cols] = vae_data_main[len_cols].replace(np.nan, 0)

In [None]:
######## Step #1: Univariate analysis for VA-HARTI

# Per-admission maxima, split on the row-level `vap` flag
data_vap = vae_data_main[vae_data_main.vap == 1].groupby('ID_subid').max()
data_novap = vae_data_main[vae_data_main.vap != 1].groupby('ID_subid').max()

# Continuous factors: two-sample t-test with unequal variances on the non-missing values
numeric_pvals = {
    factor: stats.ttest_ind(data_vap[factor].dropna().values,
                            data_novap[factor].dropna().values,
                            equal_var=False).pvalue
    for factor in FACTORS_numeric
}

# Binary factors: contingency table of the per-admission `vap` flag against
# "factor > 0"; element [1] of chi(...) is taken as the p-value
vap_by_admission = vae_data_main[['ID_subid', 'vap']].groupby('ID_subid').max()['vap']
binary_pvals = {}
for factor in FACTORS_binary:
    factor_present = vae_data_main[['ID_subid', factor]].groupby('ID_subid').max()[factor] > 0
    binary_pvals[factor] = chi(pd.crosstab(vap_by_admission, factor_present))[1]

pvalues = pd.concat([pd.DataFrame({'pvalue': numeric_pvals}),
                     pd.DataFrame({'pvalue': binary_pvals})], axis=0)

# Adjust p-values for multiple comparison (NaN p-values are left out of the
# correction and stay NaN), then round to 5 decimals
valid_pvalues = pvalues.pvalue.dropna()
pvalues['adjusted_pvalue'] = pd.Series(multipletests(valid_pvalues.values)[1],
                                       index=valid_pvalues.index).round(5)
print("Number of factors, step #1: ", len(pvalues))

In [None]:
# Step #2: Run first logreg model to find VAP non-predictors
# Include only columns that showed significance in univariate analysis

significant_factors = pvalues.index[pvalues.adjusted_pvalue < 0.05]
COLS = [*significant_factors, 'ID_subid']
print("Number of factors included in first logreg model: ", len(COLS))

# Define data: per-admission maxima of the selected factors and of the `vap` flag
logreg_data = vae_data_main[COLS].groupby('ID_subid').max()
y = vae_data_main[['ID_subid', 'vap']].groupby('ID_subid').max().astype(float)
X = sm.add_constant(logreg_data.astype(float))

# Test predictors for VA-HARTI
logit_model = sm.Logit(y, X)
result = logit_model.fit_regularized()
p = result.summary2().tables[1]['P>|z|']
# Keep the terms with p >= 0.05, plus the exposure flag and the grouping key
# needed by the next step
vap_non_predictors = [*p[p >= 0.05].index, 'vap', 'ID_subid']
print("Number of VAP non-predictors: ", len(vap_non_predictors))

In [None]:
###### Step #3: use VAP non-predictors to fit logreg to select independent risk factors for mortality

# Define x and y
# 'const' is a model term, not a data column, so it is removed before selecting columns
if 'const' in vap_non_predictors:
    vap_non_predictors.remove('const')
logreg_data = vae_data_main[vap_non_predictors].groupby('ID_subid').max()
y = vae_data_main[['ID_subid', 'outcome_death']].groupby('ID_subid').max()
X = sm.add_constant(logreg_data)

# fit model
logit_model = sm.Logit(y.astype(float), X.astype(float))
result = logit_model.fit_regularized()
print(result.summary2())

In [None]:
# Odds ratios: exponentiate the coefficients and their 95% CI bounds; p-values are
# formatted as 4-decimal strings
odds_vap = result.summary2().tables[1][['Coef.', '[0.025', '0.975]', 'P>|z|']].apply(
    {'Coef.': np.exp,
     '[0.025': np.exp,
     '0.975]': np.exp,
     'P>|z|': lambda x: f'{x:.04f}'})
odds_vap.columns = ('Odds ratio', 'Lower', 'Upper', 'p_value')

# Match column names from json dict
# The context manager closes the file (the bare open() never did).
with open("./columns_dict.json", encoding="utf-8") as columns_file:
    columns_dict = json.load(columns_file)
# Fall back to the original name when the dictionary has no entry, so such rows
# keep an identifiable label instead of a missing one
odds_vap.index = odds_vap.index.to_series().map(lambda name: columns_dict.get(name, name))

# Save table
odds_vap.to_csv('./odds_vap.csv', sep='\t', encoding='utf-8')

odds_vap

In [None]:
# Show only significant covariates (p_value is stored as text, hence the float cast)
is_significant = odds_vap['p_value'].astype(float).le(0.05)
odds_vap[is_significant]

### Test performance of the mortality logreg model; VA-HARTI

In [None]:
# Split into train and test subsets
x_train, x_test, y_train, y_test = \
    train_test_split(logreg_data,
                     y, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows do not leak into the model, then apply it to both subsets
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Fit logreg model
model = LogisticRegression(solver='lbfgs', C=10, random_state=0, class_weight='balanced')
model.fit(x_train, y_train.values.reshape(-1))

print("Accuracy on train set; mortality model: ", model.score(x_train, y_train))
print("Accuracy of test set; mortality model: ", model.score(x_test, y_test))
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)
print("F1 score; VAP~mortality model: ", f1_score(y_test, y_pred))
# ROC-AUC is a ranking metric: score it with the predicted probability of the
# positive class (second column) rather than with the thresholded labels
print('ROC-AUC score, VAP~mortality model: ', roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
# Show full report
print(classification_report(y_test, y_pred))

# Print coefficients as {feature name: fitted coefficient}
thetas = dict(zip(logreg_data.columns, model.coef_.ravel()))
thetas

### NVA-HARTI & Mortality

In [None]:
######## Step #1: Univariate analysis for NVA-HARTI

# Define data: per-admission maxima, split on the row-level `non_vap_resp_hai` flag
data_nva = vae_data_main[(vae_data_main.non_vap_resp_hai == 1)].groupby('ID_subid').max()
data_nonva = vae_data_main[(vae_data_main.non_vap_resp_hai != 1)].groupby('ID_subid').max()

# Calculate p-values for continuous and binary variables
# Missing values are dropped before the t-test, as in the VA-HARTI analysis:
# otherwise one NaN makes the p-value NaN and the factor silently falls out of
# the later selection.
pvals = {}
for factor in FACTORS_numeric:
    pvals[factor] = stats.ttest_ind(data_nva[factor].dropna().values, data_nonva[factor].dropna().values,
                                    equal_var = False).pvalue
pvalues = pd.DataFrame({'pvalue': pvals})

# The per-admission exposure flag does not depend on the factor: compute it once
nva_by_admission = vae_data_main[['ID_subid', 'non_vap_resp_hai']].groupby('ID_subid').max()['non_vap_resp_hai']
pvals = {}
for factor in FACTORS_binary:
    contigency = pd.crosstab(nva_by_admission,
                             vae_data_main[['ID_subid', factor]].groupby('ID_subid').max()[factor]>0)
    pvals[factor] = chi(contigency)[1]

pvalues = pd.concat([pvalues, pd.DataFrame({'pvalue': pvals})], axis=0)

# Adjust p-values for multiple comparison
pvalues['adjusted_pvalue'] = pd.Series(multipletests(pvalues.pvalue.dropna().values)[1],
                                       index=pvalues.pvalue.dropna().index)
pvalues['adjusted_pvalue'] = pvalues['adjusted_pvalue'].round(5)
print("Number of factors, step #1: ", len(pvalues))


In [None]:
# Step #2: Run first logreg model to find NVA-HARTI non-predictors
# Include only columns that showed significance in univariate analysis

COLS = pvalues[(pvalues.adjusted_pvalue < 0.05)].index
COLS = COLS.tolist() + ['ID_subid']
print("Number of factors included in first logreg model: ", len(COLS))

# Define data
logreg_data = vae_data_main[COLS].groupby('ID_subid').max()
# The outcome of this model is the NVA-HARTI flag. The cell previously regressed
# on `vap` (copied from the VA-HARTI section), which selected non-predictors of
# the wrong infection type.
y = vae_data_main[['ID_subid', 'non_vap_resp_hai']].groupby('ID_subid').max().astype(float)
X = sm.add_constant(logreg_data.astype(float))

# Test predictors for NVA-HARTI
logit_model = sm.Logit(y, X)
result = logit_model.fit_regularized()
p = result.summary2().tables[1]['P>|z|']
# Keep the terms with p >= 0.05, plus the exposure flag and the grouping key
# needed by the next step
nva_non_predictors = p[p >= 0.05].index.tolist() + ['non_vap_resp_hai'] + ['ID_subid']
print("Number of NVA non-predictors: ", len(nva_non_predictors))

In [None]:
#### Step #3: use VAP non-predictors to fit logreg to select independent risk factors for mortality

# Define x and y
# 'const' is a model term, not a data column, so it is removed before selecting columns
if 'const' in nva_non_predictors:
    nva_non_predictors.remove('const')
logreg_data = vae_data_main[nva_non_predictors].groupby('ID_subid').max()
y = vae_data_main[['ID_subid', 'outcome_death']].groupby('ID_subid').max()
X = sm.add_constant(logreg_data)

# fit model
logit_model = sm.Logit(y.astype(float), X.astype(float))
result = logit_model.fit_regularized()
print(result.summary2())

In [None]:
# Odds ratios: exponentiate the coefficients and their 95% CI bounds; p-values are
# formatted as 4-decimal strings
odds_nva = result.summary2().tables[1][['Coef.', '[0.025', '0.975]', 'P>|z|']].apply(
    {'Coef.': np.exp,
     '[0.025': np.exp,
     '0.975]': np.exp,
     'P>|z|': lambda x: f'{x:.04f}'})
odds_nva.columns = ('Odds ratio', 'Lower', 'Upper', 'p_value')

# Match column names from json dict
# The context manager closes the file (the bare open() never did).
with open("./columns_dict.json", encoding="utf-8") as columns_file:
    columns_dict = json.load(columns_file)
# Fall back to the original name when the dictionary has no entry, so such rows
# keep an identifiable label instead of a missing one
odds_nva.index = odds_nva.index.to_series().map(lambda name: columns_dict.get(name, name))

# Save table
odds_nva.to_csv('./odds_nva.csv', sep='\t', encoding='utf-8')

odds_nva

In [None]:
# Show only significant covariates (p_value is stored as text, hence the float cast)
is_significant = odds_nva['p_value'].astype(float).le(0.05)
odds_nva[is_significant]

### Test performance of the mortality logreg model; NVA-HARTI

In [None]:
# Split into train and test subsets
x_train, x_test, y_train, y_test = \
    train_test_split(logreg_data,
                     y, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows do not leak into the model, then apply it to both subsets
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Fit logreg model
model = LogisticRegression(solver='lbfgs', C=10, random_state=0, class_weight='balanced')
model.fit(x_train, y_train.values.reshape(-1))

print("Accuracy on train set; mortality model: ", model.score(x_train, y_train))
print("Accuracy of test set; mortality model: ", model.score(x_test, y_test))
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)
print("F1 score; NVA~mortality model: ", f1_score(y_test, y_pred))
# ROC-AUC is a ranking metric: score it with the predicted probability of the
# positive class (second column) rather than with the thresholded labels
print('ROC-AUC score, NVA~mortality model: ', roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
# Show full report
print(classification_report(y_test, y_pred))

# Print coefficients as {feature name: fitted coefficient}
thetas = dict(zip(logreg_data.columns, model.coef_.ravel()))
thetas

______

# Survival curve. Kaplan-Meier model - wrong model for these data (discharge is a competing event; see the competing risk model above)

In [None]:
###### Plot Kaplan-Meier for 3 groups (VA-HARTI vs. NVA-HARTI vs. No HAI)
# Per-patient `los` (duration) and outcome_death (event indicator)
T = vae_data_main[['ID', 'los']].groupby('ID').max()
E = vae_data_main[['ID', 'outcome_death']].groupby('ID').max()

LIM = 365
colors_sns = ['medium blue', 'orange', 'light red']
sns.set_palette(sns.xkcd_palette(colors_sns))
fig, ax = plt.subplots(1, figsize=(9, 7))

# Fit and draw one Kaplan-Meier estimator per group, in plotting order.
# T and E share one index (same groupby key), so a single mask selects both.
km_fitters = {}
for group_label in ('VA-HARTI', 'NVA-HARTI', 'No HAI'):
    in_group = T.index.isin(group_ids[group_label])
    fitter = KaplanMeierFitter()
    fitter.fit(T.loc[in_group], E.loc[in_group], timeline=range(0, LIM, 2), label=group_label)
    fitter.plot(ax=ax)
    km_fitters[group_label] = fitter

# Keep the individual fitters available under their usual names
kmf_vap = km_fitters['VA-HARTI']
kmf_nva = km_fitters['NVA-HARTI']
kmf_nohai = km_fitters['No HAI']

ax.set_title('Survival probability in VA-HARTI vs No HAI patients in 2011-2018')
ax.set_ylabel('Proportion alive')
ax.grid(linestyle='dotted')

add_at_risk_counts(kmf_vap, kmf_nva, kmf_nohai, ax=ax)
plt.tight_layout()
plt.savefig('./pictures/survival_km_va-nohai.pdf', dpi=600)

In [None]:
time_limit = 365
rmst_1 = restricted_mean_survival_time(kmf_nohai, t=time_limit, return_variance=True)
rmst_4 = restricted_mean_survival_time(kmf_vap, t=time_limit, return_variance=True)
print('RMST No HAI', rmst_1[0])
print('RMST VA-HARTI', rmst_4[0])
print('STD No HAI: ', np.sqrt(rmst_1[1]))
print('STD VA-HARTI: ', np.sqrt(rmst_4[1]))

_____