# First PaCO2 value and mortality

In [None]:
import pandas as pd
import numpy as np
import getpass
import io
import msoffcrypto
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# All inputs live under one data folder; build the three paths from that common root.
_data_root = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/dci_sah/data'
_pdms_dir = _data_root + '/pdms_data'

# Encrypted registry workbook (decrypted with a password further below).
sah_sos_data_path = _data_root + '/sos_sah_data/aSAH_DATA_2009_2023_24122023.xlsx'
# Blood-gas (BGA) export from the PDMS transfer folder.
abg_data_path = _pdms_dir + '/Transfer Urs.pietsch@kssg.ch 22.01.24, 15_34/20240116_SAH_SOS_BGA.csv'
# Table linking registry entries to PDMS patient numbers.
registry_pdms_correspondence_path = _pdms_dir + '/registry_pdms_correspondence.csv'

In [None]:
# Load the blood-gas export (semicolon-separated, '.' as decimal mark).
abg_df = pd.read_csv(abg_data_path, sep=';', decimal='.')

In [None]:
# The registry workbook is encrypted: ask for the password interactively (never stored in the
# notebook), decrypt into an in-memory buffer and read the 'DATA' sheet from that buffer.
password = getpass.getpass()
decrypted_workbook = io.BytesIO()
with open(sah_sos_data_path, 'rb') as file:
    office_file = msoffcrypto.OfficeFile(file)
    office_file.load_key(password=password)
    # decrypted bytes are written to the buffer, nothing touches disk
    office_file.decrypt(decrypted_workbook)
registry_df = pd.read_excel(decrypted_workbook, sheet_name='DATA')

In [None]:
# Registry <-> PDMS correspondence table; Date_birth is parsed so it can serve as a merge key.
registry_pdms_correspondence_df = pd.read_csv(registry_pdms_correspondence_path).assign(
    Date_birth=lambda frame: pd.to_datetime(frame['Date_birth'], format='%Y-%m-%d')
)

In [None]:
# Attach the PDMS identifiers to the registry. A row matches when registry number, name and
# birth date all agree; registry rows without a match are kept (left join).
registry_df = registry_df.merge(
    registry_pdms_correspondence_df,
    how='left',
    left_on=['SOS-CENTER-YEAR-NO.', 'Name', 'Date_birth'],
    right_on=['SOS-CENTER-YEAR-NO.', 'JoinedName', 'Date_birth'],
)

Verify that the merge was successful (expected number of unmatched patients from 2019 onward: 19).

In [None]:
# Rows from 2019 onward that did not receive a PDMS patient number in the merge.
unmatched_since_2019 = (registry_df['Year'] >= 2019) & registry_df['pNr'].isnull()
registry_df[unmatched_since_2019].shape

### Extract mortality 

restrict to 3 months after admission 

In [None]:
# Switch read by the follow-up section below: when True, follow-up times are clipped to 90 days.
cap_to_max_90d = False

In [None]:
# Summary statistics of time to death, restricted to patients linked to a PDMS record.
registry_df.loc[registry_df['pNr'].notnull(), 'Days_to_Death'].describe()

In [None]:
# 90-day mortality among patients that could be linked to a PDMS record
has_pdms_id = registry_df['pNr'].notnull()
died_within_90d = registry_df['Days_to_Death'] <= 90

n_patients = int(has_pdms_id.sum())
n_patients_dead = int((has_pdms_id & died_within_90d).sum())

print(f'Percentage dead: {n_patients_dead / n_patients * 100:.2f}%')
print(f'Number of patients: {n_patients}')
print(f'Number of patients dead: {n_patients_dead}')

### Extract first PaCO2 value for each patient

In [None]:
# Preview the raw blood-gas table.
abg_df.head()

In [None]:
# How many samples exist per sampling site; only 'arteriell' samples are used below.
abg_df['bgaOrt'].value_counts()

In [None]:
# First arterial blood gas per patient.
# Sorting once and keeping the first row per pNr replaces the former
# groupby('pNr').apply(lambda x: x.sort_values(...).iloc[0]): it is vectorised, preserves the
# column dtypes and does not depend on the grouping column being passed into apply()
# (deprecated in recent pandas), so the 'pNr' column needed for the later merges stays.
# Rows without a pNr are dropped, exactly as groupby did implicitly.
# NOTE(review): timeBGA is sorted as read from the CSV (text); this is chronological only for
# an ISO-like 'YYYY-MM-DD hh:mm:ss' layout - confirm against the export.
first_abg_df = (
    abg_df[abg_df.bgaOrt == 'arteriell']
    .dropna(subset=['pNr'])
    .sort_values(['pNr', 'timeBGA'])
    .drop_duplicates(subset='pNr', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Preview: one row (the earliest arterial sample) per patient.
first_abg_df.head()

In [None]:
# join first paco2 values to registry (patients without an arterial sample get NaN)
registry_df = registry_df.merge(first_abg_df[['pNr', 'pCO2']], on='pNr', how='left')
# A partial pressure cannot be negative. The pO2 column of the same export is cleaned this way
# further below, so treat negative pCO2 entries as missing too instead of feeding them to the models.
registry_df.loc[registry_df['pCO2'] < 0, 'pCO2'] = np.nan

### Plot first PaCO2 value and mortality

In [None]:
# Distribution of the first pCO2 value, split by the registry's Death column, with a KDE overlay.
sns.histplot(data=registry_df, x='pCO2', hue='Death', kde=True)

## Extract follow-up time

In [None]:
# Clean the follow-up date columns before they are parsed.
# 1) cells that are empty or contain only whitespace become NaN
blank_only = r'^\s*$'
for follow_up_col in ('Date_FU_1y', 'Date_2FU_2y'):
    registry_df[follow_up_col] = registry_df[follow_up_col].replace(blank_only, np.nan, regex=True)

# 2) 2-year entries containing a '/' are discarded as well
registry_df['Date_2FU_2y'] = registry_df['Date_2FU_2y'].map(
    lambda raw: np.nan if '/' in str(raw) else raw
)

In [None]:
# Timestamp layout used below to parse the 1-year and 2-year follow-up date columns.
date_format = '%Y-%m-%d %H:%M:%S'

In [None]:
# Where Days_to_Death is missing, derive it from the death and admission dates (in days).
days_admission_to_death = (
    registry_df['Date_Death'] - registry_df['Date_admission']
).dt.total_seconds() / (24 * 3600)
registry_df['Days_to_Death'] = registry_df['Days_to_Death'].fillna(days_admission_to_death)

In [None]:
# Follow-up durations, all expressed in days since admission.
seconds_per_day = 24 * 3600
admission_dates = registry_df['Date_admission']
fu_1y_dates = pd.to_datetime(registry_df['Date_FU_1y'], format=date_format)
fu_2y_dates = pd.to_datetime(registry_df['Date_2FU_2y'], format=date_format)

registry_df['time_to_discharge'] = (registry_df['Date_Discharge'] - admission_dates).dt.total_seconds() / seconds_per_day
registry_df['time_to_1y_fu'] = (fu_1y_dates - admission_dates).dt.total_seconds() / seconds_per_day
registry_df['time_to_2y_fu'] = (fu_2y_dates - admission_dates).dt.total_seconds() / seconds_per_day

# latest documented contact among discharge, 1-year and 2-year follow-up
contact_columns = ['time_to_discharge', 'time_to_1y_fu', 'time_to_2y_fu']
registry_df['max_follow_up_time'] = registry_df[contact_columns].max(axis=1)

# time to death where known, otherwise the latest documented contact
registry_df['follow_up_time'] = registry_df['Days_to_Death'].fillna(registry_df['max_follow_up_time'])


In [None]:
# cap follow up time to 90d
if cap_to_max_90d:
    # Administrative censoring at day 90: a patient whose follow-up (or death) lies beyond the
    # cap is alive at day 90, so the event indicator must be reset before the duration is
    # clipped - otherwise late deaths would be analysed as deaths occurring on day 90.
    beyond_cap = registry_df['follow_up_time'] > 90
    registry_df.loc[beyond_cap, 'Death'] = 0
    registry_df['follow_up_time'] = registry_df['follow_up_time'].clip(upper=90)

## Fit Cox Proportional Hazard Model 

## univariate model

In [None]:
# Modelling table for the univariate Cox model.
# .copy() makes this an independent frame, so the column assignment below neither triggers
# SettingWithCopyWarning nor depends on whether pandas handed back a view of registry_df.
univariate_predictor_df = registry_df[['pCO2', 'follow_up_time', 'Death']].copy()
# a missing Death entry is treated as "no event"
univariate_predictor_df['Death'] = univariate_predictor_df['Death'].fillna(0).astype(int)
# keep only patients with both a pCO2 value and a follow-up time
univariate_predictor_df = univariate_predictor_df.dropna()

In [None]:
# Summary statistics of pCO2 in the modelling cohort.
univariate_predictor_df.pCO2.describe()

In [None]:
# Preview of the univariate modelling table.
univariate_predictor_df.head()

In [None]:
from lifelines import CoxPHFitter

# Univariate Cox model: the only covariate is pCO2 (every column other than the duration and
# event columns is used as a covariate).
cph = CoxPHFitter()
cph.fit(univariate_predictor_df, duration_col='follow_up_time', event_col='Death')
cph.print_summary()

In [None]:
# Coefficient plot of the fitted univariate model.
cph.plot()

In [None]:
# Model-based survival curves for a few fixed pCO2 values.
# NOTE(review): 1/3/5/7 are presumably kPa (the categorisation further below uses kPa cut-offs) - confirm.
ax = cph.plot_partial_effects_on_outcome(covariates='pCO2', values=[1, 3, 5, 7])
# show the first 90 days only
ax.set_xlim(0, 90)
# place legend outside of plot
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [None]:
# Log partial hazard predicted by the univariate model for every patient in the cohort.
predicted_log_partial_hazards = cph.predict_log_partial_hazard(univariate_predictor_df)

In [None]:
# plot predicted_log_partial_hazards vs pCO2 (one point per patient)
sns.scatterplot(data=univariate_predictor_df, x='pCO2', y=predicted_log_partial_hazards)

spline model

In [None]:
# Shift EVERY follow-up time by 1e-6 (not only the zero entries) so that no duration is exactly
# zero; the stated purpose is to avoid log(0) in the spline model.
# NOTE: the cell is not idempotent - each re-run adds the offset again.
univariate_predictor_df['follow_up_time'] = univariate_predictor_df['follow_up_time'] + 1e-6

In [None]:
# find best spline df model by selecting the one with the lowest AIC
# One record per candidate is collected and the frame is built once. The former
# concat-in-a-loop gave every row the index label 0, so idxmin() always returned 0 and the
# lookup after reset_index() always picked the first candidate (df=3), whatever its AIC.
aic_records = []

for df in range(3, 10):
    spline_cph = CoxPHFitter()
    spline_cph.fit(univariate_predictor_df, duration_col='follow_up_time', event_col='Death', formula="bs(pCO2, df={})".format(df))
    aic_records.append({'df': df, 'AIC': spline_cph.AIC_partial_})

aic_df = pd.DataFrame(aic_records, columns=['df', 'AIC'])

# plain int so that it formats as e.g. "df=4" inside the model formula
best_df = int(aic_df.loc[aic_df['AIC'].idxmin(), 'df'])
best_df

In [None]:
# Refit the spline model with the selected degrees of freedom and show its partial AIC.
spline_cph = CoxPHFitter()
spline_cph.fit(univariate_predictor_df, duration_col='follow_up_time', event_col='Death', formula=f'bs(pCO2, df={best_df})')
spline_cph.AIC_partial_

In [None]:
# Coefficient table of the univariate spline model.
spline_cph.print_summary(style='ascii')

In [None]:
# simulated data: an evenly spaced grid of 200 pCO2 values spanning the observed range
pco2_range = np.linspace(univariate_predictor_df['pCO2'].min(), univariate_predictor_df['pCO2'].max(), 200)

# build one row of central covariate values per grid point, then overwrite pCO2 with the grid
# (pCO2 is the variable that varies here, not age)
# NOTE(review): _central_values is a private lifelines attribute and may change between versions
x_bar = spline_cph._central_values
df_varying_pco2 = pd.concat([x_bar] * 200).reset_index(drop=True)
df_varying_pco2['pCO2'] = pco2_range

predicted_log_partial_hazards = spline_cph.predict_log_partial_hazard(df_varying_pco2)
df_varying_pco2['predicted_log_partial_hazards'] = predicted_log_partial_hazards

In [None]:
# Shape of the fitted spline: log partial hazard along the simulated pCO2 grid.
sns.scatterplot(data=df_varying_pco2, x='pCO2', y='predicted_log_partial_hazards')

In [None]:
# actual data: partial hazard predicted by the spline model for every patient
# (despite its name, the frame stores partial hazards, not log partial hazards)
univariate_predictor_df_with_predicted_log_partial_hazards = univariate_predictor_df.assign(
    predicted_partial_hazards=spline_cph.predict_partial_hazard(univariate_predictor_df)
)
ax = sns.scatterplot(
    data=univariate_predictor_df_with_predicted_log_partial_hazards,
    x='pCO2',
    y='predicted_partial_hazards',
    hue='Death',
    alpha=0.25,
)

# ax.set_ylim(-0.5, 1)

## multivariate model

variables to consider for extraction: 
- ph / BE
- HR / RR 
- PEEP / Vt

#### Extract & preprocess covariates

In [None]:
# Covariate exports all sit in the same PDMS transfer folder.
_pdms_transfer_dir = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/dci_sah/data/pdms_data/Transfer Urs.pietsch@kssg.ch 22.01.24, 15_34'

bp_path = _pdms_transfer_dir + '/20240116_SAH_SOS_Blutdruecke.csv'  # blood pressures
nor_path = _pdms_transfer_dir + '/20240116_SAH_SOS_EinzelGabeNoradrSpritzenpumpe.csv'  # noradrenaline doses
gcs_path = _pdms_transfer_dir + '/20240117_SAH_SOS_GCS.csv'  # Glasgow Coma Scale

In [None]:
# All three exports share one layout: semicolon-separated, '.' as decimal mark.
bp_df, nor_df, gcs_df = (
    pd.read_csv(csv_path, sep=';', decimal='.')
    for csv_path in (bp_path, nor_path, gcs_path)
)

In [None]:
# Exposure of interest and the two survival response columns (duration, event indicator).
studied_variable = 'pCO2'
response_variables = ['follow_up_time', 'Death']

# Earlier, larger covariate set (additionally contained Fisher_Score and noradrenaline):
# covariate_registry_columns = ['Age', 'Sex', 'mRS_before_ictus', 'GCS_admission', 'WFNS', 'Intubated_on_admission_YN', 'HTN', 'DM', 'Fisher_Score']
# other_covariates = ['pO2', 'mitteldruck', 'noradrenaline']

# Covariates taken directly from registry columns ...
covariate_registry_columns = ['Age', 'Sex', 'mRS_before_ictus', 'GCS_admission', 'WFNS', 'Intubated_on_admission_YN', 'HTN', 'DM']
# ... and covariates merged in from the PDMS exports (first pO2, first 'mitteldruck' value).
other_covariates = ['pO2', 'mitteldruck']

all_covariates = covariate_registry_columns + other_covariates

In [None]:
# Fisher_Score and noradrenaline were removed from the covariate lists above

In [None]:
# Number of missing values per registry covariate, counted among patients with a pCO2 value.
has_pco2 = registry_df['pCO2'].notnull()
for covariate in covariate_registry_columns:
    n_missing = registry_df.loc[has_pco2, covariate].isnull().sum()
    print(f'{covariate}: {n_missing}')

In [None]:
# Where Age is missing, derive it from admission date minus birth date, in years of 365.25 days.
age_from_dates = (
    registry_df['Date_admission'] - registry_df['Date_birth']
).dt.total_seconds() / (24 * 3600) / 365.25
registry_df['Age'] = registry_df['Age'].fillna(age_from_dates)

In [None]:
# Encode Sex as 0/1 after upper-casing: M -> 0, F or W -> 1; any other entry becomes NaN.
sex_codes = {'M': 0, 'F': 1, 'W': 1}
registry_df['Sex'] = registry_df['Sex'].str.upper().map(sex_codes)

In [None]:
# Additional covariate: pO2 of the first arterial blood gas.
registry_df = registry_df.merge(first_abg_df[['pNr', 'pO2']], on='pNr', how='left')
# negative pO2 values are treated as missing
registry_df['pO2'] = registry_df['pO2'].mask(registry_df['pO2'] < 0)

In [None]:
# first map: earliest blood-pressure record per patient
# Sort + drop_duplicates replaces groupby('pNr').apply(...iloc[0]); it keeps the column dtypes
# and does not rely on the grouping column being handed to apply() (deprecated in recent
# pandas), so 'pNr' is still available for the merge below. Rows without pNr are dropped, as before.
# NOTE(review): timeBd is ordered as read from the CSV (text) - confirm it is ISO-formatted.
first_bp_df = (
    bp_df
    .dropna(subset=['pNr'])
    .sort_values(['pNr', 'timeBd'])
    .drop_duplicates(subset='pNr', keep='first')
    .reset_index(drop=True)
)

registry_df = registry_df.merge(first_bp_df[['pNr', 'mitteldruck']], on='pNr', how='left')

In [None]:
# Harmonise noradrenaline doses to micrograms.
in_milligram = nor_df['Einheit'] == 'MILLIGRAM'
nor_df.loc[in_milligram, 'Menge'] = nor_df.loc[in_milligram, 'Menge'] * 1000
nor_df.loc[in_milligram, 'Einheit'] = 'MICROGRAM'
# Keep microgram rows only. Rows with a missing unit never compare equal to 'MICROGRAM',
# so this single filter also removes them.
nor_df = nor_df[nor_df['Einheit'] == 'MICROGRAM']

concomitant noradrenaline treatment to pCO2 is defined as the presence of noradrenaline treatment within 15 minutes of the first pCO2 measurement

In [None]:
# Take the time of the first blood gas from first_abg_df and flag every noradrenaline
# administration whose interval [Start - 15 min, Ende] contains it.
nor_df = nor_df.merge(first_abg_df[['pNr', 'timeBGA']], on='pNr', how='left')
# timeBGA comes from the CSV as text; parse it explicitly so the comparison below does not
# depend on implicit string-to-datetime coercion (which breaks as soon as a patient has no
# blood gas and the column therefore contains NaN).
# NOTE(review): the layout is inferred by pandas - pass format= once the export format is confirmed.
nor_df['timeBGA'] = pd.to_datetime(nor_df['timeBGA'])
nor_df['Start'] = pd.to_datetime(nor_df['Start'], format='%Y-%m-%d %H:%M:%S.%f')
nor_df['Ende'] = pd.to_datetime(nor_df['Ende'], format='%Y-%m-%d %H:%M:%S.%f')
# comparisons against a missing timestamp evaluate to False, i.e. "not concomitant"
nor_df['nor_and_abg_concomitant'] = ((nor_df.timeBGA >= nor_df.Start - pd.Timedelta('15 minutes')) & (nor_df.timeBGA <= nor_df.Ende)).astype(int)

In [None]:
# One flag per patient (1 if any administration was concomitant), attached to the registry
# under the column name 'noradrenaline'.
nor_concomitant_df = nor_df.groupby('pNr')['nor_and_abg_concomitant'].max().reset_index()
registry_df = (
    registry_df
    .merge(nor_concomitant_df, on='pNr', how='left')
    .rename(columns={'nor_and_abg_concomitant': 'noradrenaline'})
)

fill missing values in GCS and intubation status

In [None]:
# Total GCS = eye + verbal + motor component.
gcs_df['GCS'] = gcs_df.eyes + gcs_df.verbal + gcs_df.movement
# Earliest GCS record per patient. Sort + drop_duplicates replaces
# groupby('pNr').apply(...iloc[0]): dtypes are kept and 'pNr' stays a regular column without
# relying on the grouping column being passed to apply() (deprecated in recent pandas).
# NOTE(review): timeGCS is ordered as read from the CSV (text) - confirm it is ISO-formatted.
first_gcs_df = (
    gcs_df
    .dropna(subset=['pNr'])
    .sort_values(['pNr', 'timeGCS'])
    .drop_duplicates(subset='pNr', keep='first')
    .reset_index(drop=True)
    .rename(columns={'GCS': 'GCS_pdms', 'intubated': 'intubated_pdms'})
)
registry_df = registry_df.merge(first_gcs_df[['pNr', 'GCS_pdms', 'intubated_pdms']], on='pNr', how='left')
# registry values take precedence; the PDMS values only fill the gaps
# NOTE(review): assumes 'intubated' in the PDMS export uses the same coding as Intubated_on_admission_YN - confirm
registry_df['GCS_admission'] = registry_df['GCS_admission'].fillna(registry_df['GCS_pdms'])
registry_df['Intubated_on_admission_YN'] = registry_df['Intubated_on_admission_YN'].fillna(registry_df['intubated_pdms'])

create multivariate predictor df

In [None]:
# Modelling table: covariates, studied variable and the survival response columns.
multivariate_predictor_df = registry_df[covariate_registry_columns + other_covariates + [studied_variable] + response_variables].copy()
# Fill the event indicator from this frame's OWN Death column. It was previously copied from
# univariate_predictor_df, a row-filtered frame, which only gave the right values through
# index alignment with registry_df.
multivariate_predictor_df['Death'] = multivariate_predictor_df['Death'].fillna(0).astype(int)
# patients need the studied variable and a follow-up time; covariate gaps are imputed later
multivariate_predictor_df = multivariate_predictor_df.dropna(subset=[studied_variable] + response_variables)
# clean 0..n-1 index, required for the positional join with the imputed covariates
multivariate_predictor_df = multivariate_predictor_df.reset_index(drop=True)

In [None]:
# impute missing values in covariates

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# columns handed to the imputer: every covariate plus the studied variable
variate_columns = all_covariates + [studied_variable]

imputer = IterativeImputer(max_iter=10, random_state=0)
variate_df = multivariate_predictor_df[variate_columns]

# fit_transform returns a plain array; wrap it again with the original column names
imputed_variates = imputer.fit_transform(variate_df)
imputed_covariates_df = pd.DataFrame(imputed_variates, columns=variate_columns)

# binarize noradrenaline (imputed values are continuous; threshold at 0.5)
if 'noradrenaline' in all_covariates:
    noradrenaline_given = imputed_covariates_df['noradrenaline'] > 0.5
    imputed_covariates_df['noradrenaline'] = noradrenaline_given.astype(int)



In [None]:
# Response columns + imputed covariates; the join is by index, which is 0..n-1 on both sides.
imputed_multivariate_predictor_df = multivariate_predictor_df[response_variables].join(imputed_covariates_df)

In [None]:
# Total number of remaining missing cells (0 means imputation and join left no gaps).
imputed_multivariate_predictor_df.isnull().sum().sum()

### Fit linear multivariate model

In [None]:
# Multivariate Cox model with pCO2 entering linearly next to all covariates.
multi_variate_cph = CoxPHFitter()
multi_variate_cph.fit(imputed_multivariate_predictor_df, duration_col='follow_up_time', event_col='Death')

In [None]:
# Coefficient table of the linear multivariate model.
multi_variate_cph.print_summary(style='ascii')

In [None]:
# Coefficient plot of the linear multivariate model.
multi_variate_cph.plot()

In [None]:
# Model-based survival curves for fixed pCO2 values, first 90 days only.
ax = multi_variate_cph.plot_partial_effects_on_outcome(covariates='pCO2', values=[1, 3, 5,7])
ax.set_xlim(0, 90)
# place legend outside of plot
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [None]:
# Partial hazard (not its log) predicted by the linear multivariate model for every patient.
multivariate_predicted_partial_hazards = multi_variate_cph.predict_partial_hazard(imputed_multivariate_predictor_df)
# plot multivariate_predicted_partial_hazards vs pCO2
sns.scatterplot(data=imputed_multivariate_predictor_df, x='pCO2', y=multivariate_predicted_partial_hazards)

### Fit spline multivariate model

In [None]:
# Shift EVERY follow-up time by 1e-6 (not only the zero entries) so that no duration is exactly
# zero; the stated purpose is to avoid log(0) in the spline model.
# NOTE: the cell is not idempotent - each re-run adds the offset again.
imputed_multivariate_predictor_df['follow_up_time'] = imputed_multivariate_predictor_df['follow_up_time'] + 1e-6

In [None]:
# Covariates as one 'a + b + c' string, to be appended to the spline term in the model formulas.
all_covariates_string = ' + '.join(all_covariates)
all_covariates_string

In [None]:
# find best spline df model by selecting the one with the lowest AIC
# One record per candidate is collected and the frame is built once. The former
# concat-in-a-loop gave every row the index label 0, so idxmin() always returned 0 and the
# lookup after reset_index() always picked the first candidate (df=3), whatever its AIC.
multivariate_aic_records = []

for df in range(3, 10):
    multi_variate_spline_cph = CoxPHFitter()
    multi_variate_spline_cph.fit(imputed_multivariate_predictor_df, duration_col='follow_up_time', event_col='Death',
                   formula=f'bs(pCO2, df={df}) + {all_covariates_string}')
    multivariate_aic_records.append({'df': df,
                                     'AIC': multi_variate_spline_cph.AIC_partial_,
                                     'concordance_index': multi_variate_spline_cph.concordance_index_})

multivariate_aic_df = pd.DataFrame(multivariate_aic_records, columns=['df', 'AIC', 'concordance_index'])

# row of the candidate with the lowest partial AIC
best_candidate = multivariate_aic_df.loc[multivariate_aic_df['AIC'].idxmin()]
# plain int so that it formats as e.g. "df=4" inside the model formulas further below
multivariate_best_df = int(best_candidate['df'])
print(f'Best df: {multivariate_best_df}, with AIC: {best_candidate["AIC"]} and concordance index: {best_candidate["concordance_index"]}')

In [None]:
# Refit the multivariate spline model with the selected degrees of freedom for pCO2.
multi_variate_spline_cph = CoxPHFitter()
multi_variate_spline_cph.fit(imputed_multivariate_predictor_df, duration_col='follow_up_time', event_col='Death',
               formula=f'bs(pCO2, df={multivariate_best_df}) + {all_covariates_string}')
multi_variate_spline_cph.print_summary(style='ascii')

In [None]:
# Coefficient plot of the multivariate spline model.
multi_variate_spline_cph.plot()

In [None]:
# plot survival curve for multivariate spline model at fixed pCO2 values, first 90 days only
ax = multi_variate_spline_cph.plot_partial_effects_on_outcome(covariates='pCO2', values=[3, 4, 5, 6, 7], cmap='viridis')
ax.set_xlim(0, 90)


In [None]:
# Per-patient partial hazards from the multivariate spline model, stored next to the inputs.
imputed_multivariate_predictor_df_with_predicted_partial_hazards = imputed_multivariate_predictor_df.assign(
    predicted_partial_hazards=multi_variate_spline_cph.predict_partial_hazard(imputed_multivariate_predictor_df)
)

In [None]:
# plot on simulated data: an evenly spaced grid of 200 pCO2 values spanning the observed range
pco2_range = np.linspace(imputed_multivariate_predictor_df['pCO2'].min(), imputed_multivariate_predictor_df['pCO2'].max(), 200)

# build one row of central covariate values per grid point, then overwrite pCO2 with the grid
# (pCO2 is the variable that varies here, not age)
# NOTE(review): _central_values is a private lifelines attribute and may change between versions
x_bar = multi_variate_spline_cph._central_values
df_varying_pco2 = pd.concat([x_bar] * 200).reset_index(drop=True)
df_varying_pco2['pCO2'] = pco2_range

predicted_partial_hazards = multi_variate_spline_cph.predict_partial_hazard(df_varying_pco2)
df_varying_pco2['predicted_partial_hazards'] = predicted_partial_hazards

sns.scatterplot(data=df_varying_pco2, x='pCO2', y='predicted_partial_hazards')
# set title
plt.title('Predicted partial hazards vs pCO2 (simulated data)')

In [None]:
# Same scatter twice, side by side: full range on the left, y-axis zoomed to [0, 1] on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

for panel in (ax1, ax2):
    # per-patient predictions, coloured by outcome
    sns.scatterplot(
        data=imputed_multivariate_predictor_df_with_predicted_partial_hazards,
        x='pCO2', y='predicted_partial_hazards', hue='Death', alpha=0.25, ax=panel,
    )
    # curve obtained on the simulated pCO2 grid
    sns.scatterplot(
        data=df_varying_pco2,
        x='pCO2', y='predicted_partial_hazards', color='red', ax=panel, alpha=0.25, label='simulated data',
    )

ax2.set_ylim(0, 1)

# set subtitles
ax1.set_title('Predicted partial hazards vs pCO2')
ax2.set_title('Zoomed in')

#### Relationship with other covariates

In [None]:
# Per-patient predicted partial hazards, coloured by GCS at admission.
a1 = sns.scatterplot(data=imputed_multivariate_predictor_df_with_predicted_partial_hazards, x='pCO2', y='predicted_partial_hazards', hue='GCS_admission', alpha=0.25)
# Overlay the simulated curve on THIS figure. It was previously drawn on `ax1`, the left panel
# of the figure created in the preceding cell, so it never appeared here.
sns.scatterplot(data=df_varying_pco2, x='pCO2', y='predicted_partial_hazards', color='red', ax=a1, alpha=0.25, label='simulated data')

# Rebuild the legend outside of the plot: numeric hue levels are shown as integers, the
# simulated-data entry is kept as is, and anything else (e.g. a hue-title entry emitted by some
# seaborn versions) is skipped. Filtering by label is independent of the entry order, unlike
# slicing the handle list.
handles, labels = a1.get_legend_handles_labels()
legend_handles, legend_labels = [], []
for handle, label in zip(handles, labels):
    try:
        shown_label = int(float(label))
    except ValueError:
        if label != 'simulated data':
            continue
        shown_label = label
    legend_handles.append(handle)
    legend_labels.append(shown_label)

a1.legend(handles=legend_handles, labels=legend_labels, title='GCS at admission', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [None]:
# Per-patient predicted partial hazards, coloured by intubation status at admission.
ax = sns.scatterplot(data=imputed_multivariate_predictor_df_with_predicted_partial_hazards, x='pCO2', y='predicted_partial_hazards', hue='Intubated_on_admission_YN', alpha=0.25, palette='viridis')
# set legend outside of plot
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., title='Intubated on admission')
# set title
plt.title('Predicted partial hazards vs pCO2')


In [None]:
# check relationship with Fisher_Score
# Fisher_Score is only present when it is part of the covariate selection; with the current
# selection it is not, and plotting unconditionally would abort a full notebook run.
if 'Fisher_Score' in imputed_multivariate_predictor_df_with_predicted_partial_hazards.columns:
    ax = sns.scatterplot(data=imputed_multivariate_predictor_df_with_predicted_partial_hazards, x='pCO2', y='predicted_partial_hazards', hue='Fisher_Score', alpha=0.25, palette='rainbow')

    # let items of legend be only integers; entries whose label is not numeric (e.g. a
    # hue-title entry emitted by some seaborn versions) are skipped
    handles, labels = ax.get_legend_handles_labels()
    legend_handles, legend_labels = [], []
    for handle, label in zip(handles, labels):
        try:
            legend_labels.append(int(float(label)))
        except ValueError:
            continue
        legend_handles.append(handle)
    ax.legend(handles=legend_handles, labels=legend_labels, title='Fisher Score', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    # set title
    plt.title('Predicted partial hazards vs pCO2')
else:
    print('Fisher_Score is not among the selected covariates - plot skipped.')


#### Fit on subset with GCS >= 12

In [None]:
# Distribution of GCS at admission after imputation.
imputed_multivariate_predictor_df['GCS_admission'].value_counts()

In [None]:
# Subgroup with GCS >= 12 at admission, taken from the NON-imputed table (rows with a missing
# GCS therefore drop out); the index is renumbered for the positional join after imputation.
gcs_at_least_12 = multivariate_predictor_df['GCS_admission'] >= 12
multivariate_predictor_non_coma_df = multivariate_predictor_df.loc[gcs_at_least_12].reset_index(drop=True)

In [None]:
# Events vs non-events in the GCS >= 12 subgroup.
multivariate_predictor_non_coma_df.Death.value_counts()

In [None]:
# impute missing values in covariates (GCS >= 12 subgroup, fitted on the subgroup only)

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# columns handed to the imputer: every covariate plus the studied variable
non_coma_variate_columns = all_covariates + [studied_variable]

imputer = IterativeImputer(max_iter=10, random_state=0)
non_coma_variate_df = multivariate_predictor_non_coma_df[non_coma_variate_columns]

# fit_transform returns a plain array; wrap it again with the original column names
non_coma_imputed_variates = imputer.fit_transform(non_coma_variate_df)
non_coma_imputed_covariates_df = pd.DataFrame(non_coma_imputed_variates, columns=non_coma_variate_columns)

# binarize noradrenaline (imputed values are continuous; threshold at 0.5)
if 'noradrenaline' in all_covariates:
    non_coma_noradrenaline_given = non_coma_imputed_covariates_df['noradrenaline'] > 0.5
    non_coma_imputed_covariates_df['noradrenaline'] = non_coma_noradrenaline_given.astype(int)



In [None]:
# Response columns + imputed covariates of the subgroup; joined by index (0..n-1 on both sides).
non_coma_imputed_multivariate_predictor_df = multivariate_predictor_non_coma_df[response_variables].join(non_coma_imputed_covariates_df)

In [None]:
# Total number of remaining missing cells in the subgroup table.
non_coma_imputed_multivariate_predictor_df.isnull().sum().sum()

In [None]:
# Multivariate spline model on the GCS >= 12 subgroup; the spline degrees of freedom are the
# ones selected on the full cohort (no new AIC search here).
non_coma_multi_variate_spline_cph = CoxPHFitter()
non_coma_multi_variate_spline_cph.fit(non_coma_imputed_multivariate_predictor_df, duration_col='follow_up_time', event_col='Death',
               formula=f'bs(pCO2, df={multivariate_best_df}) + {all_covariates_string}')
non_coma_multi_variate_spline_cph.print_summary(style='ascii')

#### Fit in intubated and non intubated subgroup

In [None]:
# Distribution of the intubation flag after imputation.
imputed_multivariate_predictor_df['Intubated_on_admission_YN'].value_counts()

In [None]:
# Split the NON-imputed table by intubation status at admission (flag == 0 / flag == 1; rows
# with a missing flag belong to neither subgroup). Within a subgroup the flag is constant, so
# it is removed from both the data and the covariate list.
intubation_flag = 'Intubated_on_admission_YN'

multivariate_predictor_non_intubated_df = (
    multivariate_predictor_df[multivariate_predictor_df[intubation_flag] == 0]
    .reset_index(drop=True)
    .drop(columns=[intubation_flag])
)
multivariate_predictor_intubated_df = (
    multivariate_predictor_df[multivariate_predictor_df[intubation_flag] == 1]
    .reset_index(drop=True)
    .drop(columns=[intubation_flag])
)

intubation_subgroups_covariates = [covariate for covariate in all_covariates if covariate != intubation_flag]
intubation_subgroups_covariates_string = ' + '.join(intubation_subgroups_covariates)

In [None]:
# Events vs non-events among non-intubated patients.
multivariate_predictor_non_intubated_df.Death.value_counts()

In [None]:
# Events vs non-events among intubated patients.
multivariate_predictor_intubated_df.Death.value_counts()

In [None]:
# impute missing values in covariates, separately within each intubation subgroup
# (each subgroup gets its own freshly fitted imputer with the same settings)

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# impute for non intubated
imputer = IterativeImputer(max_iter=10, random_state=0)
non_intubated_variate_df = multivariate_predictor_non_intubated_df[intubation_subgroups_covariates + [studied_variable]]

# fit_transform returns a plain array; wrap it again with the original column names
non_intubated_imputed_variates = imputer.fit_transform(non_intubated_variate_df)
non_intubated_imputed_covariates_df = pd.DataFrame(non_intubated_imputed_variates, columns=intubation_subgroups_covariates + [studied_variable])

# impute for intubated
imputer = IterativeImputer(max_iter=10, random_state=0)
intubated_variate_df = multivariate_predictor_intubated_df[intubation_subgroups_covariates + [studied_variable]]

intubated_imputed_variates = imputer.fit_transform(intubated_variate_df)
intubated_imputed_covariates_df = pd.DataFrame(intubated_imputed_variates, columns=intubation_subgroups_covariates + [studied_variable])

# binarize noradrenaline (imputed values are continuous; threshold at 0.5)
if 'noradrenaline' in intubation_subgroups_covariates:
    non_intubated_imputed_covariates_df['noradrenaline'] = (non_intubated_imputed_covariates_df['noradrenaline'] > 0.5).astype(int)
    intubated_imputed_covariates_df['noradrenaline'] = (intubated_imputed_covariates_df['noradrenaline'] > 0.5).astype(int)

In [None]:
# Response columns + imputed covariates per subgroup; joined by index (0..n-1 on both sides).
intubated_imputed_multivariate_predictor_df = multivariate_predictor_intubated_df[response_variables].join(intubated_imputed_covariates_df)
non_intubated_imputed_multivariate_predictor_df = multivariate_predictor_non_intubated_df[response_variables].join(non_intubated_imputed_covariates_df)

In [None]:
# Remaining missing cells in the (intubated, non-intubated) tables.
intubated_imputed_multivariate_predictor_df.isnull().sum().sum(), non_intubated_imputed_multivariate_predictor_df.isnull().sum().sum()

In [None]:
# Multivariate spline model for intubated patients; spline df reused from the full-cohort
# selection. show_progress=True prints the optimiser iterations for this fit.
intubated_multi_variate_spline_cph = CoxPHFitter()
intubated_multi_variate_spline_cph.fit(intubated_imputed_multivariate_predictor_df, duration_col='follow_up_time', event_col='Death',
               formula=f'bs(pCO2, df={multivariate_best_df}) + {intubation_subgroups_covariates_string}', show_progress=True)
intubated_multi_variate_spline_cph.print_summary(style='ascii')

In [None]:
# Multivariate spline model for non-intubated patients; spline df reused from the full-cohort selection.
non_intubated_multi_variate_spline_cph = CoxPHFitter()
non_intubated_multi_variate_spline_cph.fit(non_intubated_imputed_multivariate_predictor_df, duration_col='follow_up_time', event_col='Death',
               formula=f'bs(pCO2, df={multivariate_best_df}) + {intubation_subgroups_covariates_string}')
non_intubated_multi_variate_spline_cph.print_summary(style='ascii')

## Fit with subgroup data

Thresholds for pCO2 subgroups in the literature:
    - pre-hospital paper: hypocapnia (< 35 mmHg), normocapnia (35–44 mmHg), and hypercapnia (≥ 45 mmHg)
    - ENIO: normocapnia, PaCO2 > 35–45 mmHg, mild hypocapnia 32–35 mmHg, severe hypocapnia as 26– < 32 mmHg, forced hypocapnia as PaCO2 < 26 mmHg, and hypercapnia as PaCO2 > 45 mmHg
    - MIMIC: < 30, 30–35, 35–45, 45–50, and ≥ 50 mmHg

### Univariate model

In [None]:
univariate_predictor_categorized_df = univariate_predictor_df.copy()
# categorize pCO2 into < 35, 35-45, > 45 mmHg corresponding to < 4.66, 4.66-5.99, > 5.99 kPa
# Bins are right-closed: (0, 4.66], (4.66, 5.99], (5.99, 100].
# NOTE(review): a pCO2 value outside (0, 100] gets no category, hence all-zero dummies, and
# would silently be counted with the normocapnia reference group.
univariate_predictor_categorized_df['pCO2_category'] = pd.cut(univariate_predictor_categorized_df['pCO2'], bins=[0, 4.66, 5.99, 100], labels=['hypocapnia', 'normocapnia', 'hypercapnia'])

# further subgroups
# univariate_predictor_categorized_df['pCO2_category'] = pd.cut(univariate_predictor_categorized_df['pCO2'], bins=[0, 3.46, 4.26, 4.66, 5.99, 100], labels=['forced_hypocapnia', 'severe_hypocapnia', 'hypocapnia', 'normocapnia', 'hypercapnia'])

# one hot encoding, requested directly as 0/1 integers. This replaces the former
# `.loc[:, cols] = ....astype(int)` write-back, which sets ints into bool columns - an
# incompatible-dtype assignment that recent pandas deprecates.
univariate_predictor_categorized_df = pd.get_dummies(univariate_predictor_categorized_df, columns=['pCO2_category'], dtype=int)
# normocapnia is the reference level; the continuous pCO2 column is no longer needed
univariate_predictor_categorized_df = univariate_predictor_categorized_df.drop(columns=['pCO2', 'pCO2_category_normocapnia'])

In [None]:
# Univariate Cox model on the pCO2 categories (hypocapnia / hypercapnia vs the normocapnia reference).
univariate_categorized_cph = CoxPHFitter()
univariate_categorized_cph.fit(univariate_predictor_categorized_df, duration_col='follow_up_time', event_col='Death')
univariate_categorized_cph.print_summary(style='ascii')

In [None]:
# Coefficient plot of the categorised univariate model.
univariate_categorized_cph.plot()

### Multivariate model

In [None]:
imputed_multivariate_predictor_categorized_df = imputed_multivariate_predictor_df.copy()
# categorize pCO2 into < 35, 35-45, > 45 mmHg corresponding to < 4.66, 4.66-5.99, > 5.99 kPa
# Bins are right-closed: (0, 4.66], (4.66, 5.99], (5.99, 100].
# NOTE(review): a pCO2 value outside (0, 100] gets no category, hence all-zero dummies, and
# would silently be counted with the normocapnia reference group.
imputed_multivariate_predictor_categorized_df['pCO2_category'] = pd.cut(imputed_multivariate_predictor_categorized_df['pCO2'], bins=[0, 4.66, 5.99, 100], labels=['hypocapnia', 'normocapnia', 'hypercapnia'])

# one hot encoding, requested directly as 0/1 integers. This replaces the former
# `.loc[:, cols] = ....astype(int)` write-back, which sets ints into bool columns - an
# incompatible-dtype assignment that recent pandas deprecates.
imputed_multivariate_predictor_categorized_df = pd.get_dummies(imputed_multivariate_predictor_categorized_df, columns=['pCO2_category'], dtype=int)
# normocapnia is the reference level; the continuous pCO2 column is no longer needed
imputed_multivariate_predictor_categorized_df = imputed_multivariate_predictor_categorized_df.drop(columns=['pCO2', 'pCO2_category_normocapnia'])


In [None]:
# Multivariate Cox model on the pCO2 categories plus all covariates (normocapnia is the reference).
multivariate_categorized_cph = CoxPHFitter()
multivariate_categorized_cph.fit(imputed_multivariate_predictor_categorized_df, duration_col='follow_up_time', event_col='Death')
multivariate_categorized_cph.print_summary(style='ascii')

In [None]:
# Coefficient plot of the categorised multivariate model.
multivariate_categorized_cph.plot()

Todo:

Technical:
- dummy encoding of categorical variables?
- variable selection?

Analysis:
- 30d mortality logistic regression