In [None]:
# Load libraries and functions
%load_ext autoreload
%autoreload 2
%matplotlib inline
RANDOM_STATE = 42  # Pseudo-random state

from utils import *
sns.set_palette("tab10") # Default seaborn theme

In [None]:
# Load dataset
#### Download the dataset from https://doi.org/10.5281/zenodo.5597750
### and load it into a DataFrame named 'vae_data_main' (the variable every cell below works on)

# Discard the incoming index in favour of a fresh positional one, then
# parse the 'date' column (written as YYYY/MM/DD) into datetimes.
vae_data_main = vae_data_main.reset_index(drop=True)
vae_data_main['date'] = pd.to_datetime(vae_data_main['date'], format="%Y/%m/%d")

# Remove patients based on exclusion criteria

In [None]:
# Remove patients who were monitored for less than 48 hours

print('48 h. Number of patients before: ', len(vae_data_main.ID.unique()))
print("48 h. Number of ICU admissions before: ", len(vae_data_main.ID_subid.unique()))

# Admissions (ID_subid) that contribute exactly one row with a non-null date.
# Only the 'date' column is counted instead of every column of the frame.
rows_per_admission = vae_data_main.groupby('ID_subid').date.count()
ids_w_only_one_row = rows_per_admission[rows_per_admission == 1].index

# Drop the single-row admissions whose maximum ICU day is at most 1
short_stay = (vae_data_main.day_in_icu_max <= 1) & vae_data_main.ID_subid.isin(ids_w_only_one_row)
vae_data_main = vae_data_main.loc[~short_stay]

print("\n48 h. Number of patients after: ", len(vae_data_main.ID.unique()))
print("48 h. Number of ICU admissions after: ", len(vae_data_main.ID_subid.unique()))

In [None]:
# Censor patients' data at day 365 in the ICU if they stayed longer

MAX_ICU_DAYS = 365

over_limit = vae_data_main.day_in_icu > MAX_ICU_DAYS
print("Censored patients: ", vae_data_main.loc[over_limit].ID.unique())

# .copy() makes the filtered frame independent, so the column assignments below
# are unambiguous (no SettingWithCopyWarning).
vae_data_main = vae_data_main.loc[~over_limit].copy()

# Set max LOS and ICU LOS at 365 days.
# The result is assigned back explicitly: calling `.where(..., inplace=True)` on
# `frame[col]` is a chained in-place update that pandas warns about and that
# silently does nothing under Copy-on-Write.
# As before, values that are not < 365 (including NaN) become 365.
for capped_col in ('los', 'day_in_icu_max'):
    vae_data_main[capped_col] = vae_data_main[capped_col].where(
        vae_data_main[capped_col] < MAX_ICU_DAYS, MAX_ICU_DAYS)

In [None]:
# Remove patients who had HARTI present on admission

# First non-null 'infection_respiratory' value of every admission; an admission
# is dropped when that value equals 1.
first_resp_value = vae_data_main[['ID_subid', 'infection_respiratory']].groupby('ID_subid').first()
poa_flag = first_resp_value.infection_respiratory == 1.
ids_to_drop = first_resp_value[poa_flag].index

keep_rows = ~vae_data_main.ID_subid.isin(ids_to_drop)
vae_data_main = vae_data_main[keep_rows]

print("POA, people remained: ", len(vae_data_main.ID.unique()))
print("POA, admissions remained: ", len(vae_data_main.ID_subid.unique()))

In [None]:
# Remove COVID positive patients

# Patient IDs (column 'ID') to exclude
covid_ids = [
    '2311/20', '2489/20', '2467/20', '2549/20', '2633/20',
    '2778/20', '3624/20', '3765/20', '3859/20', '3976/20',
    '3977/20', '5386/20', '6045/20', '6213/20', '6471/20',
    '5287/20', '6738/20', '7177/20', '6891/20', '7103/20',
    '7660/20', '7227/20', '6567/20', '7910/20', '7423/20',
]

is_covid = vae_data_main.ID.isin(covid_ids)
vae_data_main = vae_data_main[~is_covid]

print('COVID, # of pts, after', len(vae_data_main.ID.unique()))
print('COVID, # of admissions, after', len(vae_data_main.ID_subid.unique()))

# Create aggregated columns

In [None]:
# One-hot encode the categorical columns; each listed column is replaced by
# indicator columns named '<column>_<value>'.
categorical_cols = ['outcome', 'gender', 'disease_type', 'endotracheal_tube']
vae_data_main = pd.get_dummies(vae_data_main, columns=categorical_cols)

In [None]:
# Tracheostomy median day

# Earliest date per admission on which 'endotracheal_tube_3' equals 1
# (presumably the tracheostomy category of 'endotracheal_tube' - confirm against the codebook).
values = vae_data_main.loc[vae_data_main['endotracheal_tube_3'] ==
                           1.,['ID_subid','date']].groupby('ID_subid').min().reset_index()

# For every such admission, look at the rows strictly before that date and record
# the sum of 'mech_vent' and the number of rows.
res = {}
for uid, date in values.values:
    query_res = vae_data_main.loc[(vae_data_main.ID_subid == uid) & (vae_data_main.date < date), 'mech_vent']
    res[uid] = {
        'n_days_with': query_res.sum(),
        'n_days_total': query_res.shape[0]}

# One row per admission; zeros are turned into NaN so that describe() ignores them.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
res = pd.DataFrame.from_dict(res).T.replace(0., np.nan)
res.describe()

In [None]:
# Write new columns: 'days on mechanical ventilation before tracheostomy' and 'days in ICU before tracheostomy'

# `res` is indexed by ID_subid; admissions that are absent from it keep NaN.
# A single map per column replaces a loop that re-scanned the whole frame for every
# admission, and `np.nan` replaces the `np.NaN` alias removed in NumPy 2.0.
for new_col, res_col in (('days_mech_vent_before_tracheostomy', 'n_days_with'),
                         ('days_before_tracheostomy', 'n_days_total')):
    vae_data_main[new_col] = np.nan
    if res_col in res.columns:  # `res` has no columns when no admission qualified
        vae_data_main[new_col] = vae_data_main.ID_subid.map(res[res_col]).astype('float64')
    

In [None]:
# Add antibiotics class

# Replacement table loaded from JSON and handed to Series.replace for antibiotic_1..antibiotic_4
with open('antibiotics_class.json', 'r') as f:
    antibiotics_classes = json.load(f)

for i in range(1, 5):
    col = "antibiotic_" + str(i)
    class_col = col + '_class'
    vae_data_main[class_col] = vae_data_main[col].replace(antibiotics_classes)

# Add column with antibiotics binary: True when antibiotics_total is at least 1
vae_data_main['antibiotics_total_binary'] = vae_data_main.antibiotics_total.ge(1)

In [None]:
# ADD: year column
vae_data_main['year'] = vae_data_main.date.dt.year

# ADD: yearmonth col
vae_data_main['yearmonth'] = vae_data_main.date.dt.strftime("%y%m")

# ADD: halfyear col
# Consecutive half-year number, 1 = January-June of BASE_YEAR.
# Months are 1-based, so the month is shifted to 0-based before the integer division.
# Without the shift June fell into the second half and December into the next period.
BASE_YEAR = 2011
halfyear = (vae_data_main.date.dt.year - BASE_YEAR) * 2 + (vae_data_main.date.dt.month - 1) // 6
vae_data_main['halfyear'] = halfyear + 1

# ADD first day in the ICU
# From the first non-null 'date' and 'day_in_icu' of each admission:
# first day = date - (day_in_icu - 1) days.
date_series = vae_data_main[['ID_subid', 'date', 'day_in_icu']].groupby('ID_subid').first()
date_series = date_series.date - pd.to_timedelta(date_series.day_in_icu.astype(int) - 1, unit='D')
vae_data_main['first_day_in_icu'] = vae_data_main.ID_subid.map(date_series.to_dict())


In [None]:
# Days with factor during each admission

cols = [
    'mech_vent', 'central_line', 'feeding_tube', 'arterial_line', 'antibiotics_total_binary',
    'evd', 'icpm', 'urinary_catheter', 'csfl_ne', 'csfl_ss', 'hypothermia',
    'hemodialysis', 'total_parenteral_feeding',
    'intestinal_dysfunction', 'convulsions', 'sedation',
    'anxiolytics', 'aphasia', 'mutism', 'vasopressors', 'infection_cns',
    'infection_bloodstream', 'infection_urinary', 'infection_ssi', 'infection_other',
    'endotracheal_tube_0', 'endotracheal_tube_1', 'endotracheal_tube_2', 'endotracheal_tube_3']

# Per-admission sum of every factor column, computed in one pass
factor_totals = vae_data_main.loc[:, cols + ['ID_subid']].groupby('ID_subid').sum()

# Broadcast each total back to all rows of the admission as '<factor>_days'
for col in cols:
    totals_by_admission = factor_totals[col].to_dict()
    vae_data_main[col + '_days'] = vae_data_main.ID_subid.map(totals_by_admission)

### Surgeries aggregated columns

In [None]:
# Count surgeries
def agg_func(x):
    """Summarise a group by its last entry.

    Returns the number of items when the last entry is a list; otherwise the
    last entry itself is returned unchanged.
    """
    last_entry = x.iloc[-1]
    if type(last_entry) == list:
        return len(last_entry)
    return last_entry

# Surgery name
# Columns whose name contains 'st_' but none of the substrings 'len', 'name', 'icu'
excluded_tokens = ('len', 'name', 'icu')
operations = [col for col in vae_data_main.columns
              if 'st_' in col and not any(token in col for token in excluded_tokens)]

# All surgeries
new_cols = [val + '_count' for val in operations]
tmp = vae_data_main.loc[:, ['ID_subid'] + operations].copy().groupby('ID_subid').agg(agg_func)
for col, new_col in tqdm.tqdm(zip(operations, new_cols)):
    per_admission = tmp[col]
    # Look up the admission's aggregated value; admissions missing from `tmp` get 0
    vae_data_main.loc[:, new_col] = vae_data_main.ID_subid.apply(
        lambda uid: per_admission.loc[uid] if uid in per_admission.index else 0)

# Fill na with 0 where appropriate for surgery count
count_cols = vae_data_main.columns[vae_data_main.columns.str.contains('count')]
vae_data_main[count_cols] = vae_data_main[count_cols].fillna(0)


In [None]:
# Calculate total number (sum) of all surgeries

# Row-wise sum over the six per-type surgery count columns
surgery_count_cols = ['st_device_count', 'st_other_count',
                      'st_craniotomy_count', 'st_endovascular_count',
                      'st_endonasal_count', 'st_spinal_count']
vae_data_main['st_all_sum'] = vae_data_main[surgery_count_cols].sum(axis=1)

In [None]:
# Total length of surgeries by type
cols = ['st_craniotomy_len', 'st_device_len', 'st_endonasal_len',
        'st_endovascular_len', 'st_other_len', 'st_spinal_len']

def agg_func(x):
    """Return the largest per-entry total found in a group.

    List entries contribute the NaN-ignoring sum of their items; every other
    entry contributes the sentinel -1.
    """
    def _entry_total(entry):
        if isinstance(entry, list):
            return np.nansum(entry)
        return -1

    return max(x.apply(_entry_total))

for col in cols:
    # Aggregate the column once per admission with agg_func ...
    per_admission = vae_data_main[[col, 'ID_subid']].groupby('ID_subid').agg(agg_func)
    lookup = per_admission[col].to_dict()

    # ... and copy the admission-level value onto each of its rows
    vae_data_main[col + '_sum'] = vae_data_main.ID_subid.map(lookup)

In [None]:
# Add total length of all surgeries
len_sum_cols = ['st_craniotomy_len_sum', 'st_device_len_sum', 'st_endonasal_len_sum',
                'st_endovascular_len_sum', 'st_other_len_sum', 'st_spinal_len_sum']

# The per-type totals hold -1 where the admission has no list entry for that type.
# Those sentinels are clipped to 0 before summing; otherwise every missing type
# subtracts 1 from the real total (e.g. 120 + 5 * (-1) = 115).
# The result is never negative, and an admission with no surgery at all gets 0.
vae_data_main['st_all_len_sum'] = vae_data_main[len_sum_cols].clip(lower=0).sum(axis=1)

# Flag infections

In [None]:
# Add flag HAI

# Sum only the daily infection indicator columns. The derived per-admission
# aggregates ('infection_*_days', and 'infection_*_bid' when this cell is re-run)
# also contain 'infection_' in their name and would otherwise inflate the daily value.
DERIVED_SUFFIXES = ('_days', '_bid')
infection_cols = [col for col in vae_data_main.columns
                  if 'infection_' in col and not col.endswith(DERIVED_SUFFIXES)]
vae_data_main['hai'] = vae_data_main.loc[:, infection_cols].sum(axis=1)

# Collect IDs of patients without HAI (the admission's maximum 'hai' is 0)
no_hai_ids = vae_data_main[['hai','ID_subid']].groupby('ID_subid').max() == 0.
no_hai_ids = no_hai_ids[no_hai_ids.hai].index

print("Number of ICU admissions without HAIs: ", len(no_hai_ids))


In [None]:
# Add HARTI first date
vae_data_main['date'] = pd.to_datetime(vae_data_main['date'], format="%Y/%m/%d")

# Admissions without a respiratory infection keep None here; the later "_bid"
# cells test this value for truthiness, so the None default must stay.
vae_data_main['harti_first_date'] = None

# Earliest date with infection_respiratory > 0, per admission
resp_rows = vae_data_main.loc[vae_data_main.infection_respiratory > 0, ['ID_subid', 'date']]
first_resp_dates = resp_rows.groupby('ID_subid').min().reset_index()

for id_subid, time in first_resp_dates.values:
    admission_rows = vae_data_main.ID_subid == id_subid
    vae_data_main.loc[admission_rows, 'harti_first_date'] = time
    

In [None]:
# Add VAP flag

class RollingCondition():
    """Stateful callable that passes values through until the first zero.

    Each call looks at the first element of the window it receives. Before any
    zero has been seen that element is returned unchanged; from the first zero
    onwards (that call included) the result is always 0.
    """

    def __init__(self):
        # Becomes True once a zero has been encountered and never resets
        self.flag = False

    def __call__(self, x):
        value = x.iloc[0]
        if value == 0:
            self.flag = True
        return 0 if self.flag else value

In [None]:
meta_res = {}

def _agg_inf_respiratory(x):
    return x.iloc[0] == 0 and x.iloc[1] == 0 and x.iloc[2] == 1
    

# Only admissions with at least one row that has both infection_respiratory == 1
# and mech_vent == 1 are examined.
for uid in vae_data_main.loc[(vae_data_main.infection_respiratory == 1) & (vae_data_main.mech_vent == 1)].ID_subid.unique():
    # Finding starting dates: rows where the last three rows all have mech_vent == 1
    # (rolling sum of 3) and the last three infection_respiratory values read 0, 0, 1.
    cols = ['mech_vent', 'infection_respiratory', 'date', 'ID_subid']
    df = vae_data_main.loc[vae_data_main.ID_subid == uid, cols]
    starting_dates = df.loc[(df.rolling(3).mech_vent.sum() == 3.) & \
                            (df.rolling(3).infection_respiratory.agg(_agg_inf_respiratory)),
                            'date']

    if len(starting_dates) > 0:
        # For each starting date: infection_respiratory from that date onwards,
        # zeroed from the first 0 on (a fresh RollingCondition per starting date).
        # A dedicated name is used so the tracheostomy table `res` is not overwritten.
        episodes = {}
        for i, date in enumerate(starting_dates):
            episodes[i] = df[df.date >= date].infection_respiratory.rolling(1).agg(RollingCondition())

        # Row-wise sum over the episodes of this admission (indexed by the original row index)
        meta_res[uid] = pd.DataFrame.from_dict(episodes).sum(axis=1)

# Prepare column: combine all admissions into one Series keyed by row index
vap_result = pd.DataFrame.from_dict(meta_res).sum(axis=1)
vap_result = pd.DataFrame(vap_result, columns=['vap_result'])

# Writing column to main data: rows without a VAP result keep 0
vae_data_main['vap'] = 0.
vae_data_main['vap'] = vae_data_main[['vap']].copy().join(vap_result).sum(axis=1).values

In [None]:
# Add flag for patients with non-VAP respiratory infections
# A row is flagged when it has a respiratory infection (> 0) but vap == 0
resp_without_vap = (vae_data_main.vap == 0) & (vae_data_main.infection_respiratory > 0)
a = vae_data_main.index[resp_without_vap]
vae_data_main['non_vap_resp_hai'] = vae_data_main.index.isin(a)

In [None]:
# Add HARTI groups annotation column
# Key = four digits built from (infection_respiratory, vap, non_vap_resp_hai, hai > 0)
mask = {
    '0000': 'No HAI',
    '1011': 'NVA-HARTI',
    '1101': 'VA-HARTI',
    '0001': 'Other HAI',
    '1111': 'Dual HARTI'
}
def map_label(row):
    """Translate one admission's aggregated flags into a HARTI group label.

    Parameters
    ----------
    row : pandas.Series or sequence
        At least four values, read by position: infection_respiratory, vap,
        non_vap_resp_hai and hai. The first three are converted with int(),
        the fourth is reduced to 0/1 via bool().

    Returns
    -------
    str
        The label stored in `mask` for the resulting four-digit key.

    Raises
    ------
    KeyError
        If the combination of flags is not a key of `mask`.
    """
    # Iterate over the values instead of using row[0]..row[3]: integer keys on a
    # label-indexed Series are deprecated as positional access and would be read
    # as labels, whereas the position is what is meant here.
    infection_resp, vap, non_vap, hai = list(row)[:4]
    mask_ = f'{int(infection_resp)}{int(vap)}{int(non_vap)}{int(bool(hai))}'
    return mask[mask_]

# Per-admission maximum of every flag, translated into a group label by map_label.
# The column order matters: map_label reads the four flags by position.
group_inputs = ['ID_subid', 'infection_respiratory', 'vap', 'non_vap_resp_hai', 'hai']
admission_flags = vae_data_main[group_inputs].groupby('ID_subid').max()
groups = admission_flags.apply(map_label, axis=1)
groups_dict = groups.to_dict()

# Map groups by ID_subid
vae_data_main['group'] = vae_data_main.ID_subid.map(groups_dict)

# Create new columns with "days with factors before HARTI = _bid"

In [None]:
# WE CALCULATE VALUES STARTING FROM THE FIRST DAY OF SURVEILLANCE HERE

# Columns with binary values
# Calculate the number of days with factor before the onset of respiratory HAI
# if no respiratory HAI - total number of days with factor

cols = [
    'mech_vent', 'central_line', 'feeding_tube', 'arterial_line', 'antibiotics_total_binary',
    'evd', 'icpm', 'urinary_catheter', 'csfl_ne', 'csfl_ss', 'hypothermia',
    'hemodialysis', 'total_parenteral_feeding',
    'intestinal_dysfunction', 'convulsions', 'sedation',
    'anxiolytics', 'aphasia', 'mutism', 'vasopressors', 'infection_cns', 
    'infection_bloodstream', 'infection_urinary', 'infection_ssi', 'infection_other',
    'endotracheal_tube_0', 'endotracheal_tube_1', 'endotracheal_tube_2', 'endotracheal_tube_3']

# Rows dated strictly before the admission's first HARTI date. Rows whose
# harti_first_date is falsy (no HARTI date recorded) are compared against a
# far-future sentinel instead, which keeps every row dated before 2022-01.
# The mask is the same for every column, so it is built once instead of per column.
NO_HARTI_CUTOFF = np.datetime64('2022-01')
harti_cutoff = vae_data_main.harti_first_date.apply(lambda x: x if x else NO_HARTI_CUTOFF)
before_harti = vae_data_main.date < harti_cutoff

# Per-admission sums of all factor columns over the selected rows
n_days = vae_data_main.loc[before_harti, cols + ['ID_subid']].groupby('ID_subid').sum()

for col in cols:
    # To dict
    n_days_dict = n_days[col].to_dict()

    # Write results (admissions without any selected row get NaN)
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)

In [None]:
# Length of ICU stay
# Max value before the onset of respiratory HAI

cols = ['day_in_icu']

for col in cols:
    # Cut-off per row: the first HARTI date when there is one, else the 2022-01 sentinel
    harti_cutoff = vae_data_main.harti_first_date.apply(
        lambda x: x if x else np.datetime64('2022-01'))
    rows_before = vae_data_main.date < harti_cutoff

    # Largest value per admission among the rows before the cut-off
    n_days = vae_data_main.loc[rows_before, [col, 'ID_subid']].groupby('ID_subid').max()
    n_days_dict = n_days[col].to_dict()

    # Write results
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)

In [None]:
# WE CALCULATE VALUES STARTING FROM THE FIRST DAY OF SURVEILLANCE HERE

# Columns with numeric values
# Median for five days before the onset of respiratory HAI
# if a patient has respiratory HAI from the first day => fill with first day value

def median_last_five_values(x):
    """NaN-ignoring median of the last five entries of `x` (fewer if `x` is shorter)."""
    tail = x.iloc[-5:]
    return np.nanmedian(tail)


cols = ['gcs', 'rass', 'pbss', 'charlson', 'antibiotics_total']
vae_data_main.gcs = vae_data_main.gcs.astype('float64')

# Both row masks depend only on 'date' and 'harti_first_date', not on the column
# being aggregated, so they are built once instead of once per column.
harti_dates = vae_data_main.harti_first_date

# infected: rows strictly before the first HARTI date. Rows with a falsy
# harti_first_date are compared against 2000-01 and therefore select nothing
# (assuming no observation predates 2000).
before_harti = vae_data_main.date < harti_dates.apply(
    lambda x: x if x else np.datetime64('2000-01'))

# not infected: rows with a falsy harti_first_date, compared against the
# far-future 2022-01 sentinel; rows of infected admissions are excluded.
without_harti = vae_data_main.date < harti_dates.apply(
    lambda x: np.datetime64('2022-01') if (not x) else np.datetime64('2000-01'))

for col in cols:
    # infected: median of the last five selected values per admission
    n_days_inf = vae_data_main.loc[before_harti, [col, 'ID_subid']].groupby('ID_subid').agg(median_last_five_values)

    # To dict
    n_days_inf_dict = n_days_inf.to_dict()[col]

    # not infected: median over all selected values per admission
    n_days_not_inf = vae_data_main.loc[without_harti, [col, 'ID_subid']].groupby('ID_subid').agg(np.nanmedian)

    n_days_not_inf_dict = n_days_not_inf.to_dict()[col]

    # Merge both dictionaries; the assert checks that no admission is in both
    inf_dict_len = len(n_days_inf_dict)
    not_inf_dict_len = len(n_days_not_inf_dict)

    n_days_inf_dict.update(n_days_not_inf_dict)

    assert len(n_days_inf_dict) == inf_dict_len + not_inf_dict_len

    # Write results (admissions present in neither dictionary get NaN)
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_inf_dict)
    

### Surgeries before HARTI

In [None]:
# Columns with the number of surgeries (by type) before HARTI

cols = ['st_craniotomy', 'st_device', 'st_endonasal',
        'st_endovascular', 'st_other', 'st_spinal']

def func(x):
    """Return the maximum over a group after replacing each list entry by its length.

    Non-list entries are left as they are.
    """
    def _size_or_value(entry):
        return len(entry) if isinstance(entry, list) else entry

    return x.apply(_size_or_value).max()
    
# Rows dated strictly before the admission's first HARTI date. Rows with a falsy
# harti_first_date are compared against the far-future 2022-01 sentinel instead.
# The mask is identical for every column, so it is built once.
harti_cutoff = vae_data_main.harti_first_date.apply(
    lambda x: x if x else np.datetime64('2022-01'))
before_harti = vae_data_main.date < harti_cutoff

for col in cols:
    n_days = vae_data_main.loc[before_harti, [col, 'ID_subid']].groupby('ID_subid').agg(func)

    # To dict
    n_days_dict = n_days.to_dict()[col]

    # Write results
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)


# Add total number of all surgeries before HARTI (sum of the per-type counts)
count_bid_cols = ['st_craniotomy_bid', 'st_device_bid',
                  'st_endonasal_bid', 'st_endovascular_bid',
                  'st_other_bid', 'st_spinal_bid']
vae_data_main['st_all_sum_bid'] = vae_data_main.loc[:, count_bid_cols].sum(axis=1)

In [None]:
# WE CALCULATE VALUES STARTING FROM THE FIRST DAY OF SURVEILLANCE HERE

# Length of surgeries by type
# Summarize all before the onset of HARTI

len_cols = ('st_craniotomy_len', 'st_device_len', 'st_endonasal_len',
            'st_endovascular_len', 'st_other_len', 'st_spinal_len')

def agg_func(x):
    """Largest per-entry total within a group.

    A list entry counts as the NaN-ignoring sum of its items; any other entry
    counts as -1.
    """
    def _total(entry):
        if not isinstance(entry, list):
            return -1
        return np.nansum(entry)

    totals = x.apply(_total)
    return max(totals)

# Rows dated strictly before the admission's first HARTI date. Rows with a falsy
# harti_first_date are compared against the far-future 2022-01 sentinel instead.
# The mask does not depend on the column, so it is built once.
harti_cutoff = vae_data_main.harti_first_date.apply(
    lambda x: x if x else np.datetime64('2022-01'))
before_harti = vae_data_main.date < harti_cutoff

for col in len_cols:
    n_days = vae_data_main.loc[before_harti, [col, 'ID_subid']].groupby('ID_subid').agg(agg_func)

    # To dict
    n_days_dict = n_days.to_dict()[col]

    # Write results
    vae_data_main[col + '_sum_bid'] = vae_data_main.ID_subid.map(n_days_dict)
    

In [None]:
# Add column with sum of length of all surgeries before HARTI
len_cols_bid = ['st_craniotomy_len_sum_bid', 'st_device_len_sum_bid', 'st_endonasal_len_sum_bid',
                'st_endovascular_len_sum_bid', 'st_other_len_sum_bid', 'st_spinal_len_sum_bid']

# The per-type columns hold -1 where the admission has no list entry for that type.
# Clip those sentinels to 0 before summing: otherwise every missing type subtracts 1
# from the real total, and an admission without any surgery ends up at -6, a value
# the later "-1 and 0 -> NaN" clean-up does not recognise.
vae_data_main['st_all_len_sum_bid'] = vae_data_main.loc[:, len_cols_bid].clip(lower=0).sum(axis=1)

In [None]:
# Fillna
# Every column whose name contains '_bid' gets its missing values replaced by 0;
# the assertion then verifies that none is left.
is_bid_col = vae_data_main.columns.str.contains('_bid')
bid_cols = vae_data_main.columns[is_bid_col]
vae_data_main[bid_cols] = vae_data_main[bid_cols].fillna(0)

still_missing = vae_data_main[bid_cols].isna().to_numpy().max()
assert not still_missing

In [None]:
# Drop old surgeries columns and other

cols_to_drop = [
    'outcome_discharged', 'gender_F',
    'st_device', 'st_other', 'st_craniotomy',
    'st_endovascular', 'st_endonasal', 'st_spinal',
    'st_device_len', 'st_other_len', 'st_craniotomy_len',
    'st_endovascular_len', 'st_endonasal_len', 'st_spinal_len',
]

# Raises KeyError if any of the listed columns is absent
vae_data_main = vae_data_main.drop(cols_to_drop, axis=1)

print_info(vae_data_main)

In [None]:
# Replace -1 and 0 with NaN in len columns 

len_cols = ['st_craniotomy_len_sum', 'st_device_len_sum', 'st_endonasal_len_sum',
            'st_endovascular_len_sum', 'st_other_len_sum', 'st_spinal_len_sum',
            'st_all_len_sum', 'st_craniotomy_len_sum_bid', 'st_device_len_sum_bid',
            'st_endonasal_len_sum_bid', 'st_endovascular_len_sum_bid', 'st_other_len_sum_bid',
            'st_spinal_len_sum_bid', 'st_all_len_sum_bid'
]

# Assign the result back instead of calling `frame[col].replace(..., inplace=True)`:
# that chained in-place form operates on a temporary column object, is flagged by
# pandas, and leaves the frame untouched under Copy-on-Write.
vae_data_main[len_cols] = vae_data_main[len_cols].replace([-1, 0], np.nan)

In [None]:
# Check missing values in analytical dataset

# For columns below NaNs are if no surgery of this type
# ('st_craniotomy_len_sum', 18808, 0.35),
#  ('st_device_len_sum', 38831, 0.72),
#  ('st_endonasal_len_sum', 51548, 0.96),
#  ('st_endovascular_len_sum', 48221, 0.9),
#  ('st_other_len_sum', 47740, 0.89),
#  ('st_spinal_len_sum', 52065, 0.97),
#  ('st_all_len_sum', 8425, 0.16)

# One (column, number of missing values, share of missing values) tuple per column
null_counts = vae_data_main.isnull().values.sum(0)
null_share = round(vae_data_main.isnull().sum(0) / vae_data_main.shape[0], 2)
list(zip(vae_data_main.columns, null_counts, null_share))

# Save updated dataset

In [None]:
PATH = './data/'
os.makedirs(PATH, exist_ok=True)

FILENAME = 'Updated_VAE_Data_Main'
# Timestamp (yymmdd_HHMM) appended to both output file names
TIMESTAMP = datetime.datetime.now().strftime('%y%m%d_%H%M')

# CSV
csv_path = os.path.join(PATH, '{}_{}.csv'.format(FILENAME, TIMESTAMP))
vae_data_main.to_csv(csv_path)

# Pickle
pkl_path = os.path.join(PATH, '{}_{}.pkl'.format(FILENAME, TIMESTAMP))
with open(pkl_path, 'wb') as f:
    pickle.dump(vae_data_main, f)

_________