In [1]:
# Load libraries and functions
# IPython magics: enable the autoreload extension (mode 2) and inline matplotlib output.
%load_ext autoreload
%autoreload 2
%matplotlib inline
RANDOM_STATE = 42  # Pseudo-random state (not referenced in the cells visible here)

# NOTE(review): no other import statement appears in this notebook, so the names used
# below (glob, os, pickle, pd, np, sns, tqdm, json, datetime) presumably all come from
# this star import -- confirm against utils.py.
from utils import *
sns.set_palette("tab10") # Set the default seaborn colour palette

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# 1. Load main dataset

In [2]:
# Load the most recently created VAE data snapshot from ./data
fn_vae_data = glob.glob('./data/VAE*.pkl')
if not fn_vae_data:
    # Without this guard max() fails with an opaque "empty sequence" ValueError
    raise FileNotFoundError("No files matching './data/VAE*.pkl' were found")
latest_fn_vae_data = max(fn_vae_data, key=os.path.getctime)

print("Loading... ",latest_fn_vae_data)
# NOTE: unpickling can execute arbitrary code -- only load snapshots from a trusted source.
with open(latest_fn_vae_data, "rb") as f:
    vae_data_main = pickle.load(f)
print("Done")


Loading...  ./data/VAE_Data_Main_0821_1338.pkl
Done


# 2. Remove patients based on exclusion criteria

In [3]:
# Remove patients who were monitored for less than 48 hours:
# drop ICU admissions that have a single row AND day_in_icu_max <= 1

print('48 h. Number of patients before: ', len(vae_data_main.ID.unique()))
print("48 h. Number of ICU admissions before: ", len(vae_data_main.ID_subid.unique()))

# Admissions with exactly one non-null `date` entry
rows_per_admission = vae_data_main.groupby('ID_subid').date.count()
ids_w_only_one_row = rows_per_admission[rows_per_admission == 1].index

vae_data_main = vae_data_main.loc[
    ~((vae_data_main.day_in_icu_max <=1) & vae_data_main.ID_subid.isin(ids_w_only_one_row))]

print("48 h. Number of patients after: ", len(vae_data_main.ID.unique()))
print("48 h. Number of ICU admissions after: ", len(vae_data_main.ID_subid.unique()))
# The number excluded by this step is the before/after difference of the printed counts

48 h. Number of patients before:  2918
48 h. Number of ICU admissions before:  3231
48 h. Number of patients after:  2912
48 h. Number of ICU admissions after:  3216


In [4]:
# Censor patients' data at ICU day 365: rows with day_in_icu above 365 are dropped

stayed_past_day_365 = vae_data_main.day_in_icu > 365
vae_data_main = vae_data_main.loc[~stayed_past_day_365]
# Author's note: 4 patients censored

In [5]:
# Remove ICU admissions that had HARTI present on admission (POA),
# i.e. whose first infection_respiratory value within the admission equals 1.
# NOTE(review): GroupBy.first() takes the first non-null value in current row order;
# this presumably relies on rows being sorted by date -- confirm.

first_resp_flag = vae_data_main.groupby('ID_subid').infection_respiratory.first()
ids_to_drop = first_resp_flag.index[first_resp_flag == 1.]
vae_data_main = vae_data_main[~vae_data_main.ID_subid.isin(ids_to_drop)]

print("POA, people, after: ", len(vae_data_main.ID.unique()))
print("POA, admissions, after: ", len(vae_data_main.ID_subid.unique()))
# Author's note: 259 ICU admissions excluded

POA, people, after:  2708
POA, admissions, after:  2957


# 3. Count surgeries

In [6]:
# Count surgeries
def agg_func(x):
    """Summarise the last entry of a group's column.

    Parameters
    ----------
    x : pandas.Series
        Values of one column for one group (must be non-empty).

    Returns
    -------
    The length of the last element when it is a list (or list subclass),
    otherwise the last element itself, unchanged.
    """
    last_value = x.iloc[-1]
    return len(last_value) if isinstance(last_value, list) else last_value

# Surgery name columns: every column containing 'st_' but not 'len'.
# NOTE: the '<name>_count' columns written below also match this filter, so re-running
# this cell without reloading the data would select them too.
sg = [col for col in vae_data_main.columns if 'st_' in col and 'len' not in col]

# One '<name>_count' output column per surgery column
new_cols = [val + '_count' for val in sg]

# Per-admission summary (see agg_func), indexed by ID_subid
tmp = vae_data_main.loc[:, ['ID_subid'] + sg].groupby('ID_subid').agg(agg_func)

# Rows whose ID_subid is missing from the summary receive 0
known_admission = vae_data_main.ID_subid.isin(tmp.index)

# Broadcast the per-admission value back to every row of that admission
for col, new_col in tqdm.tqdm(zip(sg, new_cols), total=len(sg)):
    per_row = vae_data_main.ID_subid.map(tmp[col].to_dict())
    vae_data_main.loc[:, new_col] = per_row.where(known_admission, 0)

6it [00:01,  3.23it/s]


In [7]:
# Fill na with 0 in every column whose name contains 'count'
count_cols = vae_data_main.columns[vae_data_main.columns.str.contains('count')]
vae_data_main[count_cols] = vae_data_main[count_cols].fillna(0)


In [8]:
# Total number of surgeries: row-wise sum over the surgery count columns

required_tokens = ('st_', 'count')
excluded_tokens = ('_len', '_null')

# Keep columns that contain every required token and none of the excluded ones
sg = [
    col for col in vae_data_main.columns
    if all(token in col for token in required_tokens)
    and not any(token in col for token in excluded_tokens)
]

col_name = 'st_all_sum'
vae_data_main.loc[:, col_name] = vae_data_main[sg].sum(axis=1)

# 4. Add new aggregated/summary columns

In [9]:
# Tracheostomy median day
# For each admission: number of rows (and rows on mechanical ventilation) dated before
# the first row where endotracheal_tube_3.0 is set.
# NOTE(review): value 3.0 is presumably the tracheostomy code -- confirm against the data dictionary.

# Make Endotracheal tube dummies
vae_data_main = pd.get_dummies(vae_data_main, columns=['endotracheal_tube'])

# Earliest date per admission with endotracheal_tube_3.0 set
values = vae_data_main.loc[vae_data_main['endotracheal_tube_3.0'] ==
                           1.,['ID_subid','date']].groupby('ID_subid').min().reset_index()

res = {}
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0
for uid, date in values.values:
    query_res = vae_data_main.loc[(vae_data_main.ID_subid == uid) & (vae_data_main.date < date), 'mech_vent']
    res[uid] = {
        'n_days_with': query_res.sum(), 
        'n_days_total': query_res.shape[0]}

# One row per admission; zeros become NaN so describe() ignores them
# (np.nan instead of the np.NaN alias, which NumPy 2.0 removed)
res = pd.DataFrame.from_dict(res).T.replace(0., np.nan)
res.describe()

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,n_days_with,n_days_total
count,1034.0,1064.0
mean,3.645068,4.051692
std,3.134411,3.647723
min,1.0,1.0
25%,1.0,1.0
50%,3.0,4.0
75%,5.0,5.0
max,47.0,50.0


In [10]:
# Write new columns: 'days on mechanical ventilation before tracheostomy' and 'days in ICU before tracheostomy'

# target column in vae_data_main -> source column in `res` (indexed by ID_subid)
tracheostomy_columns = {
    'days_mech_vent_before_tracheostomy': 'n_days_with',
    'days_before_tracheostomy': 'n_days_total',
}

for target_col, source_col in tracheostomy_columns.items():
    # Admissions absent from `res` keep NaN
    vae_data_main[target_col] = np.nan
    if source_col in res.columns:
        # One vectorised lookup instead of one full-frame mask per admission
        vae_data_main[target_col] = vae_data_main.ID_subid.map(res[source_col]).astype(float)
    

In [11]:
# Add flag HAI: per-row sum over all columns whose name contains 'infection_'

infection_cols = [name for name in vae_data_main.columns if 'infection_' in name]
vae_data_main['hai'] = vae_data_main[infection_cols].sum(axis=1)

# Collect IDs of ICU admissions whose maximum `hai` value is 0 (no HAI on any row)
max_hai_per_admission = vae_data_main.groupby('ID_subid').hai.max()
no_hai_ids = max_hai_per_admission.index[max_hai_per_admission == 0.]

print("Number of ICU admissions without HAIs: ", len(no_hai_ids))  # 1594

Number of ICU admissions without HAIs:  1594


In [12]:
# Add antibiotics class: translate each antibiotic_N column through the JSON mapping

with open('./data/antibiotics_class.json', 'r') as f:
    antibiotics_classes = json.load(f)

# Columns antibiotic_1 ... antibiotic_4 each get a matching '<column>_class' column
for i in range(1, 5):
    col = f"antibiotic_{i}"
    vae_data_main[col + '_class'] = vae_data_main[col].replace(antibiotics_classes)

In [13]:
# Add column with antibiotics binary: True where antibiotics_total is at least 1

vae_data_main['antibiotics_total_binary'] = vae_data_main.antibiotics_total.ge(1)

In [14]:
# Add respiratory HAI first date
# The column starts as None and stays None for admissions without a respiratory
# infection; later cells test it with `x if x else ...`, so the None default matters.

vae_data_main['ir_first_date'] = None

# Earliest date per admission with infection_respiratory > 0
first_resp_dates = (
    vae_data_main.loc[vae_data_main.infection_respiratory > 0, ['ID_subid', 'date']]
    .groupby('ID_subid')
    .min()
    .reset_index()
)

for id_subid, time in first_resp_dates.values:
    vae_data_main.loc[vae_data_main.ID_subid == id_subid, 'ir_first_date'] = time

In [15]:
# Add VAP flag: stateful helper used by the VAP computation

class RollingCondition():
    """Stateful callable that passes values through until the first zero.

    Each call reads the first element of the given window. Once a window whose
    first element equals 0 has been seen, that call and every later call return 0.
    """

    def __init__(self):
        # Becomes True when a zero is first seen and is never reset
        self.flag = False

    def __call__(self, x):
        """Return x.iloc[0], or 0 once a zero has been encountered."""
        head = x.iloc[0]
        if head == 0:
            self.flag = True

        return 0 if self.flag else head

In [16]:
# Add VAP flag
# For every admission with at least one row where infection_respiratory == 1 and
# mech_vent == 1:
#   1. find "starting" rows: the 3-row rolling sum of mech_vent equals 3 and the
#      infection_respiratory window reads 0, 0, 1;
#   2. from each starting date onwards, pass infection_respiratory through a fresh
#      RollingCondition (values are kept until the first 0, then forced to 0);
#   3. sum the per-start series row-wise.

meta_res = {}
for uid in vae_data_main.loc[(vae_data_main.infection_respiratory == 1) & (vae_data_main.mech_vent == 1)].ID_subid.unique():
    # Finding starting dates
    df = vae_data_main.loc[vae_data_main.ID_subid == uid, ['mech_vent', 'infection_respiratory', 'date', 'ID_subid']]
    starting_dates = df.loc[(df.rolling(3).mech_vent.sum() == 3.) & \
                            (df.rolling(3).infection_respiratory.agg(lambda x: x.iloc[0] == 0 and x.iloc[1] == 0 and x.iloc[2] == 1)),
                            'date']
    
    if len(starting_dates) > 0:
        # Named `episodes` so the `res` table built in the tracheostomy cell is not overwritten
        episodes = {}
        for i, date in enumerate(starting_dates):
            episodes[i] = df[df.date >= date].infection_respiratory.rolling(1).agg(RollingCondition())
            
        meta_res[uid] = pd.DataFrame.from_dict(episodes).sum(axis=1)

# Prepare column: combine all admissions into one series aligned on the row index
vap_result = pd.DataFrame.from_dict(meta_res).sum(axis=1)
vap_result = pd.DataFrame(vap_result, columns=['vap_result'])

# Writing column to main data (rows without a VAP result keep 0)
vae_data_main['vap'] = 0.
vae_data_main['vap'] = pd.DataFrame(vae_data_main.vap.copy()).join(vap_result).sum(axis=1)

In [17]:
# Add flag for rows with a respiratory infection that is not flagged as VAP

is_non_vap_resp = (vae_data_main.vap == 0) & (vae_data_main.infection_respiratory > 0)
non_vap_index = vae_data_main.loc[is_non_vap_resp].index
vae_data_main['non_vap_resp_hai'] = vae_data_main.index.isin(non_vap_index)


# 5. Create new columns with "days with factors before HARTI"

In [18]:
# WE CALCULATE VALUES STARTING FROM THE FIRST DAY OF SURVEILLANCE HERE

# Columns with binary values
# Calculate the number of days with factor before the onset of respiratory HAI
# if no respiratory HAI - total number of days with factor

cols = [
    'mech_vent', 'central_line', 'feeding_tube', 'arterial_line', 'antibiotics_total_binary',
    'evd', 'icpm', 'urinary_catheter', 'csfl_ne', 'csfl_ss', 'hypothermia',
    'hemodialysis', 'total_parenteral_feeding',
    'intestinal_dysfunction', 'convulsions', 'sedation',
    'anxiolytics', 'aphasia', 'mutism', 'vasopressors', 'infection_cns', 
    'infection_bloodstream', 'infection_urinary', 'infection_ssi', 'infection_other']

cols += [f"endotracheal_tube_{float(i)}" for i in range(4)]

# The row filter does not depend on the column, so build it once.
# Rows qualify when dated before the admission's first respiratory HAI; admissions
# without one (falsy ir_first_date) are compared against the sentinel 2020-01.
# NOTE(review): rows dated 2020-01 or later of non-infected admissions are excluded
# by this sentinel -- confirm the data never reaches that date.
cutoff_dates = vae_data_main.ir_first_date.apply(
    lambda x: x if x else np.datetime64('2020-01'))
before_infection = vae_data_main.date < cutoff_dates

for col in cols:
    n_days = vae_data_main.loc[before_infection, [col, 'ID_subid']].groupby('ID_subid').sum()

    # To dict
    n_days_dict = n_days.to_dict()[col]
    
    # Write results (admissions with no qualifying rows get NaN)
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)

In [19]:
# Columns with length of stay
# Max value before the onset of respiratory HAI
# (admissions without one are compared against the sentinel date 2020-01)

cols = ['day_in_icu']

for col in cols:
    cutoff_dates = vae_data_main.ir_first_date.apply(
        lambda x: x if x else np.datetime64('2020-01'))
    rows_before = vae_data_main.loc[vae_data_main.date < cutoff_dates, [col, 'ID_subid']]
    n_days = rows_before.groupby('ID_subid').max()

    # To dict
    n_days_dict = n_days.to_dict()[col]

    # Write results
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)

In [20]:
# WE CALCULATE VALUES STARTING FROM THE FIRST DAY OF SURVEILLANCE HERE

# Columns with length of surgeries
# Total length before the onset of HARTI
# If no HARTI, total length of surgeries by type during the ICU admission

cols = [
    'st_craniotomy_len', 'st_device_len', 'st_endonasal_len',
    'st_endovascular_len', 'st_other_len', 'st_spinal_len',
]

def agg_func(x):
    """Largest per-row NaN-ignoring list sum within a group.

    Rows holding a list contribute np.nansum of that list; any other row
    contributes -1, so a group without a single list yields -1.
    """
    per_row_totals = x.apply(lambda l: np.nansum(l) if isinstance(l, list) else -1)
    return max(per_row_totals)

# The row filter is identical for every column, so build it once: rows dated before the
# admission's first respiratory HAI (sentinel 2020-01 when ir_first_date is falsy)
cutoff_dates = vae_data_main.ir_first_date.apply(
    lambda x: x if x else np.datetime64('2020-01'))
before_infection = vae_data_main.date < cutoff_dates

for col in cols:
    n_days = vae_data_main.loc[before_infection, [col, 'ID_subid']].groupby('ID_subid').agg(agg_func)

    # To dict
    n_days_dict = n_days.to_dict()[col]
    
    # Write results (admissions with no qualifying rows get NaN)
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)

In [21]:
# Add column with sum of length of all surgeries

len_cols = ('st_craniotomy_len', 'st_device_len', 'st_endonasal_len',
           'st_endovascular_len', 'st_other_len', 'st_spinal_len')

vae_data_main['st_all_len'] = vae_data_main.loc[:, list(len_cols)].sum(axis=1)

# Before infection
len_cols_bid = ('st_craniotomy_len_bid', 'st_device_len_bid', 'st_endonasal_len_bid',
           'st_endovascular_len_bid', 'st_other_len_bid', 'st_spinal_len_bid')

# Sum the '_bid' columns here; summing `len_cols` again would just repeat st_all_len
vae_data_main['st_all_len_bid'] = vae_data_main.loc[:, list(len_cols_bid)].sum(axis=1)

In [22]:
# Add columns with total number of surgeries (by type) before infection

cols = [
    'st_craniotomy', 'st_device', 'st_endonasal',
    'st_endovascular', 'st_other', 'st_spinal',
]

def func(x):
    """Maximum over a group, with list entries replaced by their length."""
    sizes = x.apply(lambda k: len(k) if isinstance(k, list) else k)
    return sizes.max()
    
# The row filter is identical for every column, so build it once: rows dated before the
# admission's first respiratory HAI (sentinel 2020-01 when ir_first_date is falsy)
cutoff_dates = vae_data_main.ir_first_date.apply(
    lambda x: x if x else np.datetime64('2020-01'))
before_infection = vae_data_main.date < cutoff_dates

for col in cols:
    n_days = vae_data_main.loc[before_infection, [col, 'ID_subid']].groupby('ID_subid').agg(func)

    # To dict
    n_days_dict = n_days.to_dict()[col]
    
    # Write results
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_dict)
    
    
# Calculate total number (sum) of all surgeries before infection.
# The columns written above are named '<surgery>_bid'. A filter that requires both
# 'count' and 'bid' in the name matches none of them, so they are listed explicitly.
operations = [surgery_col + '_bid' for surgery_col in cols]

col_name = 'st_all_sum_bid'
vae_data_main.loc[:,col_name] = vae_data_main.loc[:, operations].sum(axis=1)

____

In [23]:
# WE CALCULATE VALUES STARTING FROM THE FIRST DAY OF SURVEILLANCE HERE

# Columns with numeric values
# Median for the five days before the onset of HARTI
# if HARTI from day 1, fill with first day value
# if No HARTI, median of all observed values

def median_last_five_values(x):
    """Median of the trailing five entries of `x` (all entries if there are fewer)."""
    trailing = x[-5:]
    return trailing.median()

cols = ['consciousness', 'rass', 'pbss', 'charlson', 'antibiotics_total']

# Both row filters are independent of the column, so build them once.
# Infected admissions: rows dated before the first respiratory HAI. The 2000-01
# sentinel makes the comparison False for admissions without one.
infected_cutoff = vae_data_main.ir_first_date.apply(
    lambda x: x if x else np.datetime64('2000-01'))
rows_before_infection = vae_data_main.date < infected_cutoff

# Non-infected admissions: rows before the 2020-01 sentinel; infected admissions are
# switched off with the 2000-01 sentinel.
not_infected_cutoff = vae_data_main.ir_first_date.apply(
    lambda x: np.datetime64('2020-01') if (not x) else np.datetime64('2000-01'))
rows_not_infected = vae_data_main.date < not_infected_cutoff

for col in cols:
    # infected: median of the last five values before infection
    n_days_inf = vae_data_main.loc[
        rows_before_infection, [col, 'ID_subid']].groupby('ID_subid').agg(median_last_five_values)

    # To dict
    n_days_inf_dict = n_days_inf.to_dict()[col]
    
    # not infected: median of all selected values
    n_days_not_inf = vae_data_main.loc[
        rows_not_infected, [col, 'ID_subid']].groupby('ID_subid').median()
    
    n_days_not_inf_dict = n_days_not_inf.to_dict()[col]
    
    # Merge the two dicts; the assert checks that no admission appears in both
    inf_dict_len = len(n_days_inf_dict)
    not_inf_dict_len = len(n_days_not_inf_dict)
    
    n_days_inf_dict.update(n_days_not_inf_dict)
    
    assert len(n_days_inf_dict) == inf_dict_len + not_inf_dict_len
    
    # Write results. Admissions with no rows before their first respiratory HAI are in
    # neither dict and therefore get NaN here.
    vae_data_main[col + '_bid'] = vae_data_main.ID_subid.map(n_days_inf_dict)

In [24]:
# Fillna: replace missing values with 0 in every column whose name contains '_bid'
bid_cols = vae_data_main.columns[vae_data_main.columns.str.contains('_bid')]
bid_filled = vae_data_main[bid_cols].fillna(0)
vae_data_main[bid_cols] = bid_filled

# Sanity check: no missing value may remain in any '_bid' column
any_missing_left = vae_data_main[bid_cols].isna().max().values.max()
assert not any_missing_left

In [25]:
# Make dummies: one-hot encode the categorical columns listed below
categorical_cols = ['outcome', 'gender', 'disease_type']
vae_data_main = pd.get_dummies(vae_data_main, columns=categorical_cols)

In [26]:
# ADD: year column
vae_data_main['year'] = vae_data_main.date.dt.year

# ADD: yearmonth col (two-digit year followed by two-digit month)
vae_data_main['yearmonth'] = vae_data_main.date.dt.strftime("%y%m")

# ADD: halfyear col -- 1-based index of the calendar half-year counted from January 2011.
# The month is taken 0-based (month - 1) so that January-June share one bucket and
# July-December the next; using the 1-based month would push June and December one
# bucket too far.
months_since_2011 = (vae_data_main.date.dt.year - 2011) * 12 + (vae_data_main.date.dt.month - 1)
halfyear = months_since_2011 // 6
vae_data_main['halfyear'] = halfyear + 1

# ADD first day in the ICU: first recorded date minus (day_in_icu - 1) days, per admission
date_series = vae_data_main[['ID_subid', 'date', 'day_in_icu']].groupby('ID_subid').first()
date_series = (date_series.date - date_series.day_in_icu.apply(lambda x: np.timedelta64(int(x)-1, 'D')))
date_dict = date_series.to_dict()
vae_data_main['first_day_in_icu'] = vae_data_main.ID_subid.map(date_dict)


# 6. Create aggregated factors

In [27]:
# Summary length of surgeries
cols = [
    'st_craniotomy_len', 'st_device_len', 'st_endonasal_len',
    'st_endovascular_len', 'st_other_len', 'st_spinal_len',
]

def agg_func(x):
    """Return the largest NaN-ignoring list sum found in `x`.

    Non-list entries count as -1, which is also the result when `x`
    contains no list at all.
    """
    def row_total(l):
        if isinstance(l, list):
            return np.nansum(l)
        return -1

    return max(x.apply(row_total))

# Per admission, aggregate every surgery-length column over all rows and write the
# result back to each row as '<column>_sum'
for col in cols:
    n_days = vae_data_main[[col, 'ID_subid']].groupby('ID_subid').agg(agg_func)

    # ID_subid -> aggregated value
    n_days_dict = n_days[col].to_dict()

    # Write results
    vae_data_main[col + '_sum'] = vae_data_main.ID_subid.map(n_days_dict)

In [28]:
# Days with factor: per-admission sum of each binary column, written back as '<column>_d'
cols = [
    'mech_vent', 'central_line', 'feeding_tube', 'arterial_line', 'antibiotics_total_binary',
    'evd', 'icpm', 'urinary_catheter', 'csfl_ne', 'csfl_ss', 'hypothermia',
    'hemodialysis', 'total_parenteral_feeding',
    'intestinal_dysfunction', 'convulsions', 'sedation',
    'anxiolytics', 'aphasia', 'mutism', 'vasopressors', 'infection_cns',
    'infection_bloodstream', 'infection_urinary', 'infection_ssi', 'infection_other',
]

# Dummy columns endotracheal_tube_0.0 ... endotracheal_tube_3.0
cols += [f"endotracheal_tube_{float(i)}" for i in range(4)]

for col in cols:
    n_days = vae_data_main[[col, 'ID_subid']].groupby('ID_subid').sum()

    # ID_subid -> number of days with the factor
    n_days_dict = n_days[col].to_dict()

    # Write results
    vae_data_main[col + '_d'] = vae_data_main.ID_subid.map(n_days_dict)

In [29]:
# Add groups annotation column
# Key: four 0/1 digits built from the first four values of a row, in order
# (the caller passes infection_respiratory, vap, non_vap_resp_hai, hai)
mask = {
    '0000': 'No HAI',
    '1011': 'NVA-HARTI',
    '1101': 'VA-HARTI',
    '0001': 'Other HAI',
    '1111': 'VA+NVA HARTI'
}
def map_label(row):
    """Translate a row of four flags into its group label.

    Parameters
    ----------
    row : pandas.Series or sequence
        First three values are converted with int(); the fourth with int(bool()).

    Returns
    -------
    str
        The label stored in `mask` for the resulting four-digit key.

    Raises
    ------
    KeyError
        If the flag combination has no entry in `mask`.
    """
    # Positional access through .iloc: plain integer keys on a label-indexed Series
    # are deprecated in pandas 2.x. Non-pandas sequences are indexed directly.
    values = row.iloc if hasattr(row, 'iloc') else row
    mask_ = f'{int(values[0])}{int(values[1])}{int(values[2])}{int(bool(values[3]))}'
    try:
        return mask[mask_]
    except KeyError:
        raise KeyError(f"Unexpected flag combination {mask_!r}; known keys: {sorted(mask)}") from None

# Per admission, take the maximum of each flag and translate the combination into a label
flag_cols = ['infection_respiratory', 'vap', 'non_vap_resp_hai', 'hai']
per_admission_flags = vae_data_main[['ID_subid'] + flag_cols].groupby('ID_subid').max()
groups = per_admission_flags.apply(map_label, axis=1)
groups_dict = groups.to_dict()

# Map groups by ID_subid
vae_data_main['group'] = vae_data_main.ID_subid.map(groups_dict)

# 7. Save dataset

In [31]:
# Save the processed dataset as CSV and pickle under a timestamped name
PATH = './data/'
os.makedirs(PATH, exist_ok=True)

FILENAME = 'Updated_VAE_Data_Main'
# Month-day_hour-minute; the year is not part of the file name
TIMESTAMP = datetime.datetime.now().strftime('%m%d_%H%M')

# Common path prefix shared by both output files
output_stem = os.path.join(PATH, '{}_{}'.format(FILENAME, TIMESTAMP))

# CSV
vae_data_main.to_csv(output_stem + '.csv')

# Pickle
with open(output_stem + '.pkl', 'wb') as f:
    pickle.dump(vae_data_main, f)

_________