In [10]:
# Load libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import datetime
import tqdm
import pickle
import os
import glob


# Set to True to generate the cases database:
DATABASE_CASES = False


# Load and verify data

In [2]:
# Load the newest monitoring dump as a pandas dataframe

########## Find the newest dump ###########

# Every pickle in the working directory that follows the dump naming pattern
list_of_files = glob.glob('./monitoring_all_cases_*.pkl')
if not list_of_files:
    # max() on an empty list raises an opaque "arg is an empty sequence" ValueError;
    # fail early with a message that names what is missing instead.
    raise FileNotFoundError("No file matching './monitoring_all_cases_*.pkl' was found")
# Newest file as ranked by os.path.getctime
latest_file = max(list_of_files, key=os.path.getctime)
##########

print("Loading... ",latest_file)

# NOTE: unpickling can execute arbitrary code -- only load dumps produced by this project.
with open(latest_file, "rb") as f:
    df = pickle.load(f)

print("Done")

Loading...  ./monitoring_all_cases_0918_2236.pkl
Done


## Meet exclusion criteria

In [3]:
# Drop every row that belongs to a patient listed in the exclusion file
exclude = pd.read_csv('./excluded_IDs.csv', sep=';', header=None)

# Flatten the CSV content into a 1-D array of IDs and keep only the rows whose ID_1 is not in it
excluded_ids = exclude.values.ravel()
is_excluded = df.ID_1.isin(excluded_ids)
monitoring_all_cases = df.loc[~is_excluded]

# Report how many distinct IDs there were before and after the filter
print("Original file # of unique IDs:\t", len(df.ID_1.unique()))
print("New dataframe, # of unique IDs:\t", len(monitoring_all_cases.ID_1.unique()))

# Give the unnamed leading column a meaningful name; .copy() detaches the result from df
print("Renaming column 'Unnamed: 0' to 'row_idx'")
column_renames = {'Unnamed: 0': 'row_idx'}
monitoring_all_cases = monitoring_all_cases.rename(columns=column_renames).copy()


# Here we lose 36 patients

Original file # of unique IDs:	 2324
New dataframe, # of unique IDs:	 2288
Renaming column 'Unnamed: 0' to 'row_idx'


#### Remove outlier

In [4]:
# Work on an independent copy under a short name
d = monitoring_all_cases.copy()

# Remove the patient who spent more than 1500 days in the ICU;
# that single stay is recorded under two IDs, so both are dropped.
OUTLIER_IDS = ['1420/16','6009/12']
d = d.loc[~d.ID_1.isin(OUTLIER_IDS)]

## Meet diagnostic criteria for HAVM

#### Keep verified meningitis cases only

In [5]:
verified_cases = pd.read_csv('./verified_meningitis_IDs.csv', sep=';', header=None)

# Unique IDs exactly as listed in the verification file, BEFORE matching them against
# the dataset. Counting after the match (as was done previously) makes the first number
# always equal to the second, hiding verified IDs that are absent from the data.
verified_ids = verified_cases[0].unique()

# Keep only the verified IDs that actually occur in our dataset
infected = pd.DataFrame(verified_ids)
infected = infected[infected[0].isin(d.ID_1.unique())]

print("Infected in verified_cases:\t", verified_ids.shape[0])
print("Infected found in our dataset:\t", np.intersect1d(d.ID_1.unique(), infected[0].unique()).shape[0])

# Every other ID is treated as not infected: reset its infection_CNS flag to 0
# (this turns 1 into 0 for patients whose infection was not verified)
not_infected = d.loc[~d.ID_1.isin(infected.values.reshape(-1)), 'ID_1'].unique()
d.loc[~d.ID_1.isin(infected[0]),'infection_CNS'] = 0

print("Not infected, # of IDs:\t\t", not_infected.shape[0])
print("Total\t\t\t\t", not_infected.shape[0] + infected.shape[0])


Infected in verified_cases:	 216
Infected found in our dataset:	 216
Not infected, # of IDs:		 2070
Total				 2286


_______

# Generate new features for cases / patients dataset

### Define functions

In [11]:
def return_infection_threshold(d):
    """
    Return the date that separates "before infection" from the rest for one patient.

    d -- dataframe with the rows of exactly one ID_1; needs 'infection_CNS' and 'date' columns

    Returns the smallest 'date' among the rows with infection_CNS == 1.
    If there is no such row, returns the day after the largest 'date' in d,
    formatted as a "YYYY-MM-DD" string, so that every row of d lies strictly before it.

    Raises AssertionError when d does not contain exactly one distinct ID_1.
    """
    assert d.ID_1.unique().shape[0] == 1
    infected_rows = d.loc[d.infection_CNS == 1]
    if not infected_rows.empty:
        return infected_rows['date'].min()
    # Use the frame that was passed in. The previous code read the notebook-global X here,
    # which raised NameError when X was not defined yet and otherwise ignored the argument.
    last_day = d.date.astype('datetime64[ns]').max()
    return (last_day + np.timedelta64(1, 'D')).strftime("%Y-%m-%d")

In [12]:
def feature_generator(X, col_name, param_name, agg_func='sum', only_before_infection=False):
    """
    Aggregate one column per patient and write the result to every row of that patient.

    X -- dataframe with data; needs 'ID_1', plus 'date' when only_before_infection is True
         and 'day_in_ICU' when agg_func is 'first_time'. X is modified in place and returned.
    col_name -- column where to store the result
    param_name -- column to aggregate
    agg_func -- one of
        'sum', 'mean', 'max', 'min' -- pandas reduction of that name over the selected rows
        'first'      -- first selected value of param_name (NaN when no row is selected)
        'first_time' -- 'day_in_ICU' of the first selected row where param_name == 1;
                        patients without such a row get no value assigned
    only_before_infection -- when True, only rows whose 'date' is strictly smaller than
        return_infection_threshold() of the patient are selected

    Raises ValueError for an unsupported agg_func.
    """
    supported = ('sum', 'mean', 'max', 'min', 'first', 'first_time')
    if agg_func not in supported:
        # The old message listed only three of the six supported modes
        raise ValueError("use one of: " + ", ".join(repr(name) for name in supported))

    for idx in X.ID_1.unique():
        # Compute the patient mask once per iteration instead of once per expression
        patient_rows = X.ID_1 == idx
        selected = patient_rows
        if only_before_infection:
            upper_bound = return_infection_threshold(X.loc[patient_rows])
            selected = patient_rows & (X.date < upper_bound)

        if agg_func == 'first_time':
            days = X.loc[selected & (X[param_name] == 1), 'day_in_ICU']
            if not days.empty:
                # Assign the scalar itself; a stray trailing comma used to wrap it in a 1-tuple
                X.loc[patient_rows, col_name] = days.iloc[0]
        elif agg_func == 'first':
            values = X.loc[selected, param_name]
            # An empty selection (e.g. infection on the first recorded day) used to raise IndexError
            X.loc[patient_rows, col_name] = values.iloc[0] if not values.empty else np.nan
        else:
            # 'sum' / 'mean' / 'max' / 'min' map directly onto the Series method of that name
            X.loc[patient_rows, col_name] = getattr(X.loc[selected, param_name], agg_func)()
    return X

__Function to generate cases from dataset__

In [13]:
def restore_dateindex(d):
    '''
    Re-create the missing calendar days of one stay and renumber 'day_in_ICU'.

    d -- pd.DataFrame() with a datetime-like 'date' column and a 'day_in_ICU' column

    The returned frame is indexed by 'date', one row per day starting at the earliest
    date and spanning (max day_in_ICU - min day_in_ICU) days, both ends included.
    Rows for days that were missing are forward-filled from the previous available day,
    and 'day_in_ICU' is rewritten as a consecutive sequence starting at its minimum.
    '''
    d = d.sort_values('day_in_ICU')
    date_min = d.date.min()
    span_days = int(d.day_in_ICU.max() - d.day_in_ICU.min())

    # periods = span + 1 so the last day is part of the index. The earlier
    # np.arange(start, start + span) was half-open and silently dropped the final day.
    # name='date' keeps the index name that set_index('date') establishes.
    restored_index = pd.date_range(start=date_min, periods=span_days + 1, freq='D', name='date')

    d = d.set_index('date')
    d = d.reindex(restored_index)
    # .ffill() replaces the deprecated fillna(method='ffill') spelling
    d = d.ffill()

    first_day = d.day_in_ICU.min()
    d['day_in_ICU'] = np.arange(first_day, first_day + d.shape[0])

    return d
    
def split_dataframe(d):
    """
    Cut one patient's rows into separate cases.

    The rows are ordered by 'date' and renumbered 0..n-1 (the previous index is kept
    as a column by reset_index). A new case starts at every row whose 'day_in_ICU'
    does not increase compared with the row before it.

    Returns a list of dataframes, one per case, in chronological order.
    """
    ordered = d.sort_values('date').reset_index()

    # Row labels at which the day counter stalls or restarts
    restarts = list(ordered[ordered.day_in_ICU.diff() <= 0].index)
    if not restarts:
        return [ordered]

    # .loc slices include both ends, so each case stops one row before the next restart;
    # None leaves the first case open on the left and the last one open on the right.
    starts = [None] + restarts
    stops = [label - 1 for label in restarts] + [None]
    return [ordered.loc[first:last, :].copy() for first, last in zip(starts, stops)]

In [14]:
# progress_bar
class progress_bar:
    """Minimal stopwatch: the first pprint() marks the start, later calls print the elapsed time."""

    def __init__(self):
        # 0 means "not started"; replaced by a datetime on the first pprint()
        self.status = 0

    def reset(self):
        """Forget the start time so the next pprint() starts a new measurement."""
        self.status = 0

    def pprint(self):
        """Print the current time on the first call, the time since that call afterwards."""
        if self.status != 0:
            print(datetime.datetime.now() - self.status)
            return
        print(datetime.datetime.now())
        self.status = datetime.datetime.now()
            


___

## Compose patients dataset with new features

___________

In [11]:
# ATTENTION
# Runs only when DATABASE_CASES is True: ID_1 is replaced by ID_1 + subid.
# NOTE(review): presumably this makes the identifier unique per case instead of per
# patient -- confirm. X is assigned (X = d.copy()) only in a later cell of this notebook,
# so on a fresh kernel with DATABASE_CASES = True this line would hit an undefined X;
# verify the intended execution order.
if DATABASE_CASES:
    X.ID_1 = X.ID_1 + X.subid

___

### Generate and attach new features

In [10]:
X = d.copy()

pb = progress_bar()
pb.pprint()


###################################### CNS infection
# ----------------------------------------------------------------------------------
# Length of CNS infection: last date flagged infection_CNS == 1 minus the recorded
# 'infection_CNS_date' of the patient. Only patients with at least one flagged row get a value.
infected_ids = X.loc[X.infection_CNS == 1, 'ID_1'].unique()
for id_ in infected_ids:
    is_patient = X.ID_1 == id_
    patient_rows = X.loc[is_patient, :]
    date_end = patient_rows.loc[patient_rows.infection_CNS == 1, 'date'].max()
    date_start = patient_rows['infection_CNS_date'].iloc[0]
    X.loc[is_patient, 'infection_CNS_length'] = date_end - date_start

# ----------------------------------------------------------------------------------
# Number of rows (days) with infection_CNS == 1 per patient; 0 when there are none
X = feature_generator(X, 'infection_CNS_days', 'infection_CNS', agg_func='sum')

# ----------------------------------------------------------------------------------
# day_in_ICU of the first row flagged as CNS infection
# (feature_generator assigns nothing for patients without such a row)
X = feature_generator(X, 'infection_CNS_1st_day', 'infection_CNS', agg_func='first_time')

print('CNS infection')
pb.pprint()

###################################################### DEVICES
########### 1 EVD
# ----------------------------------------------------------------------------------
# Number of days with an EVD in place
X = feature_generator(X, 'EVD_days', 'EVD')

# ----------------------------------------------------------------------------------
# EVD-days before infection
X = feature_generator(X, 'EVD_days_bi', 'EVD', only_before_infection=True)

# ----------------------------------------------------------------------------------
# EVD before infection (binary).
# EVD_bi == 1 for everyone who had an EVD before the first infection_CNS == 1 row
# (this includes patients whose infection_CNS is always 0).
X.loc[:,'EVD_bi'] = X.loc[:,'EVD_days_bi'].apply(lambda days: int(days > 0))

# ----------------------------------------------------------------------------------
# day_in_ICU of the first row with EVD == 1, for patients that have at least one such row
COL_NAME = 'EVD_1st_day'
evd_ids = X.loc[X.EVD == 1,'ID_1'].unique()
for id_ in evd_ids:
    is_patient = X.ID_1 == id_
    evd_days = X.loc[is_patient & (X.EVD == 1), 'day_in_ICU']
    X.loc[is_patient, COL_NAME] = evd_days.iloc[0]

print('1')
pb.pprint()

    
########## 2 ICP monitor
icp = 'ICP_monitor'

# ----------------------------------------------------------------------------------
# Number of days with an ICP monitor in place
X = feature_generator(X, icp + '_days', icp)

# ----------------------------------------------------------------------------------
# ICP-monitor days before infection
X = feature_generator(X, icp + '_days_bi', icp, only_before_infection=True)

# ----------------------------------------------------------------------------------
# ICP monitor before infection (binary): 1 when there is at least one such day
X.loc[:, icp + '_bi'] = X.loc[:, icp + '_days_bi'].apply(lambda days: int(days > 0))

# ----------------------------------------------------------------------------------
# day_in_ICU of the first row with an ICP monitor
# (feature_generator assigns nothing for patients that never had one)
X = feature_generator(X, icp + '_1st_day', icp, agg_func='first_time')

print('2')
pb.pprint()


########### 3 Intubation tube
# ----------------------------------------------------------------------------------
# Number of days with any tube: intubation_tube > 0 is collapsed to a 0/1 flag first
X['intubation_tube_binary'] = X.intubation_tube.apply(lambda x: 1 if x > 0 else 0)
X = feature_generator(X, 'intubation_tube_days', 'intubation_tube_binary')

# ----------------------------------------------------------------------------------
# Intubation tube-days before infection
X = feature_generator(X, 'intubation_tube_days_bi', 'intubation_tube_binary', only_before_infection=True)

# ----------------------------------------------------------------------------------
# Intubation tube before infection (binary)
X.loc[:,'intubation_tube_bi'] = X.loc[:,'intubation_tube_days_bi'].apply(lambda x: 1 if x > 0 else 0)

# ----------------------------------------------------------------------------------
# Most frequent "intubation_tube" code per patient: over the whole stay, then before infection.
# Per pass: one-hot encode the codes, count the days per code with feature_generator,
# take the code with the highest count, then drop the helper columns again.
tube_codes = [float(code) for code in range(4)]            # 0.0 .. 3.0
tube_count_cols = [str(code) + '_' for code in tube_codes]  # '0.0_' .. '3.0_'

for target_col, before_infection in [('most_frequent_intubation_tube', False),
                                     ('most_frequent_intubation_tube_bi', True)]:
    tube_dummies = pd.get_dummies(X.intubation_tube)
    X[tube_dummies.columns.values] = tube_dummies
    for code, count_col in zip(tube_codes, tube_count_cols):
        X = feature_generator(X, count_col, code, only_before_infection=before_infection)

    # .values instead of .as_matrix(): as_matrix() was deprecated and removed in pandas 1.0.
    # NOTE(review): the codes start at 0.0, so argmax + 1 stores 1..4 rather than the code
    # itself (0..3). The offset is kept unchanged here -- confirm whether it is intended.
    X[target_col] = np.argmax(X.loc[:, tube_count_cols].values, axis=1) + 1

    X = X.drop(tube_count_cols + tube_codes, axis=1)

print('3')
pb.pprint()


########### 4 Central line / 5 Arterial line / 6 Urinary catheter / 7 Feeding tube
########### 8 Pleural drainage / 9 Mechanical ventilation
# Each entry: (label printed when the step is done, source column, whether a '<column>_1st_day'
# feature is generated). Pleural drainage is the only one without a first-day feature.
device_steps = [
    ('4', 'central_line', True),
    ('5', 'invasive_BP', True),            # arterial line
    ('6', 'urinary_catheter', True),
    ('7', 'feeding_tube', True),
    ('8', 'pleural_drainage', False),
    ('9', 'mechanical_ventilation', True),
]

for step_label, device, with_first_day in device_steps:
    # ------------------------------------------------------------------------------
    # '<device>_days': number of days with the device
    X = feature_generator(X, device + '_days', device)

    # ------------------------------------------------------------------------------
    # '<device>_days_bi': device-days before infection
    X = feature_generator(X, device + '_days_bi', device, only_before_infection=True)

    # ------------------------------------------------------------------------------
    # '<device>_bi': device before infection (binary)
    X.loc[:, device + '_bi'] = X.loc[:, device + '_days_bi'].apply(lambda days: int(days > 0))

    # ------------------------------------------------------------------------------
    # '<device>_1st_day': day_in_ICU of the first row with the device
    # (feature_generator assigns nothing for patients that never had it)
    if with_first_day:
        X = feature_generator(X, device + '_1st_day', device, agg_func='first_time')

    print(step_label)
    pb.pprint()


########### 10 Other interventions
# ----------------------------------------------------------------------------------
# For each column below: days before infection ('<col>_days_bi') and its binary flag ('<col>_bi').
# The trailing space in 'anxiolytics ' is part of the column name.
cols_to_add = ['hypothermia', 'hemodialysis', 'total_parenteral_feeding',
               'purulent_sputum', 'X-ray_infiltration', 'intestinal_dysfunction',
               'convulsions_day', 'sedation', 'anxiolytics ', 'vasopressors']

for col in tqdm.tqdm(cols_to_add):
    days_col = col + '_days_bi'
    X = feature_generator(X, days_col, col, only_before_infection=True)
    X.loc[:, col + '_bi'] = X.loc[:, days_col].apply(lambda days: int(days > 0))

# ----------------------------------------------------------------------------------
# day_in_ICU of the first row with intestinal dysfunction
X = feature_generator(X, 'intestinal_dysfunction_1st_day', 'intestinal_dysfunction', agg_func='first_time')


print("Finished in..")
pb.pprint()


# About 30 min, 59 new features

2017-09-18 22:52:09.851826
CNS infection
0:00:47.536380
1
0:02:11.883244
2
0:03:44.357303
3
0:09:55.986182
4
0:11:59.225562
5
0:13:40.194631
6
0:15:37.465810
7
0:17:37.972152
8
0:19:01.167240


  0%|          | 0/10 [00:00<?, ?it/s]

9
0:20:59.007729


100%|██████████| 10/10 [08:20<00:00, 49.64s/it]

Finished in..
0:29:19.290031





In [11]:
# Checkpoint: persist the dataframe built so far so the slow cell above need not be re-run

SAVE_DUMP = True

if SAVE_DUMP:
    with open("X.pkl", 'wb') as f:
        f.write(pickle.dumps(X))

In [12]:
# Define info function
def print_info(d):
    """Print the row count, column count, number of distinct ID_1 values and column names of d."""
    n_rows, n_cols = d.shape
    n_ids = len(d.ID_1.unique())
    print("Number of Rows:\t\t", n_rows)
    print("Number of Columns:\t", n_cols)
    print("Unique IDs:\t\t", n_ids)
    print("Columns:\t\t\n", d.columns.values)
    
print_info(X)

Number of Rows:		 40366
Number of Columns:	 137
Unique IDs:		 2286
Columns:		
 ['ID' 'ID_1' 'date' 'year' 'month' 'day_in_ICU' 'EVD' 'ICP_monitor'
 'intubation_tube' 'central_line' 'invasive_BP' 'urinary_catheter'
 'feeding_tube' 'pleural_drainage' 'mechanical_ventilation' 'FiO2'
 'hypothermia' 'hemodialysis' 'total_parenteral_feeding'
 'tracheal_sanations' 'purulent_sputum' 'X-ray_infiltration'
 'intestinal_dysfunction' 'conscious_level' 'RASS' 'convulsions_day' 'PBSS'
 'CSF_leakage' 'wound_liquorrhea' 'sedation' 'anxiolytics ' 'vasopressors'
 'antibiotic_1' 'antibiotic_2' 'antibiotic_3' 'antibiotic_4'
 'infection_CNS' 'infection_respiratory' 'infection_bloodstream'
 'infection_urinary' 'infection_wound' 'infection_other' 'CHARLSON'
 'antibiotics_total' 'temperature' 'cases' 'day_in_ICU_max'
 'days_in_hospital' 'disease_id' 'disease_name' 'disease_type' 'gender'
 'infection_CNS_date' 'ot_craniotomy' 'ot_craniotomy_len'
 'ot_craniotomy_name' 'ot_device' 'ot_device_len' 'ot_device_name'

___________________

In [13]:
# Restore the checkpoint written to "X.pkl" (skip by setting LOAD_DUMP to False)
LOAD_DUMP = True
if LOAD_DUMP:
    with open("X.pkl", "rb") as f:
        bin_data = f.read()
    X = pickle.loads(bin_data)

##################################### MEASUREMENTS
pb.pprint()

########### 1 FiO2 / 2 Tracheal sanations
# Each entry: (label printed when done, name of the mean feature, source column).
# Two features per entry: the mean over all rows of the patient, and '<name>_bi',
# the mean over the rows before infection.
mean_steps = [
    ('1', 'mean_FiO2', 'FiO2'),
    ('2', 'mean_tracheal_sanations_per_day', 'tracheal_sanations'),
]

for step_label, mean_col, source_col in mean_steps:
    # ------------------------------------------------------------------------------
    # Mean over the whole stay
    X = feature_generator(X, mean_col, source_col, agg_func='mean')

    # ------------------------------------------------------------------------------
    # Mean before infection
    X = feature_generator(X, mean_col + '_bi', source_col,
                          agg_func='mean', only_before_infection=True)
    print(step_label)
    pb.pprint()


########## 3 Fever
# ----------------------------------------------------------------------------------
# Daily fever flag: 1 when the temperature is above 37.5C, otherwise 0
X['fever'] = X.temperature.apply(lambda t: 1 if t > 37.5 else 0)

# ----------------------------------------------------------------------------------
# Number of days with fever
X = feature_generator(X, 'fever_days', 'fever')

# ----------------------------------------------------------------------------------
# Fever-days before infection
X = feature_generator(X, 'fever_days_bi', 'fever', only_before_infection=True)

# ----------------------------------------------------------------------------------
# Fever before infection (binary)
X.loc[:,'fever_bi'] = X.loc[:,'fever_days_bi'].apply(lambda days: int(days > 0))

# ----------------------------------------------------------------------------------
# Maximum body temperature over the whole stay
X = feature_generator(X, 'max_temperature', 'temperature', agg_func='max')

# ----------------------------------------------------------------------------------
# Maximum body temperature before infection.
# Here and below: the "_bi" value is np.NaN when a patient has no row before the
# infection threshold, i.e. the infection is flagged from the first recorded day.
X = feature_generator(X, 'max_temperature_bi', 'temperature',
                      agg_func='max', only_before_infection=True)
print('3')
pb.pprint()

######## 4 Conscious level
# ----------------------------------------------------------------------------------
# Minimum conscious level. Min out of all days in the ICU
X = feature_generator(X, 'min_conscious_level', 'conscious_level', agg_func='min')

# ----------------------------------------------------------------------------------
# Minimum conscious level before infection
X = feature_generator(X, 'min_conscious_level_bi', 'conscious_level', agg_func='min',
                      only_before_infection=True)

# ----------------------------------------------------------------------------------
# Most frequent "conscious_level" per patient: over the whole stay, then before infection.
# Per pass: one-hot encode the levels, count the days per level with feature_generator,
# take the level with the highest count, then drop the helper columns again.
conscious_levels = [float(level) for level in range(1, 8)]           # 1.0 .. 7.0
level_count_cols = [str(level) + '_' for level in conscious_levels]  # '1.0_' .. '7.0_'

for target_col, before_infection in [('most_frequent_conscious_level', False),
                                     ('most_frequent_conscious_level_bi', True)]:
    level_dummies = pd.get_dummies(X.conscious_level)
    X[level_dummies.columns.values] = level_dummies
    for level, count_col in zip(conscious_levels, level_count_cols):
        X = feature_generator(X, count_col, level, only_before_infection=before_infection)

    # .values instead of .as_matrix(): as_matrix() was deprecated and removed in pandas 1.0.
    # The levels start at 1, so position + 1 is the level itself.
    X[target_col] = np.argmax(X.loc[:, level_count_cols].values, axis=1) + 1

    X = X.drop(level_count_cols + conscious_levels, axis=1)

# ----------------------------------------------------------------------------------
# Days spent at the most frequent conscious level, before infection.
# The helper column is a 0/1 flag per row: 1 when that day's level equals the patient's
# most frequent level. The earlier code assigned a whole filtered dataframe to this single
# column, which is not a valid per-row indicator.
X.loc[:,'most_frequent_conscious_level_binary'] = (
    X.conscious_level == X.most_frequent_conscious_level).astype(int)
X = feature_generator(X, 'most_frequent_conscious_level_days_bi',
                      'most_frequent_conscious_level_binary', agg_func='sum', only_before_infection=True)

print('4')
pb.pprint()


######### 5 RASS / 6 PBSS
# Each entry: (label printed when done, feature name, source column, aggregation).
# Two features per entry: the aggregate over all rows of the patient, and '<name>_bi',
# the same aggregate over the rows before infection.
score_steps = [
    ('5', 'min_RASS', 'RASS', 'min'),     # minimum RASS score
    ('6', 'mean_PBSS', 'PBSS', 'mean'),   # mean PBSS score
]

for step_label, score_col, source_col, how in score_steps:
    # ------------------------------------------------------------------------------
    # Aggregate over the whole stay
    X = feature_generator(X, score_col, source_col, agg_func=how)

    # ------------------------------------------------------------------------------
    # Aggregate before infection
    X = feature_generator(X, score_col + '_bi', source_col,
                          agg_func=how, only_before_infection=True)
    print(step_label)
    pb.pprint()


######### 7 Wound liquorrhea / 8 CSF leakage
# Each entry: (label printed when done, source column). Four features per entry.
leak_steps = [('7', 'wound_liquorrhea'), ('8', 'CSF_leakage')]

for step_label, leak in leak_steps:
    # ------------------------------------------------------------------------------
    # '<column>_days': number of days with the finding, 0 when there are none
    X = feature_generator(X, leak + '_days', leak)

    # ------------------------------------------------------------------------------
    # '<column>_days_bi': days with the finding before infection
    X = feature_generator(X, leak + '_days_bi', leak, only_before_infection=True)

    # ------------------------------------------------------------------------------
    # '<column>_bi': finding present before infection (binary)
    X.loc[:, leak + '_bi'] = X.loc[:, leak + '_days_bi'].apply(lambda days: int(days > 0))

    # ------------------------------------------------------------------------------
    # '<column>_1st_day': day_in_ICU of the first row with the finding
    # (feature_generator assigns nothing for patients that never had it)
    X = feature_generator(X, leak + '_1st_day', leak, agg_func='first_time')

    print(step_label)
    pb.pprint()


########## 9 Antibiotics
# ----------------------------------------------------------------------------------
# Per-slot sums: 'a1'..'a4' hold the per-patient sum of 'antibiotic_1'..'antibiotic_4'
for slot in range(1, 5):
    X = feature_generator(X, 'a' + str(slot), 'antibiotic_' + str(slot))

# Days on any antibiotic: antibiotics_total > 0 is collapsed to a 0/1 flag first
X.loc[:,'antibiotics_binary'] = X.antibiotics_total.apply(lambda total: int(total > 0))
X = feature_generator(X, 'antibiotics_days', 'antibiotics_binary')

# ----------------------------------------------------------------------------------
# Antibiotic-days before infection
X = feature_generator(X, 'antibiotics_days_bi', 'antibiotics_binary', only_before_infection=True)

# ----------------------------------------------------------------------------------
# Antibiotics before infection (binary)
X.loc[:,'antibiotics_bi'] = X.loc[:,'antibiotics_days_bi'].apply(lambda days: int(days > 0))

# ----------------------------------------------------------------------------------
# Mean of antibiotics_total over all rows of the patient
X = feature_generator(X, 'mean_antibiotics_per_day', 'antibiotics_total', agg_func='mean')

# ----------------------------------------------------------------------------------
# Mean of antibiotics_total before infection
X = feature_generator(X, 'mean_antibiotics_per_day_bi', 'antibiotics_total',
                      agg_func='mean', only_before_infection=True)
print('9')
pb.pprint()


######## Charlson score. Does not change over time! Use first measurement
# ----------------------------------------------------------------------------------
# CHARLSON_FIRST = first CHARLSON value recorded for the patient
X = feature_generator(X, 'CHARLSON_FIRST', 'CHARLSON', agg_func='first')

print("Finished in..")
pb.pprint()


# About 40 min, 39 new features

0:29:21.696007
1
0:30:44.665245
2
0:32:11.889400
3
0:34:57.560403
4
0:46:34.264786
5
0:47:49.023544
6
0:49:07.930785
7
0:50:48.557845
8
0:52:25.298709
9
0:56:54.376512
Finished in..
0:57:24.084779


In [14]:
# Checkpoint: pickle the current feature table so later sections can be
# restarted from "X1.pkl" instead of recomputing everything above.

SAVE_DUMP_1 = True

if SAVE_DUMP_1:
    with open("X1.pkl", mode='wb') as dump_file:
        pickle.dump(X, dump_file)

In [15]:
# Summary of X at this checkpoint; per the recorded output it prints the number of
# rows and columns, the number of unique IDs and the column names.
print_info(X)

Number of Rows:		 40366
Number of Columns:	 176
Unique IDs:		 2286
Columns:		
 ['ID' 'ID_1' 'date' 'year' 'month' 'day_in_ICU' 'EVD' 'ICP_monitor'
 'intubation_tube' 'central_line' 'invasive_BP' 'urinary_catheter'
 'feeding_tube' 'pleural_drainage' 'mechanical_ventilation' 'FiO2'
 'hypothermia' 'hemodialysis' 'total_parenteral_feeding'
 'tracheal_sanations' 'purulent_sputum' 'X-ray_infiltration'
 'intestinal_dysfunction' 'conscious_level' 'RASS' 'convulsions_day' 'PBSS'
 'CSF_leakage' 'wound_liquorrhea' 'sedation' 'anxiolytics ' 'vasopressors'
 'antibiotic_1' 'antibiotic_2' 'antibiotic_3' 'antibiotic_4'
 'infection_CNS' 'infection_respiratory' 'infection_bloodstream'
 'infection_urinary' 'infection_wound' 'infection_other' 'CHARLSON'
 'antibiotics_total' 'temperature' 'cases' 'day_in_ICU_max'
 'days_in_hospital' 'disease_id' 'disease_name' 'disease_type' 'gender'
 'infection_CNS_date' 'ot_craniotomy' 'ot_craniotomy_len'
 'ot_craniotomy_name' 'ot_device' 'ot_device_len' 'ot_device_name'

In [16]:
# Restore the feature table from the "X1.pkl" checkpoint so the infection
# features below can be rebuilt without recomputing the earlier sections.
# NOTE: pickle executes code on load -- only load checkpoints written by this notebook.
LOAD_DUMP_1 = True
if LOAD_DUMP_1:
    # The context manager closes the handle even if unpickling fails; reading
    # straight from the file also avoids keeping a second, raw-bytes copy of the
    # whole dataset alive in a global variable.
    with open("X1.pkl", "rb") as f:
        X = pickle.load(f)

########################################## INFECTIONS
# Source columns, processed in this order:
#   1 respiratory, 2 bloodstream, 3 urinary, 4 surgical site (wound), 5 other
infection_sources = [
    'infection_respiratory',
    'infection_bloodstream',
    'infection_urinary',
    'infection_wound',
    'infection_other',
]

pb.pprint()
for step, source in enumerate(infection_sources, start=1):
    # ------------------------------------------------------------------------------
    # The length of the infection, days, 0 if no such infection
    X = feature_generator(X, source + '_days', source, agg_func='sum')

    # ------------------------------------------------------------------------------
    # Infection-days before (CNS) infection
    X = feature_generator(X, source + '_days_bi', source,
                          agg_func='sum', only_before_infection=True)

    # ------------------------------------------------------------------------------
    # Infection before (CNS) infection (binary): 1 when the day count above is positive
    X.loc[:, source + '_bi'] = X.loc[:, source + '_days_bi'].apply(
        lambda days: 1 if days > 0 else 0)

    # ------------------------------------------------------------------------------
    # On which day after the ICU admission the infection started. 0 if no such infection
    X = feature_generator(X, source + '_1st_day', source, agg_func='first_time')

    # Progress marker: the section number, or the closing message after the last one
    if step < len(infection_sources):
        print(str(step))
    else:
        print("Finished in..")
    pb.pprint()


# About 10 min, 20 new features


0:57:26.542499
1
0:59:15.911650
2
1:00:46.551381
3
1:02:29.419045
4
1:04:06.023187
Finished in..
1:05:43.751607


In [17]:
# Checkpoint: pickle the feature table (now including the infection features)
# to "X2.pkl" so the surgery section can be restarted from here.

SAVE_DUMP_2 = True

if SAVE_DUMP_2:
    with open("X2.pkl", mode='wb') as dump_file:
        pickle.dump(X, dump_file)

In [18]:
# Summary of X after the infection features were added; per the recorded output it
# prints the number of rows and columns, the number of unique IDs and the column names.
print_info(X)

Number of Rows:		 40366
Number of Columns:	 196
Unique IDs:		 2286
Columns:		
 ['ID' 'ID_1' 'date' 'year' 'month' 'day_in_ICU' 'EVD' 'ICP_monitor'
 'intubation_tube' 'central_line' 'invasive_BP' 'urinary_catheter'
 'feeding_tube' 'pleural_drainage' 'mechanical_ventilation' 'FiO2'
 'hypothermia' 'hemodialysis' 'total_parenteral_feeding'
 'tracheal_sanations' 'purulent_sputum' 'X-ray_infiltration'
 'intestinal_dysfunction' 'conscious_level' 'RASS' 'convulsions_day' 'PBSS'
 'CSF_leakage' 'wound_liquorrhea' 'sedation' 'anxiolytics ' 'vasopressors'
 'antibiotic_1' 'antibiotic_2' 'antibiotic_3' 'antibiotic_4'
 'infection_CNS' 'infection_respiratory' 'infection_bloodstream'
 'infection_urinary' 'infection_wound' 'infection_other' 'CHARLSON'
 'antibiotics_total' 'temperature' 'cases' 'day_in_ICU_max'
 'days_in_hospital' 'disease_id' 'disease_name' 'disease_type' 'gender'
 'infection_CNS_date' 'ot_craniotomy' 'ot_craniotomy_len'
 'ot_craniotomy_name' 'ot_device' 'ot_device_len' 'ot_device_name'

## Features from surgeries

In [15]:
# Load pkl file from step before ("X2.pkl" checkpoint)
# NOTE: pickle executes code on load -- only load checkpoints written by this notebook.

LOAD_DUMP_2 = True
if LOAD_DUMP_2:
    # The context manager closes the handle even if unpickling fails; reading
    # straight from the file also avoids keeping a second, raw-bytes copy of the
    # whole dataset alive in a global variable.
    with open("X2.pkl", "rb") as f:
        X = pickle.load(f)


__Total number of surgeries (each type and all together)__

In [16]:
# Count surgeries
def agg_func(x):
    """Count the surgeries stored in the last row of a column.

    Returns the length of the list found in the last element of ``x``; when the
    last element is not a list (e.g. NaN for an empty cell) it is returned as is.
    Written so that it can be applied to several columns at once via ``agg``.
    """
    last_value = x.iloc[-1]
    if type(last_value) == list:
        return len(last_value)
    return last_value

# Surgery columns: every 'ot_*' column whose name contains neither 'len' nor 'name'
operations = [name for name in X.columns
              if 'ot_' in name and not ('len' in name or 'name' in name)]

# Two passes over the same source columns:
#   '_count_bi' -- before infection only (rows with infection_CNS < 1)
#   '_count'    -- all rows
count_passes = [
    ('_count_bi', X.infection_CNS < 1),
    ('_count', slice(None)),
]

for suffix, row_selector in count_passes:
    # Temporary frame: one row per ID_1 with agg_func applied to every surgery column
    per_patient = X.loc[row_selector, ['ID_1'] + operations].groupby('ID_1').agg(agg_func)

    # Write the per-patient value back onto every row of that patient;
    # patients absent from per_patient get NaN
    targets = [name + suffix for name in operations]
    for source_col, target_col in tqdm.tqdm(zip(operations, targets)):
        X.loc[:, target_col] = X.ID_1.apply(
            lambda pid: per_patient.loc[pid, source_col] if pid in per_patient.index else np.nan)

7it [00:06,  1.05it/s]
7it [00:06,  1.07it/s]


In [17]:
# Make binary from all "ot_..._count" columns, before infection and all time

# Source columns: names containing both 'ot_' and 'count'.
# Names that already end in '_binary' and the '*_sum*' totals are skipped so the
# cell is idempotent: re-running it does not create '*_binary_binary' columns or
# binarise the 'ot_all_count_sum*' totals. On a first run the selection is unchanged.
operations = [col for col in X.columns
              if 'ot_' in col
              and 'count' in col
              and not col.endswith('_binary')
              and '_sum' not in col
             ]

for col in operations:
    # 1 when the count is positive; NaN compares as False and therefore becomes 0
    X.loc[:, col + '_binary'] = (X.loc[:, col] > 0).astype('int64')

In [18]:
# Calculate total number (sum) of all surgeries

# Columns are selected by suffix, not by substring: the substring 'count_bi' also
# occurs in '*_count_binary' and '*_count_bi_binary', so a substring test would add
# the 0/1 flag columns to the before-infection total and inflate it.

# Before infection: per-type '*_count_bi' columns, without the 'ot_null' placeholder
operations = [col for col in X.columns
              if 'ot_' in col
              and col.endswith('_count_bi')
              and '_len' not in col
              and '_name' not in col
              and '_null' not in col
             ]

col_name = 'ot_all_count_sum_bi'
# Row-wise sum across surgery types (DataFrame.sum skips NaN by default)
X.loc[:,col_name] = X.loc[:, operations].sum(axis=1)


# During all time: per-type '*_count' columns, without the 'ot_null' placeholder
operations = [col for col in X.columns
              if 'ot_' in col
              and col.endswith('_count')
              and '_len' not in col
              and '_name' not in col
              and '_null' not in col
             ]

col_name = 'ot_all_count_sum'
X.loc[:,col_name] = X.loc[:, operations].sum(axis=1)

__Total length of surgeries (each type)__

In [19]:
def agg_func(x):
    """Total the surgery durations stored in the last row of a column.

    Returns the sum of the list found in the last element of ``x``; when the last
    element is not a list (e.g. NaN for an empty cell) it is returned as is.
    Written so that it can be applied to several columns at once via ``agg``.
    """
    last_value = x.iloc[-1]
    if type(last_value) == list:
        return sum(last_value)
    return last_value

# ---- Before infection (rows with infection_CNS < 1) ----
# Duration columns: 'ot_*' names containing 'len' but neither 'name' nor 'count'
operations = [name for name in X.columns
              if all(tag in name for tag in ('ot_', 'len'))
              and not any(tag in name for tag in ('name', 'count'))]

# Temporary frame: one row per ID_1 with agg_func applied to every duration column
per_patient = X.loc[X.infection_CNS < 1, ['ID_1'] + operations].groupby('ID_1').agg(agg_func)

# Write the per-patient value back onto every row of that patient;
# patients absent from per_patient get NaN
for source_col in operations:
    X.loc[:, source_col + '_sum_bi'] = X.ID_1.apply(
        lambda pid: per_patient.loc[pid, source_col] if pid in per_patient.index else np.nan)


# ---- During all time (all rows) ----
# Same selection, additionally skipping the '*_bi' columns created just above
operations = [name for name in X.columns
              if all(tag in name for tag in ('ot_', 'len'))
              and not any(tag in name for tag in ('name', 'count', '_bi'))]

# Temporary frame: one row per ID_1 with agg_func applied to every duration column
per_patient = X.loc[:, ['ID_1'] + operations].groupby('ID_1').agg(agg_func)

# Write the per-patient value back row by row, with a progress counter
targets = [name + '_sum' for name in operations]
for source_col, target_col in tqdm.tqdm(zip(operations, targets)):
    X.loc[:, target_col] = X.ID_1.apply(
        lambda pid: per_patient.loc[pid, source_col] if pid in per_patient.index else np.nan)

7it [00:07,  1.12s/it]


### Other important new features

In [20]:
# Total number of days in ICU per patient: for every ID_1, add up the largest
# day_in_ICU reached within each of that patient's subid values.

for patient_id in tqdm.tqdm(X.ID_1.unique()):
    patient_rows = X.ID_1 == patient_id
    total_days = 0
    for stay in X.loc[patient_rows, 'subid'].unique():
        stay_rows = patient_rows & (X.subid == stay)
        total_days = total_days + X.loc[stay_rows, 'day_in_ICU'].max()
    # Same total on every row of the patient
    X.loc[patient_rows, 'days_in_ICU_total'] = total_days

100%|██████████| 2286/2286 [00:48<00:00, 46.84it/s]


In [21]:
# Repeated craniotomy before infection:
# 1 when ot_craniotomy_count_bi is at least 2, otherwise 0 (NaN counts give 0)

X.loc[:, 'recraniotomy_bi'] = X.loc[:, 'ot_craniotomy_count_bi'].ge(2).astype('int64')

In [22]:
# On which day after the ICU admission intestinal dysfunction started
# ('first_time' aggregation of the 'intestinal_dysfunction' column)

X = feature_generator(X, 'intestinal_dysfunction_1st_day', 'intestinal_dysfunction', agg_func='first_time')


____

In [23]:
# Group data by ID_1 and create final dataset

def agg_func(x):
    """Collapse one column of a group to a single value.

    Columns whose name contains 'ot_' keep the value of their last row;
    every other column is reduced to its maximum.
    """
    return x.iloc[-1] if 'ot_' in x.name else x.max()

# One row per ID_1: every column is collapsed with agg_func
rows_by_id = X.groupby('ID_1')
X_prime = rows_by_id.agg(agg_func)

X_prime.shape

(2286, 242)

In [24]:
# Drop helper columns that are not needed in the final dataset:
# day_in_ICU_max and the columns derived from the 'ot_null' placeholder.
# NOTE(review): 'ot_null_count_binary' is produced by the binary-flag cell as well
# but is not listed here, so it stays in X_prime -- confirm whether that is intended.
COLS_TO_DROP = [
    'day_in_ICU_max',
    'ot_null_count',
    'ot_null_len',
    'ot_null_len_sum',
    'ot_null',
    'ot_null_name',
    'ot_null_count_bi',
    'ot_null_len_sum_bi',
    'ot_null_count_bi_binary',
]

# errors='ignore': labels missing from X_prime are skipped instead of raising
X_prime = X_prime.drop(columns=COLS_TO_DROP, errors='ignore')
X_prime.shape

(2286, 233)

In [25]:
# Sanity check: the all-time and before-infection surgery counts must not be
# identical everywhere, i.e. the set of differences contains more than just 0.

device_all_time = X_prime['ot_device_count']
device_before_infection = X_prime['ot_device_count_bi']
(device_all_time - device_before_infection).unique()

array([  0.,  nan,   4.,   3.,   2.,   1.])

### Save new monitoring & patients/cases datasets

In [26]:
# With new version every time

# Take the timestamp once so every file written by this run shares the same
# suffix. Calling datetime.now() separately for each file lets a slow write
# straddle a minute boundary and leaves the .csv and .pkl of one run with
# different names.
run_stamp = datetime.datetime.now().strftime('%m%d_%H%M')

# Row-level monitoring table with all generated features
monitoring_base = 'Monitoring_new_features_' + run_stamp
X.to_csv(monitoring_base + '.csv')
with open(monitoring_base + '.pkl', 'wb') as f:
    pickle.dump(X, f)

# Aggregated table (one row per ID_1); DATABASE_CASES only selects the file prefix
aggregated_base = ('Cases_' if DATABASE_CASES else 'Patients_') + run_stamp
X_prime.to_csv(aggregated_base + '.csv')
with open(aggregated_base + '.pkl', 'wb') as f:
    pickle.dump(X_prime, f)


___