# Required Libraries

In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
import json

# Operating System
import os

# Convenience
from tqdm import tqdm

# import pprint
import warnings
warnings.filterwarnings('ignore')

# Configuration
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)


from medicalbiasdetection import utils

# Run identifier and random seed shared by the rest of the notebook
RUN = 3
RANDOM_STATE = 0

# Read the default project configuration and the run log,
# then select the run-log entry that belongs to this run
config = utils.read_yaml()
config_preprocessing = utils.read_yaml('conf/mbd_run_log.yaml')
config_run = config_preprocessing[RUN]

# Create the directory for this run (delegated to utils.create_run_dir)
utils.create_run_dir(str(RUN))

# Resolve the run-specific logging locations from the configuration templates
LOG_DIR = config['LOG']['dir'].format(RUN=RUN)
LOG_PATH = config['LOG']['path'].format(RUN=RUN)

# Publish the log path and run id through the process environment.
# NOTE(review): presumably read by the medicalbiasdetection modules imported
# right after this block, which is why this must run first - confirm.
os.environ.update({'LOG_PATH': LOG_PATH, 'RUN': str(RUN)})

from medicalbiasdetection import process, cohort

MBD_Runs/3


## Load Reference Data

In [2]:
# Names of the columns that will be converted to datetime
time_cols = config['meta']['time_cols']

# Column -> data type mapping, stored as JSON
dtypes_path = config['data']['sep3_dtypes']
with open(dtypes_path, "r") as file:
    data_types = json.load(file)

# Lab and treatment labels, stored as JSON
lab_trt_path = config['data']['lab_treatment_dict']
with open(lab_trt_path, "r") as file:
    lab_trt_dict = json.load(file)

# Medical facility whose dataset is loaded ('grady' or 'emory')
med_fac = 'grady'

# Load the reference table for that facility; verbose=True prints a cohort summary
X = cohort.load_reference_data(
    med_fac,
    config,
    verbose=True,
)

Number of encounters (csn): 119733
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 119733
Number of unique patients: 73484
Number of sepsis=0 patients: 101269 (84.58%)
Number of sepsis=1 patients: 18464 (15.42%)


## Clean Data

In [3]:
# Work on a copy of the static dataset and remember its original size
X_proc = X.copy()
n_orig = len(X_proc)

# Exclusion criteria, applied in this order. Each entry receives the current
# frame and returns a boolean mask of the rows to keep.
row_filters = [
    # remove non-ICU patients (no recorded ICU start)
    lambda frame: frame['first_icu_start'].notna(),
    # remove patients under the age of 18
    lambda frame: frame['age'] >= 18,
    # remove csns with less than 24 hours of data
    lambda frame: frame['hoursICU'] >= 24,
    # remove csns with unknown gender (codes below 2 are kept)
    lambda frame: frame['gender'] < 2,
]

for keep_rows in row_filters:
    X_proc = X_proc[keep_rows(X_proc)]
    # print csn report after each exclusion step
    cohort.print_cohort_report(X_proc,'csn','pat_id','sepsis')

# Summarize how many encounters the exclusions removed
n_reduced = len(X_proc)
print(
    f"Original Size: {n_orig}",
    f"Reduced Size: {n_reduced}",
    f"Data Reduction: {n_orig - n_reduced}",
    sep="\n",
)

# Downstream cells read the cleaned cohort from X
X = X_proc

Number of encounters (csn): 17798
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 17798
Number of unique patients: 16178
Number of sepsis=0 patients: 10538 (59.21%)
Number of sepsis=1 patients: 7260 (40.79%)
Number of encounters (csn): 17798
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 17798
Number of unique patients: 16178
Number of sepsis=0 patients: 10538 (59.21%)
Number of sepsis=1 patients: 7260 (40.79%)
Number of encounters (csn): 15182
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15182
Number of unique patients: 13917
Number of sepsis=1 patients: 6662 (43.88%)
Number of sepsis=0 patients: 8520 (56.12%)
Number of encounters (csn): 15179
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15179
Number of unique patients: 13914
Number of sepsis=1 patients: 6662 (43.89%)
Number of

## Identify Missingness

In [4]:
run_cell = True
debug = False

if run_cell:
    # Purpose: for every encounter in X, load the patient's time-series table,
    # keep the importance-1 columns from ICU start onwards, and append the rows
    # to a single CSV that later cells use to measure missingness.

    # set path for results
    TYPE = "sep_patient_missingness"
    file_dir = config['DIR']['data'].format(RUN=RUN, TYPE=TYPE)
    filename = "sepsis_patient_data_hourly.csv"
    outpath = os.path.join(file_dir,filename)

    # make sure the destination directory exists before the first write
    if not debug and file_dir:
        os.makedirs(file_dir, exist_ok=True)

    # get pickle dtype map
    pkl_map = config['pkl_dtypes']

    # collect important column names: every label flagged importance '1',
    # plus the line and cuff blood-pressure columns
    imp1_cols = [name for name, meta in lab_trt_dict.items() if meta['importance'] == '1'] + ['sbp_line','dbp_line','map_line','sbp_cuff','dbp_cuff','map_cuff']

    # get list of years to increment for loop
    years = np.sort(X.year.unique()).tolist()

    # bookkeeping
    n = 0                 # patients processed successfully
    failed = []           # (csn, error) for every patient that was skipped
    wrote_output = False  # False until the first patient has been written
    debug_stop = 5        # number of patients to process when debug is True
    stop_early = False    # set once the debug limit has been reached

    # loop through each year
    for year in tqdm(years, leave=True):

        # convert year to int
        year = int(year)

        # filter data by year
        data = X[X.year==year]

        # loop through each patient
        for ind, row in data.iterrows():

            # patient csn
            csn = row.csn

            # first hour of icu status
            icu_start_time = row.first_icu_start

            try:
                # create path to patient ehr data
                path = config['data']['pat_pkl'].format(year=year,csn=csn)

                # ingest patient data file
                # NOTE(review): unpickling can execute arbitrary code; only
                # point this at trusted, project-generated files.
                p_pkl = pd.read_pickle(path)

                # get super_table (time-series) EMR data
                hosp_data = p_pkl["super_table"].copy()

                # assign data types to each column
                hosp_data = hosp_data.astype(pkl_map,errors='ignore')

                # fill appropriate na values
                hosp_data = hosp_data.fillna(value = config['fill_na'])

                # keep columns of importance 1 and tag every row with the csn;
                # assign() returns a new frame instead of writing into a selection
                hosp_data = hosp_data[imp1_cols].assign(csn=csn)

                # shift df to icu start time to remove information collected outside of the icu
                # (assumes the table is indexed by time - TODO confirm)
                hosp_data = hosp_data.loc[icu_start_time:]

                if not debug:
                    # The first write of this run replaces any file left by a
                    # previous run; later writes append without a header.
                    hosp_data.to_csv(
                        outpath,
                        mode='a' if wrote_output else 'w',
                        header=not wrote_output,
                    )
                    wrote_output = True

                # increment patient index counter
                n += 1

            except Exception as err:
                # best effort: skip this patient, but remember why
                failed.append((csn, repr(err)))
                continue

            if debug and n >= debug_stop:
                stop_early = True
                break

        # leave the year loop as well once the debug limit is reached
        if stop_early:
            break

    # report what happened so skipped patients do not go unnoticed
    print(f"Patients processed: {n}")
    if failed:
        print(f"Patients skipped because of errors: {len(failed)}")
        for failed_csn, reason in failed[:10]:
            print(f"  csn {failed_csn}: {reason}")


100%|██████████| 5/5 [30:30<00:00, 366.09s/it]


## Read Sepsis Patient Hourly Data

In [5]:
# Rebuild the location of the hourly table written by the missingness cell
filename = "sepsis_patient_data_hourly.csv"
TYPE = "sep_patient_missingness"
file_dir = config['DIR']['data'].format(RUN=RUN, TYPE=TYPE)
filepath = os.path.join(file_dir, filename)

# Load the table, using the first CSV column as the index
sep_df = pd.read_csv(
    filepath,
    index_col=0,
)

## Calculate Missingness

In [6]:
# One row per column of sep_df: its name ('column') and its number of missing values ('n')
missing = (
    sep_df.isna()
    .sum(axis=0)
    .to_frame(name='n')
    .reset_index()
    .rename(columns={'index': 'column'})
)

# Fraction of rows in which each column is missing
missing['perc'] = missing['n'] / len(sep_df)

## Identify columns with more than 75% of data missing

In [9]:
# Select the names of the columns whose missing fraction exceeds 75%
mostly_missing = missing['perc'] > .75
missing_cols = missing.loc[mostly_missing, 'column'].tolist()

# Print one column name per line
for col in missing_cols:
    print(col)

ammonia
amylase
b-type_natriuretic_peptide_(bnp)
crp_high_sens
d_dimer
dobutamine_dose_weight
dopamine_dose_weight
epinephrine_dose_weight
erythrocyte_sedimentation_rate_(esr)
fibrinogen
hemoglobin_a1c
inr
lactic_acid
lipase
norepinephrine_dose_weight
parathyroid_level
partial_prothrombin_time_(ptt)
pf_pa
phenylephrine_dose_weight
prealbumin
procalcitonin
prothrombin_time_(pt)
saturation_of_oxygen_(sao2)
thrombin_time
thyroid_stimulating_hormone_(tsh)
transferrin
troponin
vasopressin_dose_weight
sbp_line
dbp_line
map_line


## Load Reference Data

In [21]:
# Names of the columns that will be converted to datetime
time_cols = config['meta']['time_cols']

# Column -> data type mapping, stored as JSON
dtypes_path = config['data']['sep3_dtypes']
with open(dtypes_path, "r") as file:
    data_types = json.load(file)

# Lab and treatment labels, stored as JSON
lab_trt_path = config['data']['lab_treatment_dict']
with open(lab_trt_path, "r") as file:
    lab_trt_dict = json.load(file)

# Medical facility whose dataset is loaded ('grady' or 'emory')
med_fac = 'grady'

# Load the reference table again for that facility; verbose=True prints a cohort summary
X = cohort.load_reference_data(
    med_fac,
    config,
    verbose=True,
)

Number of encounters (csn): 119733
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 119733
Number of unique patients: 73484
Number of sepsis=0 patients: 101269 (84.58%)
Number of sepsis=1 patients: 18464 (15.42%)


## Clean Data

In [22]:
# Work on a copy of the static dataset and remember its original size
X_proc = X.copy()
n_orig = len(X_proc)

# Exclusion criteria, applied in this order. Each entry receives the current
# frame and returns a boolean mask of the rows to keep.
row_filters = [
    # remove non-ICU patients (no recorded ICU start)
    lambda frame: frame['first_icu_start'].notna(),
    # remove patients under the age of 18
    lambda frame: frame['age'] >= 18,
    # remove csns with less than 24 hours of data
    lambda frame: frame['hoursICU'] >= 24,
    # remove csns with unknown gender (codes below 2 are kept)
    lambda frame: frame['gender'] < 2,
]

for keep_rows in row_filters:
    X_proc = X_proc[keep_rows(X_proc)]
    # print csn report after each exclusion step
    cohort.print_cohort_report(X_proc,'csn','pat_id','sepsis')

# Summarize how many encounters the exclusions removed
n_reduced = len(X_proc)
print(
    f"Original Size: {n_orig}",
    f"Reduced Size: {n_reduced}",
    f"Data Reduction: {n_orig - n_reduced}",
    sep="\n",
)

# Downstream cells read the cleaned cohort from X
X = X_proc

Number of encounters (csn): 17798
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 17798
Number of unique patients: 16178
Number of sepsis=0 patients: 10538 (59.21%)
Number of sepsis=1 patients: 7260 (40.79%)
Number of encounters (csn): 17798
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 17798
Number of unique patients: 16178
Number of sepsis=0 patients: 10538 (59.21%)
Number of sepsis=1 patients: 7260 (40.79%)
Number of encounters (csn): 15182
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15182
Number of unique patients: 13917
Number of sepsis=1 patients: 6662 (43.88%)
Number of sepsis=0 patients: 8520 (56.12%)
Number of encounters (csn): 15179
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15179
Number of unique patients: 13914
Number of sepsis=1 patients: 6662 (43.89%)
Number of

## Calculate feature statistics

In [7]:
run_cell = False
debug = False

if run_cell:
    # Purpose: for every encounter in X, load the patient's time-series table,
    # reduce each retained feature to min/max/mean over the ICU stay, and save
    # one row per patient to a CSV of static features.

    # get pickle dtype map
    pkl_map = config['pkl_dtypes']

    # collect important column names: every label flagged importance '1',
    # plus the cuff blood-pressure columns
    imp1_cols = [name for name, meta in lab_trt_dict.items() if meta['importance'] == '1'] + ['sbp_cuff','dbp_cuff','map_cuff']

    # columns to drop (configured list; absent columns are ignored when dropping)
    drop_missing = config['preprocess']['drop_missing']

    # list of feature statistics to collect for each patient
    functions = ['min','max','mean']

    # get list of years to increment for loop
    years = np.sort(X.year.unique()).tolist()

    # bookkeeping
    n = 0                  # patients processed successfully
    failed = []            # (csn, error) for every patient that was skipped
    debug_stop = 5         # number of patients to process when debug is True
    stop_early = False     # set once the debug limit has been reached
    patient_features = []  # one single-row frame per processed patient

    # loop through each year
    for year in tqdm(years):

        # filter data by year
        data = X[X.year==year]

        # loop through each patient
        for ind, row in data.iterrows():

            # patient csn
            csn = row.csn

            # first hour of icu status
            icu_start_time = row.first_icu_start

            try:
                # create path to patient ehr data
                path = config['data']['pat_pkl'].format(year=year,csn=csn)

                # ingest patient data file
                # NOTE(review): unpickling can execute arbitrary code; only
                # point this at trusted, project-generated files.
                p_pkl = pd.read_pickle(path)

                # get super_table (time-series) EMR data
                hosp_data = p_pkl["super_table"].copy()

                # assign data types
                hosp_data = hosp_data.astype(pkl_map,errors='ignore')

                # fill appropriate na values
                hosp_data = hosp_data.fillna(value = config['fill_na'])

                # keep columns of importance 1
                hosp_data = hosp_data[imp1_cols]

                # drop missing columns
                hosp_data = hosp_data.drop(columns=drop_missing, errors='ignore')

                # shift df to icu start time to remove information collected outside of the icu
                # (assumes the table is indexed by time - TODO confirm)
                hosp_data = hosp_data.loc[icu_start_time:]

                # apply the list of functions to each column -> rows: features, columns: statistics
                stats_df = hosp_data.agg(functions).T

                # stack features to create a single-row frame with (feature, statistic) columns
                pat_features = stats_df.stack().to_frame().T

                # combine both levels to create flat '<feature>_<statistic>' column names
                pat_features.columns = ['{}_{}'.format(*col) for col in pat_features.columns]

                # force the statistics to numeric (unparseable values become NaN) and round
                pat_features = pat_features.apply(pd.to_numeric, errors='coerce').round(4)

                # assign patient csn as reference; done after the numeric
                # coercion so the identifier itself can never be turned into NaN
                pat_features['csn'] = csn

                # append data to patient list
                patient_features.append(pat_features)

                # increment patient index counter
                n += 1

            except Exception as err:
                # best effort: skip this patient, but remember why
                failed.append((csn, repr(err)))
                continue

            if debug and n >= debug_stop:
                stop_early = True
                break

        # leave the year loop as well once the debug limit is reached
        if stop_early:
            break

    # report what happened so skipped patients do not go unnoticed
    print(f"Patients processed: {n}")
    if failed:
        print(f"Patients skipped because of errors: {len(failed)}")
        for failed_csn, reason in failed[:10]:
            print(f"  csn {failed_csn}: {reason}")

    # save patient features
    TYPE = 'static'
    static_dir = config['DIR']['data'].format(RUN=RUN,TYPE=TYPE)
    filename = "processed_data_patient_features.csv"
    pat_feat_outpath = os.path.join(static_dir,filename)

    if patient_features:
        # combine patient feature data into dataframe
        final = pd.concat(patient_features,axis=0)
        if not debug:
            # make sure the destination directory exists, then save
            if static_dir:
                os.makedirs(static_dir, exist_ok=True)
            final.to_csv(pat_feat_outpath)
    else:
        # pd.concat raises on an empty list; leave an empty frame and skip saving
        final = pd.DataFrame()
        print("No patient features were computed; nothing was saved.")
    

100%|██████████| 5/5 [34:19<00:00, 411.81s/it]
