# Required Libraries

In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
import json

# Operating System
import os

# Convenience
from tqdm import tqdm
# import pprint
import warnings
warnings.filterwarnings('ignore')

# Configuration
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)

from medicalbiasdetection import utils

# --- Run-level constants ---
RUN, RANDOM_STATE = 1, 0

# --- Configuration files ---
# default project configuration
config = utils.read_yaml()
# run-log configuration
config_preprocessing = utils.read_yaml('conf/mbd_run_log.yaml')

# --- Run directory ---
utils.create_run_dir(str(RUN))

# --- Log locations for this run ---
# both templates come from the LOG section of the config and take the run id
LOG_DIR, LOG_PATH = (config['LOG'][key].format(RUN=RUN) for key in ('dir', 'path'))

# expose the log path and run id through the process environment
os.environ.update({'LOG_PATH': LOG_PATH, 'RUN': str(RUN)})

# NOTE(review): this import is deliberately kept after the environment variables
# are set; presumably process/cohort read LOG_PATH / RUN when imported - confirm
# before moving it into the import block at the top of the cell.
from medicalbiasdetection import process, cohort

MBD_Runs/1
Directory Already Exists


## Load Reference Data

In [5]:
# Identify the medical facility whose dataset is loaded.
# Values that appear in this notebook: 'grady' (active) and 'emory'.
med_fac = 'grady'
# Load the reference data for that facility; with verbose=True the helper prints
# the cohort summary shown in the cell output (encounters, years, sepsis counts).
X = cohort.load_reference_data(med_fac,config, verbose=True)

Number of encounters (csn): 119733
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 119733
Number of unique patients: 73484
Number of sepsis=0 patients: 101269 (84.58%)
Number of sepsis=1 patients: 18464 (15.42%)


## Reset CSN Log

In [6]:
# Set run_cell to False to skip this cell's work when re-running the notebook.
run_cell = True
if run_cell:
    # Create the CSN log for the loaded cohort under LOG_DIR
    # (the cell output reports the path of the created csv).
    cohort.create_csn_log(X, LOG_DIR)

Log file created at: 
 MBD_Runs/1/data/log/csn_log.csv


## Clean Data

In [7]:
# Work on a copy; X is only rebound to the reduced cohort at the end of the cell.
X_proc = X.copy()
n_orig = len(X_proc)

# label recorded with every removal made in this cell
step = 'preprocessing'

# Exclusion criteria, applied in this order. Each entry pairs a row selector with
# the reason passed to cohort.remove_csn. Selectors are evaluated against the
# cohort that remains after the previous criteria have been applied.
exclusion_criteria = [
    # non-ICU patients: no first ICU start recorded
    (lambda df: df['first_icu_start'].isna(), 'non-ICU patient'),
    # patients under the age of 18
    (lambda df: df['age']<18, 'under 18 years of age'),
    # encounters with less than 24 hours of ICU data
    (lambda df: df['hoursICU']<24, 'less than 24 hours of icu data'),
    # unknown gender: anything that is neither 0 nor 1 (this includes missing values)
    (lambda df: (df['gender']!=0)&(df['gender']!=1), 'gender unknown'),
]

for select_rows, reason in exclusion_criteria:
    # csns matching the current criterion
    drop_csns = X_proc.loc[select_rows(X_proc), 'csn'].unique().tolist()
    X_proc = cohort.remove_csn(X_proc, drop_csns, step, reason)
    # print csn report for the remaining cohort
    cohort.print_cohort_report(X_proc,'csn','pat_id','sepsis')

# summary of the overall reduction
n_reduced = len(X_proc)
print(f"Original Size: {n_orig}")
print(f"Reduced Size: {n_reduced}")
print(f"Data Reduction: {n_orig - n_reduced}")

X = X_proc

Log file updated.
Number of encounters (csn): 17798
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 17798
Number of unique patients: 16178
Number of sepsis=0 patients: 10538 (59.21%)
Number of sepsis=1 patients: 7260 (40.79%)
Number of encounters (csn): 17798
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 17798
Number of unique patients: 16178
Number of sepsis=0 patients: 10538 (59.21%)
Number of sepsis=1 patients: 7260 (40.79%)
Log file updated.
Number of encounters (csn): 15182
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15182
Number of unique patients: 13917
Number of sepsis=1 patients: 6662 (43.88%)
Number of sepsis=0 patients: 8520 (56.12%)
Log file updated.
Number of encounters (csn): 15179
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15179
Number of unique patients: 1391

## Check CSN logs

In [8]:
# Refresh X through cohort.update_cohort; with verbose=True it prints the number of
# included CSNs and the removal counts per step/reason (see the cell output).
# NOTE(review): presumably this filters X down to the CSNs the log still marks as
# included - confirm in cohort.update_cohort.
X = cohort.update_cohort(X,verbose=True)

Total CSNs included: 15179
Removed CSNs:
step           reason                        
preprocessing  non-ICU patient                   101935
               less than 24 hours of icu data      2616
               gender unknown                         3
dtype: int64


# Process Data

In [9]:
run_cell = True
debug = False

if run_cell:

    # Purpose: for every encounter in X, load its pickled EMR data, build the
    # sepsis / SOFA labels and engineered features, and append the result to chunked
    # CSV files. Encounters that raise during processing are recorded in a
    # "skipped" CSV and removed from the cohort through the csn log at the end.
    #
    # NOTE(review): `lab_trt_dict` (used below) is not defined in any cell shown
    # here, so it must already exist in the kernel. TODO: load it explicitly so
    # this cell survives Restart & Run All.
    # NOTE(review): output files are opened in append mode, so re-running this cell
    # without clearing the run directory appends to the files of the previous run.

    # establish parameters
    prediction_lags = [6]

    # set window size
    window = 6

    # upper limit on the number of rows kept per encounter (hour limit, 168 = 7 * 24)
    max_icu_hours = 168

    # number of processed encounters after which a debug run stops
    stop = 2

    # get pickle dtype map
    path = config['data']['pkl_dtypes']
    with open(path,"r") as file:
        pkl_map = json.load(file)

    # collect important column names (importance '1' entries plus the line pressures)
    imp1_cols = [name for name, info in lab_trt_dict.items() if info['importance']=='1'] + ['sbp_line','dbp_line','map_line']

    # get feature informative missing columns
    fim_cols = config['preprocess']['fim_cols']

    # get vital sign columns
    vital_cols = config['preprocess']['vital_cols']

    # get high missing variables to remove
    drop_missing = config['preprocess']['drop_missing']

    # get remaining variables to remove
    drop_cols = config['preprocess']['drop_cols']

    # get list of years to increment for loop
    years = np.sort(X.year.unique()).tolist()

    # output locations do not change inside the loops, so resolve them once
    hourly_dir = config['DIR']['data'].format(RUN=RUN, TYPE='hourly')
    skipped_dir = config['DIR']['data'].format(RUN=RUN, TYPE='skipped')
    skip_outpath = os.path.join(skipped_dir, "processed_data_skipped.csv")

    # make sure both output directories exist; otherwise a missing directory would
    # make every write fail and every encounter would be reported as skipped
    for out_dir in (hourly_dir, skipped_dir):
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # reason each encounter was skipped in this run (csn -> repr of the exception)
    skip_errors = {}

    # initialize counters
    n=0
    save_i = 0

    # loop through each year
    for year in (tqdm(years,leave=True)):

        # filter data by year
        data = X[X.year==year]

        # loop through each patient
        for ind,row in data.iterrows():

            # patient csn
            csn = row.csn

            # first hour of icu status
            icu_start_time = row.first_icu_start

            # first hour of hospital admission
            hosp_admit = row.start_index

            # sepsis (bool)
            sepsis = row.sepsis

            # sofa2 (bool)
            sofa_2 = row.sofa_2

            # set values for sofa and sepsis times
            sofa_2_time = None
            sep_time = None

            try:
                # create path to patient ehr data
                path = config['data']['pat_pkl'].format(year=year,csn=csn)

                # ingest patient data file
                # NOTE(review): read_pickle executes arbitrary code from the file;
                # only use it on trusted data.
                p_pkl = pd.read_pickle(path)

                # get super_table (time-series) EMR data
                hosp_data = p_pkl["super_table"].copy()

                # assign data types to each column
                hosp_data = hosp_data.astype(pkl_map,errors='ignore')

                # keep columns of importance 1
                hosp_data = hosp_data[imp1_cols]

                # assign csn
                hosp_data['csn'] = csn

                # create sepsis labels, assign all equal to 0
                hosp_data['sepsis'] = 0

                # set lagged sepsis values to 0
                for lag in prediction_lags:
                    hosp_data[f'sepsis_lag_{lag}'] = 0

                # create sofa score time, assign time labels to 0
                hosp_data['sofa_2_time'] = 0

                # get sofa and sirs score totals
                hosp_data['sofa_score_total']= p_pkl['sofa_scores']['hourly_total_mod'].fillna(0.0).tolist()
                hosp_data['sirs_score_total']= p_pkl['sirs_scores']['hourly_total'].fillna(0.0).tolist()

                # if the patient was identified as having sepsis during their ICU time
                if sepsis:
                    # set sepsis time
                    sep_time = row.first_sep3_time

                    # adjust the sepsis time if it is not a time in the index
                    if sep_time not in hosp_data.index:

                        # find the next closest time slot available if the sepsis time is not in the patient's indexed times
                        sep_time = process.nextClosestTime(hosp_data.index,sep_time)

                    # set sepsis time to 1 beginning from first sepsis hour to the end of patient's stay
                    hosp_data.loc[sep_time:,'sepsis'] = 1

                    # set sepsis lags respectively: the label moved `lag` rows earlier,
                    # with the trailing rows filled from the last known value
                    for lag in prediction_lags:
                        hosp_data[f'sepsis_lag_{lag}'] = hosp_data['sepsis'].shift(-lag).ffill()
                    if debug:
                        print("Lags updated")

                # if patient has a sofa score, create sofa_2 labels
                if sofa_2:
                    # set patient sofa 2 time
                    sofa_2_time = row.sofa_2_time

                    # adjust the sofa 2 time if it is not a time in the index
                    if sofa_2_time not in hosp_data.index:
                        # find the next closest time slot available if the sofa 2 time is not in the patient's indexed times
                        sofa_2_time = process.nextClosestTime(hosp_data.index,sofa_2_time)

                    # set sofa 2 time to 1 beginning from first sofa time to the end of patient's stay
                    hosp_data.loc[sofa_2_time:,'sofa_2_time'] = 1

                    if debug:
                        print("Sofa 2 Information updated")

                # shift df to icu start time to remove information collected outside of the icu
                hosp_data = hosp_data.loc[icu_start_time:]

                # copy timestamps from index to a column (the merge below does not keep the index)
                hosp_data['timestamp'] = hosp_data.index

                # forward fill the importance-1 columns
                # NOTE(review): this runs before the missingness features are computed,
                # so they are derived from already forward-filled values - confirm intended.
                hosp_data[imp1_cols] = hosp_data[imp1_cols].ffill()

                # add feature informative missingness columns
                feat_info_missing_df = process.feature_informative_missingness(hosp_data[fim_cols])

                # combine fim data and hospital data
                # NOTE(review): this joins on the feature values themselves, so rows that
                # share identical fim values can be matched to each other; the
                # drop_duplicates on timestamp below then keeps the first match.
                # Verify against process.feature_informative_missingness.
                hosp_data = hosp_data.merge(feat_info_missing_df, on=fim_cols)

                # remove heavily missing variates
                hosp_data = hosp_data.drop(columns=drop_missing, errors='ignore')

                # drop duplicates
                hosp_data = hosp_data.drop_duplicates(subset=['timestamp'])

                # sort data based on time
                hosp_data = hosp_data.sort_values(by=['timestamp'])

                # reset the index
                hosp_data = hosp_data.reset_index()

                # add feature sliding window data
                hosp_data = pd.concat([hosp_data, process.feature_slide_window(hosp_data[vital_cols],window)], axis=1)

                # add feature empiric scores
                hosp_data = process.feature_empiric_score(hosp_data)

                # replace current index with patient index id and icu hour "0_0" = patient index 0 at hour 0
                hosp_data.index = [f"{n}_{x}" for x in range(len(hosp_data))]

                # reduce dataset to specified hour upper limits
                hosp_data = hosp_data.iloc[:max_icu_hours]

                # drop remaining unnecessary variables
                hosp_data = hosp_data.drop(columns=drop_cols, errors='ignore')

                # start a new output file after each set of 3000 patients
                # (the check also fires at n == 0, so file numbering starts at 1)
                if n%3000 == 0:
                    print(f"Patient Data CSV {save_i} Complete")
                    save_i += 1
                    print(f"Starting Patient Data CSV {save_i}")

                # create save path for the current chunk
                outpath = os.path.join(hourly_dir, f"processed_data_{save_i}.csv")

                if not debug:
                    # append; write the header only when the file is first created
                    hosp_data.to_csv(outpath, mode='a', header=not os.path.exists(outpath))

                # increment patient index counter
                n+=1

                if debug and n >= stop:
                    break

            except Exception as err:
                # Only ordinary errors are treated as "skip this encounter";
                # KeyboardInterrupt / SystemExit now propagate and stop the run.
                # Keep the reason so skipped encounters can be audited afterwards.
                skip_errors[csn] = repr(err)
                if debug:
                    print(f"Skipped csn {csn} ({year}): {err!r}")

                # record the skipped encounter as one (csn, year) row
                skipped = pd.DataFrame([csn,year]).T

                skipped.to_csv(skip_outpath, mode='a', header=not os.path.exists(skip_outpath), index= False)

        # a debug run stops the year loop too, not only the patient loop
        if debug and n >= stop:
            break

    if skip_errors:
        print(f"Encounters skipped in this run: {len(skip_errors)} (reasons kept in `skip_errors`)")

    # Update the csn log
    TYPE='skipped'
    # set directory path
    outpath_dir = skipped_dir

    # set filename
    filename = "processed_data_skipped.csv"

    path = skip_outpath

    # the skipped file only exists if at least one encounter failed
    if os.path.exists(path):
        skipped_df = pd.read_csv(path)
        # first column holds the csn of each skipped encounter
        drop_csns = skipped_df.iloc[:,0].unique().tolist()
        _ = cohort.remove_csn(X, drop_csns, step, 'corrupted file - did not process')
    else:
        print("No skipped encounters recorded; csn log left unchanged.")

  0%|          | 0/5 [00:00<?, ?it/s]

Patient Data CSV 0 Complete
Starting Patient Data CSV 1


 20%|██        | 1/5 [24:43<1:38:52, 1483.11s/it]

Patient Data CSV 1 Complete
Starting Patient Data CSV 2


 40%|████      | 2/5 [50:15<1:15:37, 1512.37s/it]

Patient Data CSV 2 Complete
Starting Patient Data CSV 3


 60%|██████    | 3/5 [1:14:17<49:19, 1479.86s/it]

Patient Data CSV 3 Complete
Starting Patient Data CSV 4


 80%|████████  | 4/5 [1:44:46<26:57, 1617.94s/it]

Patient Data CSV 4 Complete
Starting Patient Data CSV 5


100%|██████████| 5/5 [1:58:54<00:00, 1426.98s/it]


AttributeError: module 'medicalbiasdetection.process' has no attribute 'remove_csn'

## Check CSN logs

In [12]:
# Print the CSN log summary again after processing (included count and removals
# per step/reason). The return value is discarded, so X is not rebound here.
_ = cohort.update_cohort(X,verbose=True)

Total CSNs included: 13289
Removed CSNs:
step           reason                          
preprocessing  non-ICU patient                     101935
               less than 24 hours of icu data        2616
               corrupted file - did not process      1890
               gender unknown                           3
dtype: int64
