In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
import json

from tableone import TableOne

# Operating System
import os

# Convenience
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

from medicalbiasdetection import utils

# --- Run-level constants -------------------------------------------------
RUN, RANDOM_STATE = 3, 0

# --- pandas display limits: show up to 250 rows / columns ----------------
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 250)

# --- Project configuration and per-run log locations ---------------------
config = utils.read_yaml()
LOG_PATH = config['LOG']['path'].format(RUN=RUN)
LOG_DIR = config['LOG']['dir'].format(RUN=RUN)

# Publish the run id and log path through the process environment.
os.environ.update({'LOG_PATH': LOG_PATH, 'RUN': str(RUN)})

# --- Project modules ------------------------------------------------------
# Imported only after the environment variables above have been set.
# NOTE(review): presumably these modules read LOG_PATH / RUN at import time —
# confirm before moving these imports up into the main import block.
from medicalbiasdetection import model as md
from medicalbiasdetection import cohort
from medicalbiasdetection import process



# Load Reference Data

In [2]:
# Identify the medical facility for the dataset, then load its reference
# (encounter-level) data; verbose=True prints the cohort summary shown below.
med_fac = 'grady'  # alternatives listed by the author: 'grady', 'emory'
X = cohort.load_reference_data(med_fac,config, verbose=True)


Number of encounters (csn): 119733
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 119733
Number of unique patients: 73484
Number of sepsis=0 patients: 101269 (84.58%)
Number of sepsis=1 patients: 18464 (15.42%)


## Update Reference Data

In [3]:
# Apply the cohort update to the reference data and print the resulting
# cohort report using the 'csn', 'pat_id' and 'sepsis' columns.
# NOTE(review): X is overwritten with its own updated version, so re-running
# this cell feeds the already-updated frame back into update_cohort —
# re-run the load cell first if a clean result is needed.
X = cohort.update_cohort(X, verbose=True)
cohort.print_cohort_report(X,'csn','pat_id','sepsis')

Total CSNs included: 15163
Removed CSNs:
step           reason                          
preprocessing  non-ICU patient                     101935
               less than 24 hours of icu data        2616
               corrupted file - did not process        16
               gender unknown                           3
dtype: int64
Number of encounters (csn): 15163
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 15163
Number of unique patients: 13899
Number of sepsis=1 patients: 6658 (43.91%)
Number of sepsis=0 patients: 8505 (56.09%)


## Load Hourly Data

In [12]:
# Load the hourly data frame through the project loader, driven by `config`
# (the call emits a progress bar while it runs).
df = cohort.load_hourly_data(config)


100%|██████████| 6/6 [00:40<00:00,  6.68s/it]


## Feature Analysis

In [19]:
 # Summarise the feature groups declared in the config and those derived in `df`.

# Feature-informative-missingness (FIM) source columns listed in the config
fim_cols = config['preprocess']['fim_cols']
print(f"Feature Informative Missingness: {len(fim_cols)}")

# Vital-sign columns listed in the config
vital_cols = config['preprocess']['vital_cols']
print(f"Vital Columns: {len(vital_cols)}")

# All time-series feature columns listed in the config
time_series_cols = config['preprocess']['time_series_cols']
print(f"All Features: {len(time_series_cols)}")

# Columns of the hourly frame whose name contains "_f1" or "_f2"
all_fim_cols = [x for x in df.columns if ("_f1" in x) or ("_f2" in x)]
print(f"FIM Columns: {len(all_fim_cols)}")

# Columns of the hourly frame whose name contains "_diff"
diff_cols = [x for x in df.columns if "_diff" in x]
print(f"Time series differenced: {len(diff_cols)}")

# Name prefixes of the aggregate columns (e.g. "mean_pulse", "dstd_spo2")
agg = ["mean_", "median_", "min_", "max_", "std_", "dstd_"]

# Preview the hourly frame. This must be the cell's last expression:
# Jupyter only renders the value of the final expression, so a bare
# `df.head()` followed by another statement is silently discarded.
df.head()

Feature Informative Missingness: 36
Vital Columns: 8
All Features: 41
FIM Columns: 72
Time series differenced: 41


Unnamed: 0,alanine_aminotransferase_(alt),albumin,alkaline_phosphatase,anion_gap,aspartate_aminotransferase_(ast),base_excess,best_map,bicarb_(hco3),bilirubin_total,blood_urea_nitrogen_(bun),calcium,chloride,creatinine,daily_weight_kg,fio2,gcs_total_score,glucose,hematocrit,hemoglobin,magnesium,partial_pressure_of_carbon_dioxide_(paco2),partial_pressure_of_oxygen_(pao2),pf_sp,ph,phosphorus,platelets,potassium,procedure,protein,pulse,sodium,spo2,unassisted_resp_rate,vent_status,white_blood_cell_count,temperature,sbp_cuff,dbp_cuff,map_cuff,csn,sepsis,sepsis_lag_6,sofa_score_total,sirs_score_total,alanine_aminotransferase_(alt)_interval_f1,alanine_aminotransferase_(alt)_interval_f2,albumin_interval_f1,albumin_interval_f2,alkaline_phosphatase_interval_f1,alkaline_phosphatase_interval_f2,anion_gap_interval_f1,anion_gap_interval_f2,aspartate_aminotransferase_(ast)_interval_f1,aspartate_aminotransferase_(ast)_interval_f2,base_excess_interval_f1,base_excess_interval_f2,bicarb_(hco3)_interval_f1,bicarb_(hco3)_interval_f2,bilirubin_total_interval_f1,bilirubin_total_interval_f2,blood_urea_nitrogen_(bun)_interval_f1,blood_urea_nitrogen_(bun)_interval_f2,calcium_interval_f1,calcium_interval_f2,chloride_interval_f1,chloride_interval_f2,creatinine_interval_f1,creatinine_interval_f2,daily_weight_kg_interval_f1,daily_weight_kg_interval_f2,fio2_interval_f1,fio2_interval_f2,gcs_total_score_interval_f1,gcs_total_score_interval_f2,glucose_interval_f1,glucose_interval_f2,hematocrit_interval_f1,hematocrit_interval_f2,hemoglobin_interval_f1,hemoglobin_interval_f2,magnesium_interval_f1,magnesium_interval_f2,partial_pressure_of_carbon_dioxide_(paco2)_interval_f1,partial_pressure_of_carbon_dioxide_(paco2)_interval_f2,partial_pressure_of_oxygen_(pao2)_interval_f1,partial_pressure_of_oxygen_(pao2)_interval_f2,pf_sp_interval_f1,pf_sp_interval_f2,ph_interval_f1,ph_interval_f2,phosphorus_interval_f1,phosphorus_interval_f2,platelets_interval_f1,platelets_interval_f2,potassium_interval_f1,potassium_
interval_f2,protein_interval_f1,protein_interval_f2,sodium_interval_f1,sodium_interval_f2,vent_status_interval_f1,vent_status_interval_f2,white_blood_cell_count_interval_f1,white_blood_cell_count_interval_f2,sofa_score_total_interval_f1,sofa_score_total_interval_f2,sirs_score_total_interval_f1,sirs_score_total_interval_f2,sbp_cuff_interval_f1,sbp_cuff_interval_f2,dbp_cuff_interval_f1,dbp_cuff_interval_f2,map_cuff_interval_f1,map_cuff_interval_f2,best_map_interval_f1,best_map_interval_f2,alanine_aminotransferase_(alt)_diff,albumin_diff,alkaline_phosphatase_diff,anion_gap_diff,aspartate_aminotransferase_(ast)_diff,base_excess_diff,best_map_diff,bicarb_(hco3)_diff,bilirubin_total_diff,blood_urea_nitrogen_(bun)_diff,calcium_diff,chloride_diff,creatinine_diff,daily_weight_kg_diff,fio2_diff,gcs_total_score_diff,glucose_diff,hematocrit_diff,hemoglobin_diff,magnesium_diff,partial_pressure_of_carbon_dioxide_(paco2)_diff,partial_pressure_of_oxygen_(pao2)_diff,pf_sp_diff,ph_diff,phosphorus_diff,platelets_diff,potassium_diff,procedure_diff,protein_diff,pulse_diff,sodium_diff,spo2_diff,unassisted_resp_rate_diff,vent_status_diff,white_blood_cell_count_diff,temperature_diff,sbp_cuff_diff,dbp_cuff_diff,map_cuff_diff,sofa_score_total_diff,sirs_score_total_diff,mean_pulse,mean_spo2,mean_unassisted_resp_rate,mean_temperature,mean_sbp_cuff,mean_dbp_cuff,mean_map_cuff,mean_best_map,median_pulse,median_spo2,median_unassisted_resp_rate,median_temperature,median_sbp_cuff,median_dbp_cuff,median_map_cuff,median_best_map,min_pulse,min_spo2,min_unassisted_resp_rate,min_temperature,min_sbp_cuff,min_dbp_cuff,min_map_cuff,min_best_map,max_pulse,max_spo2,max_unassisted_resp_rate,max_temperature,max_sbp_cuff,max_dbp_cuff,max_map_cuff,max_best_map,std_pulse,std_spo2,std_unassisted_resp_rate,std_temperature,std_sbp_cuff,std_dbp_cuff,std_map_cuff,std_best_map,dstd_pulse,dstd_spo2,dstd_unassisted_resp_rate,dstd_temperature,dstd_sbp_cuff,dstd_dbp_cuff,dstd_map_cuff,dstd_best_map,pulse_score,temperature_
score,unassisted_resp_rate_score,map_cuff_score,creatinine_score,qsofa,platelets_score,bilirubin_score
0_0,9.0,3.3,44.0,3.0,17.0,11.9,95.0,43.0,0.3,22.0,7.7,98.0,0.7,61.0,0.4,11.0,103.0,31.4,11.4,1.6,89.0,75.0,249.166667,7.29,2.9,95.0,4.7,0,5.7,87.0,144.0,99.666667,13.5,1.0,5.3,99.5,138.5,78.0,95.0,1017284206,1,1.0,5.0,0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1,0.0,1.0,0.0,1,0.0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87.0,99.666667,13.5,99.5,138.5,78.0,95.0,95.0,87.0,99.666667,13.5,99.5,138.5,78.0,95.0,95.0,87.0,99.666667,13.5,99.5,138.5,78.0,95.0,95.0,87.0,99.666667,13.5,99.5,138.5,78.0,95.0,95.0,,,,,,,,,,,,,,,,,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0
0_1,9.0,3.3,44.0,3.0,17.0,11.9,95.0,43.0,0.3,22.0,7.7,98.0,0.7,61.0,0.4,11.0,103.0,31.4,11.4,1.6,89.0,75.0,245.0,7.29,2.9,95.0,4.7,0,5.7,87.0,144.0,98.0,22.0,1.0,5.3,99.5,131.0,79.0,95.0,1017284206,1,1.0,5.0,1,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2,0.0,2.0,0.0,2,0.0,2,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.666667,8.5,0.0,0.0,0.0,-7.5,1.0,0.0,0.0,1.0,87.0,98.833333,17.75,99.5,134.75,78.5,95.0,95.0,87.0,98.833333,17.75,99.5,134.75,78.5,95.0,95.0,87.0,98.0,13.5,99.5,131.0,78.0,95.0,95.0,87.0,99.666667,22.0,99.5,138.5,79.0,95.0,95.0,0.0,1.178511,6.010408,0.0,5.303301,0.707107,0.0,0.0,0.0,1.178511,11.667262,0.0,6.717514,1.414214,2.828427,2.828427,0.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0
0_2,9.0,3.3,44.0,3.0,17.0,14.3,91.0,43.0,0.3,22.0,7.7,98.0,0.7,61.0,0.4,11.0,103.0,31.4,11.5,1.6,119.0,69.0,245.0,7.21,2.9,95.0,4.7,0,5.7,87.0,144.0,98.0,14.0,1.0,5.3,99.5,133.0,78.0,91.0,1017284206,1,1.0,5.0,0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3,0.0,3.0,0.0,3,0.0,3,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,30.0,-6.0,0.0,-0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.0,0.0,0.0,0.0,2.0,-1.0,-4.0,0.0,-1.0,87.0,98.555556,16.5,99.5,134.166667,78.333333,93.666667,93.666667,87.0,98.0,14.0,99.5,133.0,78.0,95.0,95.0,87.0,98.0,13.5,99.5,131.0,78.0,91.0,91.0,87.0,99.666667,22.0,99.5,138.5,79.0,95.0,95.0,0.0,0.96225,4.769696,0.0,3.883727,0.57735,2.309401,2.309401,0.0,3.097191,8.251263,0.0,9.251126,2.0,3.511885,3.511885,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0
0_3,9.0,3.3,44.0,3.0,17.0,14.3,94.0,43.0,0.3,22.0,7.7,98.0,0.7,61.0,0.4,11.0,103.0,31.4,11.5,1.6,119.0,69.0,230.0,7.21,2.9,95.0,4.7,0,5.7,87.0,144.0,92.0,14.0,1.0,5.3,99.5,144.0,75.0,94.0,1017284206,1,1.0,5.0,0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4,0.0,4.0,0.0,4,0.0,4,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.0,0.0,0.0,0.0,0.0,11.0,-3.0,3.0,0.0,0.0,87.0,96.916667,15.875,99.5,136.625,77.5,93.75,93.75,87.0,98.0,14.0,99.5,135.75,78.0,94.5,94.5,87.0,92.0,13.5,99.5,131.0,75.0,91.0,91.0,87.0,99.666667,22.0,99.5,144.0,79.0,95.0,95.0,0.0,3.370625,4.09013,0.0,5.85057,1.732051,1.892969,1.892969,0.0,4.139914,6.737643,0.0,10.585958,4.320494,2.872281,2.872281,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0
0_4,9.0,3.3,44.0,3.0,17.0,14.3,94.0,43.0,0.3,22.0,7.7,98.0,0.7,61.0,0.4,11.0,103.0,31.4,11.5,1.6,119.0,69.0,240.0,7.21,2.9,95.0,4.7,0,5.7,87.0,144.0,96.0,14.0,1.0,5.3,99.5,131.0,82.0,94.0,1017284206,1,1.0,5.0,0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5,0.0,5.0,0.0,5,0.0,5,0.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,-13.0,7.0,0.0,0.0,0.0,87.0,96.733333,15.5,99.5,135.5,78.4,93.8,93.8,87.0,98.0,14.0,99.5,133.0,78.0,94.0,94.0,87.0,92.0,13.5,99.5,131.0,75.0,91.0,91.0,87.0,99.666667,22.0,99.5,144.0,82.0,95.0,95.0,0.0,2.947692,3.640055,0.0,5.656854,2.50998,1.643168,1.643168,0.0,3.585464,5.911853,0.089443,9.705668,5.176872,3.577709,3.577709,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0


## Create TableOne

In [5]:
# Build a one-row-per-encounter frame, render Table One grouped by sepsis
# status, print it, and export it as LaTeX.

# Keep only the reference / hourly columns configured for Table One.
# NOTE(review): X and df are overwritten with their column subsets here, so
# any later cell that needs the full frames must reload them.
X = X[config['tableone']['keep_reference']]
df = df[config['tableone']['keep_hourly']]

# Collapse the hourly rows to a per-encounter mean, then attach the reference
# columns (merge on 'csn' with pandas' default inner join).
df_agg = df.groupby('csn').mean().reset_index()
df_t1 = df_agg.merge(X, on='csn', copy=False)

# clean column values
df_t1['total_vent_days'] = df_t1['total_vent_days'].fillna(0.0)
# NOTE(review): dividing by kilograms-per-pound converts the values to pounds
# while the column keeps its `_kg` name — confirm the configured Table One
# label states pounds.
KG_PER_LB = 0.453592
df_t1['daily_weight_kg'] = df_t1['daily_weight_kg'] / KG_PER_LB
df_t1['sepsis'] = df_t1['sepsis'].map({False:"Non-sepsis",True:"Sepsis"})
df_t1['gender'] = df_t1['gender'].map({0:'Male',1:'Female'})
df_t1 = df_t1.drop(columns=['csn'])

# define columns for table one
columns = config['tableone']['columns']
categorical = config['tableone']['categorical']
nonnormal = config['tableone']['nonnormal']
labels = config['tableone']['labels']
# set label to group columns in Table One
groupby = 'sepsis'

# create Table One
table_one = TableOne(
    df_t1,
    columns=columns,
    categorical=categorical,
    nonnormal=nonnormal,
    groupby=groupby,
    rename=labels,
    pval=True,
    missing=False,
)
print(table_one.tabulate(tablefmt='simple'))

# save TableOne
TYPE = 'tableOne'
path = config['DIR']['data'].format(RUN=RUN, TYPE=TYPE)
filename = 'tableOne.tex'
filepath = os.path.join(path,filename)

# Make sure the output directory exists so the export does not fail on a
# fresh run; exist_ok=True makes this a no-op when it is already there.
os.makedirs(path, exist_ok=True)
table_one.to_latex(filepath)

                                                     Overall           Non-sepsis        Sepsis            P-Value
---------------------------------------  ----------  ----------------  ----------------  ----------------  ---------
n                                                    15163             8505              6658
Age, median [Q1,Q3]                                  56.0 [40.0,66.0]  53.0 [36.0,64.0]  58.0 [46.0,68.0]  <0.001
Gender, n (%)                            Female      5382 (35.5)       2909 (34.2)       2473 (37.1)       <0.001
                                         Male        9781 (64.5)       5596 (65.8)       4185 (62.9)
Race, n (%)                              Asian       158 (1.0)         99 (1.2)          59 (0.9)          <0.001
                                         Black       10573 (69.7)      5631 (66.2)       4942 (74.2)
                                         Hispanic    659 (4.3)         387 (4.6)         272 (4.1)
                               