In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
import json

from tableone import TableOne

# Operating System
import os

# Convenience
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

from medicalbiasdetection import utils

# Configuration
# Show up to 250 rows and 250 columns when pandas displays a DataFrame.
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)

# Global Variables
RUN = 1  # run identifier; fills the {RUN} placeholder of the configured paths below
RANDOM_STATE = 0  # not referenced in the cells visible here; presumably a seed for later steps

# setup configuration files
config = utils.read_yaml()
LOG_DIR = config['LOG']['dir'].format(RUN=RUN)
LOG_PATH = config['LOG']['path'].format(RUN=RUN)
# Publish the log path and run id through the process environment.
os.environ['LOG_PATH'] = LOG_PATH
os.environ['RUN'] = str(RUN)


# NOTE(review): this import deliberately sits after the environment variables are set;
# presumably the package reads LOG_PATH / RUN at import time -- confirm before moving it
# into the import block at the top. No run directory is created here (LOG_DIR is only computed).
from medicalbiasdetection import (model as md, cohort, process)



## Load Reference Data

In [2]:
# Choose the medical facility whose dataset is analysed ('grady' or 'emory').
med_fac = 'grady'
# Load that facility's reference table; verbose=True requests the printed cohort summary.
X = cohort.load_reference_data(med_fac, config, verbose=True)


Number of encounters (csn): 119733
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 119733
Number of unique patients: 73484
Number of sepsis=0 patients: 101269 (84.58%)
Number of sepsis=1 patients: 18464 (15.42%)


## Update Reference Data

In [3]:
# Apply the cohort update to the reference table. X is rebound to the result, so
# re-running this cell feeds the already-updated table back into update_cohort.
X = cohort.update_cohort(X, verbose=True)

# Columns handed to the report, in the order the call expects them.
report_columns = ('csn', 'pat_id', 'sepsis')
cohort.print_cohort_report(X, *report_columns)

Total CSNs included: 13289
Removed CSNs:
step           reason                          
preprocessing  non-ICU patient                     101935
               less than 24 hours of icu data        2616
               corrupted file - did not process      1890
               gender unknown                           3
dtype: int64
Number of encounters (csn): 13289
Years: [2016 2017 2018 2019 2020]
Start year: 2016
End year: 2020
Number of unique patient visits: 13289
Number of unique patients: 12387
Number of sepsis=1 patients: 4772 (35.91%)
Number of sepsis=0 patients: 8517 (64.09%)


## Load Hourly Data

In [4]:
# NOTE(review): the next cell assigns `df` again via cohort.load_hourly_data(config);
# only one of the two loaders is needed -- confirm which one should stay.

# get directory
data_dir = config['DIR']['data'].format(RUN=RUN,TYPE='hourly')

# list of patient data files
# os.listdir() returns entries in arbitrary order, so sort them to make the row
# order of the concatenated frame reproducible between runs and machines.
files = sorted(f for f in os.listdir(data_dir) if "processed_data_" in f)
if not files:
    # Fail with a message that names the directory instead of pandas'
    # generic "No objects to concatenate".
    raise FileNotFoundError(
        "No 'processed_data_' files found in {!r}".format(data_dir)
    )

# read every file, using the first CSV column as the index
data_arr = [
    pd.read_csv(os.path.join(data_dir, file), index_col=[0])
    for file in tqdm(files, leave=True)
]

# create dataframe
df = pd.concat(data_arr,axis=0)


100%|██████████| 5/5 [00:34<00:00,  6.82s/it]


## Load Hourly Data (via `cohort.load_hourly_data`)

In [4]:
# Load the hourly data through the package helper and bind it to `df`.
# NOTE(review): the preceding cell also builds `df` by reading the hourly
# "processed_data_" files by hand; this call appears to supersede it -- keep only one
# so the load is not paid for twice.
df = cohort.load_hourly_data(config)

100%|██████████| 5/5 [00:34<00:00,  6.84s/it]


## Create TableOne

In [5]:
# Build one row per encounter (csn) from the hourly and reference data, summarise it
# with TableOne grouped by sepsis status, print the table and save it as LaTeX.

# Conversion factor used for the weight column (1 lb = 0.453592 kg).
KG_PER_LB = 0.453592

# filter reference data
X = X[config['tableone']['keep_reference']]
# filter hourly data
df = df[config['tableone']['keep_hourly']]
# average every kept hourly column within each encounter
df_agg = df.groupby('csn').mean().reset_index()
# attach the reference columns; the default inner join keeps only encounters present in both
# frames. The former `copy=False` argument was dropped: it does not affect the result and
# the keyword is deprecated in recent pandas releases.
df_t1 = df_agg.merge(X, on='csn')

# clean column values
# a missing ventilation-days value is treated as zero days
df_t1['total_vent_days'] = df_t1['total_vent_days'].fillna(0.0)
# NOTE(review): dividing kilograms by KG_PER_LB yields pounds, yet the column keeps its
# `_kg` name -- confirm the configured Table One label states the intended unit.
df_t1['daily_weight_kg'] = df_t1['daily_weight_kg']/KG_PER_LB
# replace coded values with the display labels shown in the table
df_t1['sepsis'] = df_t1['sepsis'].map({False:"Non-sepsis",True:"Sepsis"})
df_t1['gender'] = df_t1['gender'].map({0:'Male',1:'Female'})
# the encounter id is only the join key, not a table variable
df_t1 = df_t1.drop(columns=['csn'])

# define columns for table one
columns = config['tableone']['columns']
categorical = config['tableone']['categorical']
nonnormal = config['tableone']['nonnormal']
labels = config['tableone']['labels']
# set label to group columns in Table One
groupby = 'sepsis'

# create Table One
table_one = TableOne(
    df_t1,
    columns=columns,
    categorical=categorical,
    nonnormal=nonnormal,
    groupby=groupby,
    rename=labels,
    pval=True,
    missing=False,
)
print(table_one.tabulate(tablefmt='simple'))

# save TableOne
TYPE = 'tableOne'
path = config['DIR']['data'].format(RUN=RUN, TYPE=TYPE)
# make sure the output directory exists; writing the file fails if it is missing
os.makedirs(path, exist_ok=True)
filename = 'tableOne.tex'
filepath = os.path.join(path,filename)

table_one.to_latex(filepath)

                                                     Overall           Non-sepsis        Sepsis            P-Value
---------------------------------------  ----------  ----------------  ----------------  ----------------  ---------
n                                                    13289             8517              4772
Age, median [Q1,Q3]                                  55.0 [39.0,66.0]  53.0 [36.0,64.0]  58.0 [45.0,68.0]  <0.001
Gender, n (%)                            Female      4604 (34.6)       2912 (34.2)       1692 (35.5)       0.146
                                         Male        8685 (65.4)       5605 (65.8)       3080 (64.5)
Race, n (%)                              Asian       143 (1.1)         99 (1.2)          44 (0.9)          <0.001
                                         Black       9114 (68.6)       5640 (66.2)       3474 (72.8)
                                         Hispanic    588 (4.4)         388 (4.6)         200 (4.2)
                                