In [2]:
import os 
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [4]:
## Preprocessing inputs --> demographics, comorbidities, ventilation, GCS
folder = '../sepsis3_queries/data'

sepsis3 = pd.read_csv(f'{folder}/sepsis-df-3.csv')
ventilation = pd.read_csv(f'{folder}/ventilation-df-3.csv')
gcs = pd.read_csv(f'{folder}/gcs-df-3.csv')
# The first CSV column is used as the row index of the comorbidity table.
comorbidities = pd.read_csv(f'{folder}/comorbidities-nodrg-df-3.csv', index_col=[0])

# elixhauser_quan table, restricted to admissions (hadm_id) present in sepsis3.
eq = (
    pd.read_csv(f'{folder}/elixhauser_quan.csv')
    .loc[lambda frame: frame['hadm_id'].isin(sepsis3['hadm_id'])]
)


In [5]:
# List the columns available in the sepsis3 frame.
sepsis3.columns

Index(['icustay_id', 'hadm_id', 'excluded', 'intime', 'outtime', 'dbsource',
       'suspected_infection_time_poe', 'suspected_infection_time_poe_days',
       'specimen_poe', 'positiveculture_poe', 'antibiotic_time_poe',
       'blood_culture_time', 'blood_culture_positive', 'age', 'gender',
       'is_male', 'ethnicity', 'race_white', 'race_black', 'race_hispanic',
       'race_other', 'metastatic_cancer', 'diabetes', 'height', 'weight',
       'bmi', 'first_service', 'hospital_expire_flag', 'thirtyday_expire_flag',
       'icu_los', 'hosp_los', 'sepsis_angus', 'sepsis_martin',
       'sepsis_explicit', 'septic_shock_explicit', 'severe_sepsis_explicit',
       'sepsis_nqf', 'sepsis_cdc', 'sepsis_cdc_simple', 'elixhauser_hospital',
       'vent', 'sofa', 'lods', 'sirs', 'qsofa', 'qsofa_sysbp_score',
       'qsofa_gcs_score', 'qsofa_resprate_score', 'composite_outcome',
       'blood culture', 'suspicion_poe', 'abx_poe', 'sepsis-3', 'sofa>=2'],
      dtype='object')

In [6]:
# Frequency of each value of the diabetes_uncomplicated comorbidity column.
comorbidities.diabetes_uncomplicated.value_counts()

0    5748
1      36
Name: diabetes_uncomplicated, dtype: int64

In [7]:
# Identifier, demographic and ventilation columns carried forward from sepsis3.
sepsis_to_keep = sepsis3[[
    'icustay_id', 'hadm_id',
    'age', 'gender', 'ethnicity',
    'vent',
]]
sepsis_to_keep

Unnamed: 0,icustay_id,hadm_id,age,gender,ethnicity,vent
0,200061,121149,45.7505,M,OTHER,0
1,200075,132255,83.6432,F,WHITE,1
2,200087,124231,64.3080,M,WHITE,1
3,200116,164386,56.5050,M,WHITE,0
4,200131,187834,69.3102,F,WHITE,1
...,...,...,...,...,...,...
5779,299930,175890,72.3904,F,UNABLE TO OBTAIN,0
5780,299948,183226,57.6494,F,WHITE,0
5781,299949,199962,22.9327,M,WHITE,1
5782,299950,110990,75.9332,F,ASIAN - ASIAN INDIAN,0


In [8]:
# List the columns available in the comorbidities frame.
comorbidities.columns

Index(['subject_id', 'hadm_id', 'congestive_heart_failure',
       'cardiac_arrhythmias', 'valvular_disease', 'pulmonary_circulation',
       'peripheral_vascular', 'hypertension', 'paralysis',
       'other_neurological', 'chronic_pulmonary', 'diabetes_uncomplicated',
       'diabetes_complicated', 'hypothyroidism', 'renal_failure',
       'liver_disease', 'peptic_ulcer', 'aids', 'lymphoma',
       'metastatic_cancer', 'solid_tumor', 'rheumatoid_arthritis',
       'coagulopathy', 'obesity', 'weight_loss', 'fluid_electrolyte',
       'blood_loss_anemia', 'deficiency_anemias', 'alcohol_abuse',
       'drug_abuse', 'psychoses', 'depression'],
      dtype='object')

In [9]:
# Display the gcs frame as loaded from gcs-df-3.csv.
gcs

Unnamed: 0,subject_id,hadm_id,icustay_id,mingcs,gcsmotor,gcsverbal,gcseyes,endotrachflag
0,74282,121149,200061,15.0,6.0,5.0,4.0,0.0
1,67800,132255,200075,14.0,6.0,4.0,4.0,0.0
2,61871,124231,200087,15.0,6.0,5.0,4.0,0.0
3,61654,164386,200116,15.0,6.0,5.0,4.0,0.0
4,47250,187834,200131,10.0,6.0,1.0,3.0,0.0
...,...,...,...,...,...,...,...,...
5779,47967,175890,299930,8.0,5.0,1.0,2.0,0.0
5780,66696,183226,299948,15.0,6.0,5.0,4.0,0.0
5781,53283,199962,299949,15.0,4.0,0.0,1.0,1.0
5782,98649,110990,299950,14.0,6.0,5.0,3.0,0.0


In [10]:
# Total GCS = motor + verbal + eye component scores.
# min_count=1 keeps the total missing (NaN) when all three components are
# missing; the default would sum such a row to 0 and band it as severe.
gcs['gcs_response'] = gcs[['gcsmotor', 'gcsverbal', 'gcseyes']].sum(axis=1, min_count=1)

# Severity bands on the total score:
#   Severe:   GCS 3-8   -> 'gcs_<8'   (label kept as-is; the band includes 8)
#   Moderate: GCS 9-12  -> 'gcs_9_12'
#   Mild:     GCS 13-15 -> 'gcs_13_15'
# pd.cut uses right-closed intervals: (-inf, 8], (8, 12], (12, 15].
# Totals above 15 and missing totals get NaN instead of being silently
# skipped, which previously made the label list shorter than the frame.
# Component scores are assumed to be whole numbers, so the open left edges
# do not change how valid totals are banded.
gcs['response_scale'] = pd.cut(
    gcs['gcs_response'],
    bins=[-np.inf, 8, 12, 15],
    labels=['gcs_<8', 'gcs_9_12', 'gcs_13_15'],
).astype(object)  # plain string labels, matching the original column dtype

gcs.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,mingcs,gcsmotor,gcsverbal,gcseyes,endotrachflag,gcs_response,response_scale
0,74282,121149,200061,15.0,6.0,5.0,4.0,0.0,15.0,gcs_13_15
1,67800,132255,200075,14.0,6.0,4.0,4.0,0.0,14.0,gcs_13_15
2,61871,124231,200087,15.0,6.0,5.0,4.0,0.0,15.0,gcs_13_15
3,61654,164386,200116,15.0,6.0,5.0,4.0,0.0,15.0,gcs_13_15
4,47250,187834,200131,10.0,6.0,1.0,3.0,0.0,10.0,gcs_9_12


In [11]:
# Attach the comorbidity columns by hadm_id; the left join keeps every row of sepsis_to_keep.
result = sepsis_to_keep.merge(comorbidities, how='left', on='hadm_id')
result

Unnamed: 0,icustay_id,hadm_id,age,gender,ethnicity,vent,subject_id,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,...,coagulopathy,obesity,weight_loss,fluid_electrolyte,blood_loss_anemia,deficiency_anemias,alcohol_abuse,drug_abuse,psychoses,depression
0,200061,121149,45.7505,M,OTHER,0,74282,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,200075,132255,83.6432,F,WHITE,1,67800,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,200087,124231,64.3080,M,WHITE,1,61871,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200116,164386,56.5050,M,WHITE,0,61654,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,200131,187834,69.3102,F,WHITE,1,47250,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5779,299930,175890,72.3904,F,UNABLE TO OBTAIN,0,47967,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5780,299948,183226,57.6494,F,WHITE,0,66696,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5781,299949,199962,22.9327,M,WHITE,1,53283,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5782,299950,110990,75.9332,F,ASIAN - ASIAN INDIAN,0,98649,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Add the GCS severity band per icustay_id; the left join keeps every row of result.
# The identifier columns stay as ordinary columns (no index is set here).
static = result.merge(
    gcs.loc[:, ['icustay_id', 'response_scale']],
    how='left',
    on='icustay_id',
)
static

Unnamed: 0,icustay_id,hadm_id,age,gender,ethnicity,vent,subject_id,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,...,obesity,weight_loss,fluid_electrolyte,blood_loss_anemia,deficiency_anemias,alcohol_abuse,drug_abuse,psychoses,depression,response_scale
0,200061,121149,45.7505,M,OTHER,0,74282,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_13_15
1,200075,132255,83.6432,F,WHITE,1,67800,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_13_15
2,200087,124231,64.3080,M,WHITE,1,61871,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_13_15
3,200116,164386,56.5050,M,WHITE,0,61654,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_13_15
4,200131,187834,69.3102,F,WHITE,1,47250,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_9_12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5779,299930,175890,72.3904,F,UNABLE TO OBTAIN,0,47967,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_<8
5780,299948,183226,57.6494,F,WHITE,0,66696,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_13_15
5781,299949,199962,22.9327,M,WHITE,1,53283,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_<8
5782,299950,110990,75.9332,F,ASIAN - ASIAN INDIAN,0,98649,0,0,0,...,0,0,0,0,0,0,0,0,0,gcs_13_15


In [13]:
# Frequency of each value of congestive_heart_failure in the merged static table.
static.congestive_heart_failure.value_counts()

0    5645
1     139
Name: congestive_heart_failure, dtype: int64

In [14]:
# Output directory for the static table.
# NOTE: this rebinds `folder`, which the load cell used for the input data path.
folder = 'data_stage_1'
# Create the directory if needed so the write does not fail on a fresh checkout.
os.makedirs(folder, exist_ok=True)
static.to_csv(os.path.join(folder, 'static.csv'), sep=',', index=False)
