In [None]:
#%% Import Packages & paths
import numpy as np
import pandas as pd
from time import time
from pathlib import Path
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Shared Google Drive folder that holds every input of this notebook.
_PREP_ROOT = "/content/drive/MyDrive/Del_Pred_eICU/24h model/dataset preparation/"

# raw eICU tables
eicu_path = _PREP_ROOT + "eicu/"
# helper files that list feature names / groupings
feature_path = _PREP_ROOT + "Features/"
# the (complete) patientunitstayid + label list
comp_path = _PREP_ROOT + "Dataset/"

In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/... paths used
# by this notebook can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#%% Load the cohort list (patientunitstayid, Class, final_offset).
comp = pd.read_csv(f"{comp_path}complete_patientstayid_label_list.csv")
comp

Unnamed: 0,patientunitstayid,Class,final_offset
0,242154,1,5640.0
1,242290,0,1362.0
2,242474,0,807.0
3,242505,1,5900.0
4,242714,0,2712.0
...,...,...,...
27058,3347353,1,4.0
27059,3347960,1,651.0
27060,3349086,1,574.0
27061,3349342,1,10390.0


In [None]:
# Keep an independent copy of the cohort list as loaded.
comp_copy = comp.copy()

## 1. Static Model

### StaticFeatures

- features that were static, resulting in just one value per ICU stay without further processing. These include demographic information like age or ethnicity, characteristics about each ICU or hospital stay, etc.
-  An age of >89 was converted to 90. Hospital traits were converted from categorical strings to categorical numbers

- at the timepoint of the admission
- Pull the static features from eICU that are used in the first-24-hour models.
They are pulled from the Patient, Hospital, and ApachePatientResult tables.

In [None]:
#%% Load in needed raw data.
# Columns read from patient.csv. 'hospitaladmittime24', 'hospitaladmitoffset'
# and 'unitadmittime24' are deliberately not loaded.
patient_columns = ['patientunitstayid','age','gender','ethnicity', 'apacheadmissiondx',
                   'admissionheight','admissionweight','hospitalid',
                   'hospitaladmitsource','unittype',
                   'unitadmitsource','unitvisitnumber','unitstaytype']
# Columns read from apachePatientResult.csv.
apache_columns = ['patientunitstayid','apachescore','apacheversion']

pat = pd.read_csv(f"{eicu_path}patient.csv", usecols=patient_columns)
hosp = pd.read_csv(f"{eicu_path}hospital.csv")
apache = pd.read_csv(f"{eicu_path}apachePatientResult.csv", usecols=apache_columns)

- dropped features:

1.   The 'hospitaladmittime24', 'hospitaladmitoffset', and 'unitadmittime24' columns are dropped before classification in the reference code; in addition, they describe the admission to the hospital rather than the admission to the ICU.
2.   Hospital IDs have no interpretable meaning, and the related features (e.g. teachingstatus, region, numbedscategory) already carry information about the institution, so `hospitalid` is dropped after it has been used as the join key (PK/FK).


- All the features below are static for the 1st model, since the 1st model aims to predict delirium at the point of the patient's admission to the ICU.
- It can therefore include information collected after the admission to the hospital (a very-early-onset model).

In [None]:
# exploration of some features (each of them is treated as static)
explored_columns = ['unitadmitsource', 'unittype', 'unitvisitnumber', 'unitstaytype']
for position, column in enumerate(explored_columns):
    if position > 0:
        print("\n")  # separator between two consecutive tables
    print(pat[column].value_counts())

unitadmitsource
Emergency Department    89594
Floor                   24368
Operating Room          24305
ICU to SDU              13827
Direct Admit            12672
Recovery Room            7844
Acute Care/Floor         5604
Step-Down Unit (SDU)     5450
ICU                      5439
Other Hospital           4323
Other ICU                4264
PACU                     1714
Chest Pain Center         336
Observation                19
Other                      10
Name: count, dtype: int64


unittype
Med-Surg ICU    113222
MICU             17465
CCU-CTICU        15290
Neuro ICU        14451
Cardiac ICU      12467
SICU             12181
CSICU             9625
CTICU             6158
Name: count, dtype: int64


unitvisitnumber
1     158442
2      33734
3       5598
4       2050
5        566
6        265
7         97
8         50
9         20
10        13
11         9
12         5
13         3
15         2
16         2
14         1
17         1
18         1
Name: count, dtype: int64


unitsta

In [None]:
#%% Get apache scores.
# Only the rows produced with APACHE version 'IVa' are used.
apache = apache.loc[apache['apacheversion'] == 'IVa']

#%% Get patient info.
# Start from the cohort ids, attach the score (the version column is no
# longer needed afterwards), then attach the patient-table columns.
# NOTE(review): the preview shows apachescore == -1 for some stays; this looks
# like a "not available" sentinel -- confirm and handle it in preprocessing.
static = (
    comp[['patientunitstayid']]
    .merge(apache, how='left', on='patientunitstayid')
    .drop(columns='apacheversion')
    .merge(pat, how='left', on='patientunitstayid')
)

#Convert age to numeric.
def age_to_nums(age):
    """Replace the age label '> 89' by 90 and pass every other value through.

    Parameters
    ----------
    age : object
        One raw entry of the ``age`` column.

    Returns
    -------
    object
        ``90`` when ``age`` equals the string ``'> 89'``; otherwise ``age``
        itself, unchanged (no type conversion is performed here).
    """
    return 90 if age == '> 89' else age

# Replace the '> 89' label element by element.
static['age'] = static['age'].map(age_to_nums)

#%% Get hospital traits.
# Left join on hospitalid, so every stay is kept.
static = pd.merge(static, hosp, how='left', on='hospitalid')

#Convert hospital bed size to numerical categories.
#Since it has some quantitative meaning, should be label-coded instead of dummy coded
def beds_to_nums(numbedscategory):
    """Translate a bed-count bucket label into an ordinal code.

    Mapping: '<100' -> 1, '100 - 249' -> 2, '250 - 499' -> 3, '>= 500' -> 4.

    Parameters
    ----------
    numbedscategory : object
        One entry of the ``numbedscategory`` column.

    Returns
    -------
    int or float
        The ordinal code, or ``np.nan`` for any value that is not one of the
        four labels above (missing values included).
    """
    ordered_labels = ('<100', '100 - 249', '250 - 499', '>= 500')
    for code, label in enumerate(ordered_labels, start=1):
        if numbedscategory == label:
            return code
    return np.nan

# Encode the bed-count bucket of every stay's hospital as an ordinal number.
static['numbedscategory'] = static['numbedscategory'].map(beds_to_nums)

#Convert teaching status to boolean value
def teach_to_tf(teachingstatus):
    """Convert a 't' / 'f' teaching-status flag into a boolean.

    Parameters
    ----------
    teachingstatus : object
        One entry of the ``teachingstatus`` column.

    Returns
    -------
    bool or float
        ``True`` for ``'t'``, ``False`` for ``'f'``, ``np.nan`` for anything
        else (missing values included).
    """
    if teachingstatus == 't':
        return True
    if teachingstatus == 'f':
        return False
    return np.nan

# Turn the 't' / 'f' teaching-status flags into booleans.
static['teachingstatus'] = static['teachingstatus'].apply(teach_to_tf)

# NOTE: the step that turned `region` into a numerical category was deleted,
# since `region` should be dummy coded in the preprocessing phase.
# (Written as a comment rather than a bare string so that the cell does not
# echo the note as its output.)

'\ndeleted the part that makes region into the numerical category,\nsince it should be dummy coded in the preprocessing phase.\n'

In [None]:
# hospitalid is dropped here, after having served as the join key above.
static = static.drop(columns='hospitalid')
# Make the age column numeric.
static['age'] = static['age'].astype(float)
print(static.columns)
print(static.shape)
static.head()

Index(['patientunitstayid', 'apachescore', 'gender', 'age', 'ethnicity',
       'apacheadmissiondx', 'admissionheight', 'hospitaladmitsource',
       'unittype', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'numbedscategory', 'teachingstatus', 'region'],
      dtype='object')
(27063, 16)


Unnamed: 0,patientunitstayid,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,unittype,unitadmitsource,unitvisitnumber,unitstaytype,admissionweight,numbedscategory,teachingstatus,region
0,242154,77.0,Female,46,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,Med-Surg ICU,Direct Admit,1,admit,53.09,4.0,False,Midwest
1,242290,77.0,Female,75,Other/Unknown,"Sepsis, unknown",165.1,Direct Admit,Med-Surg ICU,Direct Admit,1,admit,73.3,4.0,False,Midwest
2,242474,64.0,Male,52,Native American,Drug withdrawal,177.8,,Med-Surg ICU,Emergency Department,1,admit,94.0,4.0,False,Midwest
3,242505,78.0,Female,90,Caucasian,"Sepsis, cutaneous/soft tissue",165.1,Emergency Department,Med-Surg ICU,Emergency Department,1,admit,61.8,4.0,False,Midwest
4,242714,-1.0,Male,78,Caucasian,"Hematoma, subdural",182.9,Direct Admit,Med-Surg ICU,Direct Admit,1,admit,89.2,4.0,False,Midwest


In [None]:
# Inspect the column dtypes of the static feature frame.
static.dtypes

patientunitstayid        int64
apachescore            float64
gender                  object
age                    float64
ethnicity               object
apacheadmissiondx       object
admissionheight        float64
hospitaladmitsource     object
unittype                object
unitadmitsource         object
unitvisitnumber          int64
unitstaytype            object
admissionweight        float64
numbedscategory        float64
teachingstatus            bool
region                  object
dtype: object

In [None]:
# adding static features to the main dataframe (left join keeps every cohort row)
merged_df = comp.merge(static, on='patientunitstayid', how='left')
print(merged_df.shape)
merged_df.head()

(27063, 18)


Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,unittype,unitadmitsource,unitvisitnumber,unitstaytype,admissionweight,numbedscategory,teachingstatus,region
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,Med-Surg ICU,Direct Admit,1,admit,53.09,4.0,False,Midwest
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.1,Direct Admit,Med-Surg ICU,Direct Admit,1,admit,73.3,4.0,False,Midwest
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.8,,Med-Surg ICU,Emergency Department,1,admit,94.0,4.0,False,Midwest
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.1,Emergency Department,Med-Surg ICU,Emergency Department,1,admit,61.8,4.0,False,Midwest
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.9,Direct Admit,Med-Surg ICU,Direct Admit,1,admit,89.2,4.0,False,Midwest


### HistoryFeatures

- Pulls whether history was marked for a patient, for each different history option in eICU's pasthistory table.

- The reference paper's clinician advisor grouped the history descriptions together; these groups were used to create the history categories.

- The end result flags each patientunitstayid as True or False for each history category.

In [None]:
#%% Inputs
# Each row of HistoryFeatureLists.csv is used below as the list of
# pasthistory paths that belong to one history category.
paths = pd.read_csv(f"{feature_path}HistoryFeatureLists.csv")
pathlistlist = paths.to_numpy().astype(str).tolist()

# HistoryListNames.csv supplies the category names; the table is flattened
# row by row into one flat list of strings.
names = pd.read_csv(f"{feature_path}HistoryListNames.csv")
nameslist = names.to_numpy().astype(str).ravel().tolist()
print(nameslist)

['AICD', 'Angina', 'Arrythmia', 'CHF', 'CABG', 'Hypertension', 'MI', 'Pacemaker', 'PVD', 'PCI', 'PulmEmb', 'HeartTransp', 'ValveDis', 'VenThromb', 'Cushing', 'Hypercalcemia', 'hyperthyroid', 'hypothyroid', 'diabetes', 'Steroid Use', 'Cirrhosis', 'Hypersplenism', 'PUD', 'LiverTransp', 'AplasticAnemia', 'Chemotherapy', 'RadiationTherapy', 'Cancer', 'ClottingDisorder', 'HemolyticAnemia', 'HypercoagulableCondition', 'Oncology', 'MyeloproliferativeDisease', 'SickleCellDisease', 'ImmuneSuppr', 'Dementia', 'IntracranialMass', 'NeuromuscularDis', 'Seizures', 'Strokes', 'TIA', 'Asthma', 'COPD', 'RespiratoryFail', 'RestrictiveDis', 'LungTransp', 'Sarcoidosis', 'StoneDisease', 'NeurogenicBladd', 'RenalFail', 'RenalInsuff', 'RTA', 'RenalTransp', 'Rheumatic']


In [None]:
# import the history data: only the stay id and the history path are used,
# so read just those two columns instead of loading everything and dropping
# the other six afterwards.
hist = pd.read_csv(eicu_path+"pastHistory.csv",
                   usecols=['patientunitstayid','pasthistorypath'])
hist.head()

Unnamed: 0,patientunitstayid,pasthistorypath
0,141168,notes/Progress Notes/Past History/Organ System...
1,141168,notes/Progress Notes/Past History/Organ System...
2,141168,notes/Progress Notes/Past History/Organ System...
3,141168,notes/Progress Notes/Past History/Past History...
4,141168,notes/Progress Notes/Past History/Organ System...


In [None]:
#%% only keep data with relevant patient unit stay ids
complist = comp['patientunitstayid'].tolist()
in_cohort = hist['patientunitstayid'].isin(complist)
compHist = hist.loc[in_cohort]
print(len(complist))
print(len(compHist))

27063
185213


In [None]:
#%% for each history category, flag the stays that have at least one matching pasthistory row.
features = pd.DataFrame(comp['patientunitstayid'])

# Every category name needs its own list of paths.
if len(pathlistlist) < len(nameslist):
    raise ValueError("HistoryFeatureLists has fewer rows than HistoryListNames has names")

# Walk the category names and their path lists in lockstep.
for name, category_paths in zip(nameslist, pathlistlist):
    # stay ids with at least one history row whose path belongs to this category
    matching_ids = compHist.loc[compHist['pasthistorypath'].isin(category_paths),
                                'patientunitstayid']
    # Compare the ids in their native dtype. The earlier version cast them to
    # str first, so isin() compared the integer ids of `features` against
    # strings and never found a match.
    features[name] = features['patientunitstayid'].isin(matching_ids)

print(features.shape)
features.head()

(27063, 55)


Unnamed: 0,patientunitstayid,AICD,Angina,Arrythmia,CHF,CABG,Hypertension,MI,Pacemaker,PVD,...,RestrictiveDis,LungTransp,Sarcoidosis,StoneDisease,NeurogenicBladd,RenalFail,RenalInsuff,RTA,RenalTransp,Rheumatic
0,242154,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,242290,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,242474,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,242505,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,242714,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Prefix every history flag with "history_"; the join key keeps its name.
features = features.rename(
    columns=lambda col: col if col == "patientunitstayid" else f"history_{col}"
)
features.columns

Index(['patientunitstayid', 'history_AICD', 'history_Angina',
       'history_Arrythmia', 'history_CHF', 'history_CABG',
       'history_Hypertension', 'history_MI', 'history_Pacemaker',
       'history_PVD', 'history_PCI', 'history_PulmEmb', 'history_HeartTransp',
       'history_ValveDis', 'history_VenThromb', 'history_Cushing',
       'history_Hypercalcemia', 'history_hyperthyroid', 'history_hypothyroid',
       'history_diabetes', 'history_Steroid Use', 'history_Cirrhosis',
       'history_Hypersplenism', 'history_PUD', 'history_LiverTransp',
       'history_AplasticAnemia', 'history_Chemotherapy',
       'history_RadiationTherapy', 'history_Cancer',
       'history_ClottingDisorder', 'history_HemolyticAnemia',
       'history_HypercoagulableCondition', 'history_Oncology',
       'history_MyeloproliferativeDisease', 'history_SickleCellDisease',
       'history_ImmuneSuppr', 'history_Dementia', 'history_IntracranialMass',
       'history_NeuromuscularDis', 'history_Seizures', 'histo

In [None]:
# Attach the history flags to the main dataframe.
merged_df = merged_df.merge(features, on='patientunitstayid', how='left')
merged_df.columns

Index(['patientunitstayid', 'Class', 'final_offset', 'apachescore', 'gender',
       'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight',
       'hospitaladmitsource', 'unittype', 'unitadmitsource', 'unitvisitnumber',
       'unitstaytype', 'admissionweight', 'numbedscategory', 'teachingstatus',
       'region', 'history_AICD', 'history_Angina', 'history_Arrythmia',
       'history_CHF', 'history_CABG', 'history_Hypertension', 'history_MI',
       'history_Pacemaker', 'history_PVD', 'history_PCI', 'history_PulmEmb',
       'history_HeartTransp', 'history_ValveDis', 'history_VenThromb',
       'history_Cushing', 'history_Hypercalcemia', 'history_hyperthyroid',
       'history_hypothyroid', 'history_diabetes', 'history_Steroid Use',
       'history_Cirrhosis', 'history_Hypersplenism', 'history_PUD',
       'history_LiverTransp', 'history_AplasticAnemia', 'history_Chemotherapy',
       'history_RadiationTherapy', 'history_Cancer',
       'history_ClottingDisorder', 'history_Hemolyt

In [None]:
# Size of the merged frame, followed by a preview of its first rows.
print(merged_df.shape)
merged_df.head()

(27063, 72)


Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_RestrictiveDis,history_LungTransp,history_Sarcoidosis,history_StoneDisease,history_NeurogenicBladd,history_RenalFail,history_RenalInsuff,history_RTA,history_RenalTransp,history_Rheumatic
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,False,False,False,False,False,False,False
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.1,Direct Admit,...,False,False,False,False,False,False,False,False,False,False
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.8,,...,False,False,False,False,False,False,False,False,False,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.1,Emergency Department,...,False,False,False,False,False,False,False,False,False,False
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.9,Direct Admit,...,False,False,False,False,False,False,False,False,False,False


### AdmissionDx

In [None]:
# import data
# (file name changed from patient to admissionDx)
admissiondx = (
    pd.read_csv(f"{eicu_path}admissionDx.csv")
    .sort_values(["patientunitstayid", "admissiondxid"])
)
admissiondx.head()

Unnamed: 0,admissiondxid,patientunitstayid,admitdxenteredoffset,admitdxpath,admitdxname,admitdxtext
116971,467412,141168,61,admission diagnosis|Was the patient admitted f...,No,No
116972,467413,141168,61,admission diagnosis|Non-operative Organ System...,Cardiovascular,Cardiovascular
116970,467414,141168,61,admission diagnosis|All Diagnosis|Non-operativ...,"Rhythm disturbance (atrial, supraventricular)","Rhythm disturbance (atrial, supraventricular)"
116975,744543,141194,671,admission diagnosis|Was the patient admitted f...,No,No
116974,744544,141194,671,admission diagnosis|Non-operative Organ System...,Cardiovascular,Cardiovascular


#### UrgentAdmission

In [None]:
# Determines if an admission is elective or not
def determine_urgency(string):
    """Return 0 for the elective admission path and 1 for anything else.

    Parameters
    ----------
    string : object
        One ``admitdxpath`` value.

    Returns
    -------
    int
        ``0`` only when ``string`` equals
        ``'admission diagnosis|Elective|Yes'``; ``1`` otherwise.
    """
    is_elective = string == 'admission diagnosis|Elective|Yes'
    return 0 if is_elective else 1

# Determines precedence for multiple admission diagnoses.
def precedence(array):
    """Collapse several urgency flags of one stay into a single flag.

    Parameters
    ----------
    array : container
        Urgency flags (0 / 1); membership is tested with ``in``.

    Returns
    -------
    int
        ``0`` as soon as the container holds a 0, otherwise ``1``.
    """
    return 0 if 0 in array else 1

In [None]:
# Flag every admission-diagnosis row: 0 = elective path, 1 = anything else.
# Series.map visits only the one column that is needed (no row-wise apply).
admissiondx['urgent'] = admissiondx['admitdxpath'].map(determine_urgency)

# Collapse to one flag per stay. A stay is 0 as soon as one of its rows is 0,
# otherwise 1 -- for 0/1 flags that is the per-stay minimum, i.e. the same
# rule as precedence(). groupby replaces the row-by-row Python loop;
# sort=False keeps the stays in order of first appearance, as the loop did.
temp_dict = (
    admissiondx.groupby('patientunitstayid', sort=False)['urgent']
    .min()
    .to_dict()
)

In [None]:
# One row per stay id (taken from the dict keys) with its urgency flag.
urgent_feature = pd.DataFrame.from_dict(temp_dict, orient='index', columns = ['urgentadmission'])
print(urgent_feature.value_counts(), "\n")
# Move the stay id from the index into a regular column.
urgent_feature = (
    urgent_feature
    .assign(patientunitstayid=urgent_feature.index)
    .reset_index(drop=True)
)
urgent_feature

urgentadmission
1                  147286
0                   30577
Name: count, dtype: int64 



Unnamed: 0,urgentadmission,patientunitstayid
0,1,141168
1,1,141194
2,1,141197
3,1,141203
4,1,141208
...,...,...
177858,1,3353235
177859,1,3353237
177860,1,3353251
177861,1,3353254


In [None]:
# Attach the urgent-admission flag; stays without a flag get NaN (left join).
merged_df = merged_df.merge(urgent_feature, on='patientunitstayid', how='left')
print(merged_df.shape)
merged_df.head()

(27063, 73)


Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_LungTransp,history_Sarcoidosis,history_StoneDisease,history_NeurogenicBladd,history_RenalFail,history_RenalInsuff,history_RTA,history_RenalTransp,history_Rheumatic,urgentadmission
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,False,False,False,False,False,False,1.0
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.1,Direct Admit,...,False,False,False,False,False,False,False,False,False,1.0
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.8,,...,False,False,False,False,False,False,False,False,False,1.0
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.1,Emergency Department,...,False,False,False,False,False,False,False,False,False,1.0
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.9,Direct Admit,...,False,False,False,False,False,False,False,False,False,1.0


In [None]:
# Snapshot of the merged frame before the admission-diagnosis classes are added.
merged_df_copy = merged_df.copy()

#### AdmissionDiagnosis

In [None]:
#%% Remove the leading "admission diagnosis|" segment of every diagnosis path
# The number of characters to cut is derived from the prefix itself instead of
# a bare 20, and the slice is done with the vectorised .str accessor rather
# than a row-wise DataFrame.apply.
ADMIT_DX_PREFIX_LEN = len("admission diagnosis|")
admissiondx['strippeddxpath'] = admissiondx['admitdxpath'].str[ADMIT_DX_PREFIX_LEN:]
admissiondx['strippeddxpath']

116971    Was the patient admitted from the O.R. or went...
116972    Non-operative Organ Systems|Organ System|Cardi...
116970    All Diagnosis|Non-operative|Diagnosis|Cardiova...
116975    Was the patient admitted from the O.R. or went...
116974    Non-operative Organ Systems|Organ System|Cardi...
                                ...                        
116964    Non-operative Organ Systems|Organ System|Gastr...
116966    All Diagnosis|Non-operative|Diagnosis|Gastroin...
116968    Was the patient admitted from the O.R. or went...
116967    Non-operative Organ Systems|Organ System|Respi...
116969    All Diagnosis|Non-operative|Diagnosis|Respirat...
Name: strippeddxpath, Length: 626858, dtype: object

In [None]:
# Convert a diagnosis path to one of 4 classes (None when no rule matches)
def determine_class(string):
    """Classify a stripped admission-diagnosis path.

    Parameters
    ----------
    string : str
        Diagnosis path without the leading 'admission diagnosis|' segment,
        e.g. 'All Diagnosis|Non-operative|Diagnosis|...'.

    Returns
    -------
    str or None
        "Neurology/neurosurgery", "Surgery", "Trauma" or "Medical".
        ``None`` when the path matches none of the rules below.
    """
    # Neurology wins over every other rule.
    if ('Neurology' in string) or ('Neurologic' in string):
        return "Neurology/neurosurgery"

    if string.startswith('All Diagnosis'):
        # The segment right after 'All Diagnosis|' starts at index 14.
        # The previous test compared the 10-character slice string[14:24]
        # with the 9-character word 'Operative', which can never be equal
        # when the path continues after that word, so operative diagnoses
        # fell through to Trauma/Medical.
        if string.startswith('Operative', 14):
            return "Surgery"
        return "Trauma" if 'Trauma' in string else "Medical"

    if 'Non-operative Organ Systems' in string:
        return "Trauma" if 'Trauma' in string else "Medical"

    if 'Operative Organ Systems' in string:
        return "Surgery"

    # No rule matched.
    return None

In [None]:
# Determine diagnosis class for each row
admissiondx['diagnosisclass'] = admissiondx['strippeddxpath'].map(determine_class)

# One indicator column per diagnosis class (rows without a class get no indicator)
diagnosis_classes = pd.get_dummies(admissiondx['diagnosisclass'])

# Per stay, an indicator is set when any of the stay's rows has that class;
# the class columns get the "Dx_" prefix, the stay id keeps its name.
adm_dx = (
    diagnosis_classes
    .groupby(admissiondx['patientunitstayid'])
    .max()
    .add_prefix("Dx_")
    .reset_index()
)
adm_dx

Unnamed: 0,patientunitstayid,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma
0,141168,True,False,False,False
1,141194,True,False,False,False
2,141197,True,False,False,False
3,141203,True,False,False,False
4,141208,False,True,False,False
...,...,...,...,...,...
177858,3353235,True,False,False,False
177859,3353237,True,False,False,False
177860,3353251,True,False,False,False
177861,3353254,True,False,False,False


- The reference code: the precedence function determines which category takes precedence when a patient has multiple diagnosis categories. It takes an array of categories as input and returns the highest-precedence category based on the predefined codes: Neurology/neurosurgery (4), Trauma (3), Medical (2), Surgery (1), and Unknown (0).
- Here the category is instead kept as an object variable so that it can be one-hot encoded, giving each patient ID a 0/1 value for each category.
- The model will then analyze whether each category is important and how important it is.

In [None]:
# Attach the diagnosis-class indicators to the main dataframe.
merged_df = merged_df.merge(adm_dx, on='patientunitstayid', how='left')
print(merged_df.shape)
merged_df.head()

(27063, 77)


Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_RenalFail,history_RenalInsuff,history_RTA,history_RenalTransp,history_Rheumatic,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,False,False,1.0,True,False,False,False
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.1,Direct Admit,...,False,False,False,False,False,1.0,True,False,False,False
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.8,,...,False,False,False,False,False,1.0,False,True,False,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.1,Emergency Department,...,False,False,False,False,False,1.0,True,False,False,False
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.9,Direct Admit,...,False,False,False,False,False,1.0,False,True,False,False


### Final feature list + df extraction

In [None]:
# save off the dataframe in case you specifically need to extract only the static features
# merged_df.to_csv(feature_path+'merged_features_static.csv',index=False)

# check if the dataframe has been successfully saved off
# merged_df = pd.read_csv(feature_path+"merged_features_static.csv")
# merged_df

In [None]:
# Display the merged static feature frame.
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_RenalFail,history_RenalInsuff,history_RTA,history_RenalTransp,history_Rheumatic,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,False,False,1.0,True,False,False,False
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,False,False,False,False,False,1.0,True,False,False,False
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,False,False,False,False,False,1.0,False,True,False,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,False,False,False,False,False,1.0,True,False,False,False
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,False,False,False,False,False,1.0,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,False,False,False,False,False,1.0,True,False,False,False
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,False,False,False,False,False,1.0,False,False,False,True
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,False,False,False,False,False,1.0,False,True,False,False
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,False,False,False,False,False,1.0,False,False,False,True


In [None]:
# save off the list of column names (one name per row)
feature_names = pd.DataFrame(merged_df.columns.to_list())
feature_names.to_csv(f"{feature_path}featurelist_static.csv", index=False)

- **note**: additional columns regarding `medication` and `comorbidity` should be merged in separate notebooks

## 2. Dynamic Model

- The dynamic features extracted below are added on top of all the static feature columns.

### AKI_24hr

- pulls whether a patient had AKI, by searching diagnosis strings that included acute renal failure, or traumatic renal injury.

In [None]:
#Only keep the stays that had delirium testing.
diagnosis_columns = ['patientunitstayid','diagnosisoffset','diagnosisstring','icd9code']
diag = pd.read_csv(f"{eicu_path}diagnosis.csv", usecols=diagnosis_columns)
diag = diag.loc[diag['patientunitstayid'].isin(comp['patientunitstayid'])]

# Number of diagnosis rows with an offset of at most 1440 and with a negative offset.
print("~24hr:", int((diag['diagnosisoffset'] <= 1440).sum()))
print("~admission:", int((diag['diagnosisoffset'] < 0).sum()))

~24hr: 156989
~admission: 2441


In [None]:
#Only keep diagnoses done in the first 24 hours of the ICU stay or earlier.
# .copy() makes the filtered frame independent so the column assignments
# below do not write into a slice of the previous frame.
diag = diag[diag['diagnosisoffset']<=1440].copy() # reference

#Make the text columns lowercase (non-string cells such as NaN are left as is).
# DataFrame.applymap is deprecated in recent pandas; the two text columns are
# mapped explicitly instead, the numeric columns need no change.
for text_column in ('diagnosisstring', 'icd9code'):
    diag[text_column] = diag[text_column].map(
        lambda s: s.lower() if isinstance(s, str) else s
    )
diag

Unnamed: 0,patientunitstayid,diagnosisoffset,diagnosisstring,icd9code
73403,242154,204,pulmonary|respiratory failure|acute respirator...,"518.81, j96.00"
73405,242154,204,pulmonary|pulmonary infections|pneumonia|aspir...,"507.0, j69.0"
73413,242154,687,pulmonary|respiratory failure|acute respirator...,"518.81, j96.00"
73414,242154,687,pulmonary|pulmonary infections|pneumonia|aspir...,"507.0, j69.0"
73421,242290,165,infectious diseases|systemic/other infections|...,995.90
...,...,...,...,...
2710066,3353077,95,neurologic|cns mass lesions|cerebral mass of u...,784.2
2710067,3353077,1404,neurologic|misc|headache,r51
2710071,3353077,1404,neurologic|misc|headache,r51
2710074,3353077,1404,neurologic|cns mass lesions|brain abscess,"324.0, g06.0"


In [None]:
#Only keep AKI related diagnoses
search_terms_list = ['acute renal failure','traumatic renal injury'] #Tried aki, got nothing.
# The terms are joined into one "a|b" pattern; rows without a diagnosis string count as no match.
is_aki = diag['diagnosisstring'].str.contains('|'.join(search_terms_list), na=False)

#Just get the stay IDs, one row per stay.
# Built in one step instead of drop_duplicates(inplace=True) on a slice,
# which raises a SettingWithCopyWarning.
diag = diag.loc[is_aki, ['patientunitstayid']].drop_duplicates()

In [None]:
#Create a column for the feature: True when the stay id is among the ids left in `diag`.
merged_df['24hr_AKI'] = merged_df['patientunitstayid'].isin(diag['patientunitstayid'])
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_RenalInsuff,history_RTA,history_RenalTransp,history_Rheumatic,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma,24hr_AKI
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,False,1.0,True,False,False,False,False
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,False,False,False,False,1.0,True,False,False,False,True
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,False,False,False,False,1.0,False,True,False,False,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,False,False,False,False,1.0,True,False,False,False,False
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,False,False,False,False,1.0,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,False,False,False,False,1.0,True,False,False,False,False
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,False,False,False,False,1.0,False,False,False,True,False
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,False,False,False,False,1.0,False,True,False,False,False
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,False,False,False,False,1.0,False,False,False,True,False


### First_24hr_Surgical_Patients

In [None]:
#%% Load in data.
# Columns needed from each table.
admission_dx_columns = ['patientunitstayid', 'admitdxenteredoffset',
                        'admitdxpath','admitdxname']
apache_pred_columns = ['patientunitstayid', 'admitdiagnosis',
                       'electivesurgery', 'admitsource']
treatment_columns = ['patientunitstayid', 'treatmentoffset',
                     'treatmentstring']

adm_dx = pd.read_csv(f"{eicu_path}admissionDx.csv", usecols=admission_dx_columns)
apache = pd.read_csv(f"{eicu_path}apachePredVar.csv", usecols=apache_pred_columns)
treat = pd.read_csv(f"{eicu_path}treatment.csv", usecols=treatment_columns)

In [None]:
#treat df
#Only keep the stays that had delirium testing.
treat = treat.loc[treat['patientunitstayid'].isin(comp['patientunitstayid'])]

# Number of treatment rows with an offset of at most 1440 and with a negative offset.
print("~24hr:", int((treat['treatmentoffset'] <= 1440).sum()))
print("~admission:", int((treat['treatmentoffset'] < 0).sum()))

~24hr: 240989
~admission: 3351


In [None]:
#%% Remove irrelevant patients, and treatments after 24 hrs.
# Boolean filters that build new frames replace the in-place drop() loop,
# which modified frames that were themselves slices of other frames.
cohort_ids = comp['patientunitstayid']
adm_dx = adm_dx[adm_dx['patientunitstayid'].isin(cohort_ids)]
apache = apache[apache['patientunitstayid'].isin(cohort_ids)]
treat = treat[treat['patientunitstayid'].isin(cohort_ids)
              & (treat['treatmentoffset'] <= 1440)]

#%% Identify surgical patients in AdmissionDx/ApachePredVar
all_dxs = adm_dx[['admitdxpath']].drop_duplicates().sort_values('admitdxpath')

#Keyword searching through all paths.
# Raw strings keep the regex escapes intact ("O\.R\." in a normal string is an
# invalid escape sequence); na=False treats a missing path as "no match".
operative = all_dxs[all_dxs['admitdxpath'].str.contains(r"\|Operative", na=False)]
oper_room = all_dxs[all_dxs['admitdxpath'].str.contains(r"O\.R\.", na=False)]
elective = all_dxs[all_dxs['admitdxpath'].str.contains("Elective", na=False)]

#Find all patients that had operative dxs.
op_dx_pats = adm_dx[adm_dx['admitdxpath'].isin(operative['admitdxpath'])]
op_dx_pats = op_dx_pats[['patientunitstayid']].drop_duplicates()

#Get all rows that had operation diagnoses of some sort.
op_dx_pats_info = adm_dx[adm_dx['patientunitstayid'].isin(
    op_dx_pats['patientunitstayid'])]

#Only keep patients that had S- prefixes in admit diagnosis.
# NOTE(review): str.contains matches 'S-' anywhere in the string, not only as
# a prefix -- confirm whether str.startswith('S-') is what is intended.
s_pats_info = apache[apache['admitdiagnosis'].str.contains('S-',na=False)]
s_pats = s_pats_info[['patientunitstayid']]

#Combine the info together.
admdx_apache = op_dx_pats.merge(s_pats,on='patientunitstayid',how='outer')

"""
#%% Identify surgical patients from treatment.

#Using Youn-hoa's manual review document to filter out non-surgical stuff.
review = pd.read_excel('relevant_treatment_descriptions_yhj.xls',
                       usecols=['treatmentstring','Surgery?'])
relevant_treat_str = review[review['Surgery?']==1]['treatmentstring']
treat_pats = treat[treat['treatmentstring'].isin(relevant_treat_str)]
treat_pats = treat_pats[['patientunitstayid']]

#Combine with admdx and apache info.
all_three = admdx_apache.merge(treat_pats,on='patientunitstayid',how='outer')
all_three = all_three['patientunitstayid'].drop_duplicates()
"""

"\n#%% Identify surgical patients from treatment.\n\n#Using Youn-hoa's manual review document to filter out non-surgical stuff.\nreview = pd.read_excel('relevant_treatment_descriptions_yhj.xls',\n                       usecols=['treatmentstring','Surgery?'])\nrelevant_treat_str = review[review['Surgery?']==1]['treatmentstring']\ntreat_pats = treat[treat['treatmentstring'].isin(relevant_treat_str)]\ntreat_pats = treat_pats[['patientunitstayid']]\n\n#Combine with admdx and apache info.\nall_three = admdx_apache.merge(treat_pats,on='patientunitstayid',how='outer')\nall_three = all_three['patientunitstayid'].drop_duplicates()\n"

- The treatment-based filter for non-surgical entries could not be applied because the separate review spreadsheet (xls) was not available.
  - 19/04/2024: the xls file has since been found, but it has not yet been applied in the current code.

In [None]:
# Display the stay IDs flagged as surgical via admissionDx and/or apachePredVar.
admdx_apache

Unnamed: 0,patientunitstayid
0,2902236
1,2902358
2,2902844
3,2902876
4,2903085
...,...
4182,1058356
4183,1453465
4184,1614188
4185,1629039


In [None]:
# Flag stays identified as surgical within the first 24 h (1 = surgical, 0 = not).
# `isin` must be given the ID column: passing the whole DataFrame makes pandas
# compare against its column labels, so every stay ends up flagged 0.
surgical_ids = admdx_apache['patientunitstayid']
merged_df['24hr_surgical'] = merged_df['patientunitstayid'].isin(surgical_ids).astype(int)
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_RTA,history_RenalTransp,history_Rheumatic,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma,24hr_AKI,24hr_surgical
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,1.0,True,False,False,False,False,0
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,False,False,False,1.0,True,False,False,False,True,0
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,False,False,False,1.0,False,True,False,False,False,0
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,False,False,False,1.0,True,False,False,False,False,0
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,False,False,False,1.0,False,True,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,False,False,False,1.0,True,False,False,False,False,0
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,False,False,False,1.0,False,False,False,True,False,0
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,False,False,False,1.0,False,True,False,False,False,0
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,False,False,False,1.0,False,False,False,True,False,0


### First24hrDialysis

In [None]:
# Raw eICU tables that can indicate dialysis: care-plan items, the APACHE APS
# dialysis flag, and treatment strings.
cpl = pd.read_csv(
    f"{eicu_path}carePlanGeneral.csv",
    usecols=['patientunitstayid','cplitemoffset','cplitemvalue'],
)
apache = pd.read_csv(
    f"{eicu_path}apacheApsVar.csv",
    usecols=['patientunitstayid','dialysis'],
)
treat = pd.read_csv(
    f"{eicu_path}treatment.csv",
    usecols=['patientunitstayid', 'treatmentoffset', 'treatmentstring'],
)

In [None]:
# Distribution of the APACHE APS dialysis flag across all loaded stays.
apache['dialysis'].value_counts()

dialysis
0    164868
1      6309
Name: count, dtype: int64

In [None]:
# Peek at the first rows of the treatment table to see the treatmentstring format.
treat.head()

Unnamed: 0,patientunitstayid,treatmentoffset,treatmentstring
0,242040,198,cardiovascular|hypertension|angiotensin II rec...
1,242040,198,cardiovascular|myocardial ischemia / infarctio...
2,242040,198,infectious diseases|medications|therapeutic an...
3,242040,616,cardiovascular|non-operative procedures|diagno...
4,242040,618,infectious diseases|medications|therapeutic an...


In [None]:
#Only keep the stays that had delirium testing.
cpl, treat = (
    frame[frame['patientunitstayid'].isin(comp['patientunitstayid'])]
    for frame in (cpl, treat)
)

# Row counts per table: charted up to 24 h (1440 min), and charted before
# unit admission (negative offset).
cpl_offsets = cpl['cplitemoffset']
treat_offsets = treat['treatmentoffset']

print("cpl")
print("~24hr:", (cpl_offsets <= 1440).sum())
print("~admission:", (cpl_offsets < 0).sum())
print("\n","treat")
print("~24hr:", (treat_offsets <= 1440).sum())
print("~admission:", (treat_offsets < 0).sum())

cpl
~24hr: 336880
~admission: 10425

 treat
~24hr: 240989
~admission: 3351


In [None]:
#%% Filter out irrelevant rows.

#Just get data for our patient stays.
cohort_ids = comp['patientunitstayid']
cpl, apache, treat = (
    frame[frame['patientunitstayid'].isin(cohort_ids)]
    for frame in (cpl, apache, treat)
)

# Source 1: care-plan items charted up to 24 h whose value is exactly 'Dialysis'.
cpl = cpl[(cpl['cplitemoffset'] <= 1440) & (cpl['cplitemvalue'] == 'Dialysis')]

# Source 2: APACHE APS dialysis flag (this table carries no offset column here).
apache = apache[apache['dialysis'] == 1]

# Source 3: treatments charted up to 24 h whose string mentions 'dialysis'.
treat = treat[treat['treatmentoffset'] <= 1440]
treat = treat[treat['treatmentstring'].str.contains('dialysis')]

# Union of the three sources: one entry per stay, fresh 0..n-1 index.
dialysis = (
    pd.concat([cpl['patientunitstayid'],
               apache['patientunitstayid'],
               treat['patientunitstayid']])
    .drop_duplicates()
    .reset_index(drop=True)
)
dialysis

0        980561
1        996955
2       1010141
3       1041547
4       1054485
         ...   
1363    3193826
1364    3195123
1365    3333989
1366    3337257
1367    3340068
Name: patientunitstayid, Length: 1368, dtype: int64

In [None]:
# 1 when the stay appears in the dialysis ID list built above, else 0.
had_dialysis = merged_df['patientunitstayid'].isin(dialysis.to_numpy())
merged_df['24hr_dialysis'] = had_dialysis.astype(int)
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_RenalTransp,history_Rheumatic,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma,24hr_AKI,24hr_surgical,24hr_dialysis
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,1.0,True,False,False,False,False,0,0
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,False,False,1.0,True,False,False,False,True,0,0
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,False,False,1.0,False,True,False,False,False,0,0
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,False,False,1.0,True,False,False,False,False,0,0
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,False,False,1.0,False,True,False,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,False,False,1.0,True,False,False,False,False,0,0
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,False,False,1.0,False,False,False,True,False,0,0
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,False,False,1.0,False,True,False,False,False,0,0
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,False,False,1.0,False,False,False,True,False,0,0


### metabolicacidosis24


In [None]:
# Load all tables
diagnosis = pd.read_csv(
    f"{eicu_path}diagnosis.csv",
    usecols=['patientunitstayid', 'icd9code', 'diagnosisoffset'],
)
lab = pd.read_csv(
    f"{eicu_path}lab.csv",
    usecols=['patientunitstayid', 'labresultoffset', 'labname', 'labresult'],
)
# Alias (not a copy) of the cohort backup taken near the top of the notebook.
delirium = comp_copy

In [None]:
# Inspect the raw icd9code strings (entries can hold an ICD-9 and ICD-10 code
# together, comma-separated, as the output below shows).
diagnosis['icd9code']

0          414.00, I25.10
1                     NaN
2           491.20, J44.9
3           491.20, J44.9
4            428.0, I50.9
                ...      
2710667      599.0, N39.0
2710668      038.9, A41.9
2710669      584.9, N17.9
2710670      578.9, K92.2
2710671    415.19, I26.99
Name: icd9code, Length: 2710672, dtype: object

In [None]:
# exploration of the metabolicacidosis offset
metacid_codes = ['276.4', '276.2, E87.2', '276.4, E87.4']
metacid_diagnosis = diagnosis.loc[diagnosis['icd9code'].isin(metacid_codes)]
metacid_offsets = metacid_diagnosis['diagnosisoffset']

print(metacid_offsets.describe())
print("\n")
# Counts of diagnosis rows with offset <= 0 and with offset <= 1440 minutes.
print("# diagnosis before the admission:", (metacid_offsets <= 0).sum())
print("# diagnosis up to 24hrs:", (metacid_offsets <= 1440).sum())

count     29438.000000
mean       4766.861811
std        7048.273270
min      -10061.000000
25%         638.000000
50%        2347.500000
75%        5819.500000
max      153813.000000
Name: diagnosisoffset, dtype: float64


# diagnosis before the admission: 193
# diagnosis up to 24hrs: 11659


In [None]:
# Build the first-24h metabolic acidosis flag for every cohort stay.
# A stay is positive when it has either (a) a metabolic-acidosis diagnosis code
# charted up to 24 h, or (b) a bicarbonate < 24 and a pH < 7.35 lab result, both
# up to 24 h and within 12 h (720 min) of each other.

# Pull metabolic acidosis from diagnosis table
# Pull corresponding metabolic acidosis icd9 codes
# (exact-string match on the listed icd9code values only)
metacid_diagnosis = diagnosis[diagnosis['icd9code'].isin(['276.4', '276.2, E87.2', '276.4, E87.4'])]

# Only keep first 24 hours
# NOTE(review): there is no lower bound, so pre-admission (negative offset)
# rows are included here and in the lab filters below -- confirm intended.
metacid24_diagnosis = metacid_diagnosis[metacid_diagnosis['diagnosisoffset'] <= 1440]


# Metabolic acidosis is bicarb < 24 and pH <7.35 within 12 hours of each other
# Find bicarbonate from lab
bicarb_lab = lab[lab['labname'].isin(['Total CO2', 'bicarbonate', 'HCO3'])]
# First 24 hours
bicarb24_lab = bicarb_lab[bicarb_lab['labresultoffset'] <= 1440]
# Bicarbonate <24
ma_bicarb24_lab = bicarb24_lab[bicarb24_lab['labresult'] < 24]
# Find pH from lab
pH_lab = lab[lab['labname'] == 'pH']
pH24_lab = pH_lab[pH_lab['labresultoffset'] <= 1440]

# pH <7.35
ma_pH24_lab = pH24_lab[pH24_lab['labresult'] < 7.35]
# inner join to get only patients who had both lab results and make sure result offsets are within 12 hrs
# (every low-bicarbonate row is paired with every low-pH row of the same stay;
#  the overlapping lab columns get the _bi / _pH suffixes)
ma_bipH24_lab = ma_bicarb24_lab.join(ma_pH24_lab.set_index('patientunitstayid'),
                                     on='patientunitstayid', how='inner', lsuffix='_bi', rsuffix='_pH')
ma_bipH24_lab['laboffsetdiff'] = abs(ma_bipH24_lab['labresultoffset_bi'] - ma_bipH24_lab['labresultoffset_pH'])
# Keep only pairs measured at most 720 minutes (12 h) apart.
metacid24_lab = ma_bipH24_lab[ma_bipH24_lab['laboffsetdiff'] <= 720]


# outer join diagnosis and lab for full list
# (one row per stay that qualifies through either route)
metacid24 = metacid24_diagnosis.join(metacid24_lab.set_index('patientunitstayid'), on='patientunitstayid', how='outer')\
    .drop_duplicates('patientunitstayid')
# Compare to list of delirium patients to add positive or negative metabolic acidosis diagnosis to relevant patients
# (`_merge == 'both'` means the cohort stay was found in metacid24)
delirium_metacid = delirium.merge(metacid24, on='patientunitstayid', how='left', indicator=True)
delirium_metacid['metabolicacidosis'] = np.where(delirium_metacid['_merge'] == 'both', 1, 0)

# Drop unnecessary columns
delirium_ma = delirium_metacid[['patientunitstayid', 'metabolicacidosis']]


# If patient has no diagnosis or lab data, designate them as unknown
# (all_pats = every stay with at least one row in diagnosis or lab)
all_pats = diagnosis.drop_duplicates('patientunitstayid').merge(lab.drop_duplicates('patientunitstayid'),
                                                                on='patientunitstayid', how='outer')
all_pats = all_pats[['patientunitstayid']]
delirium_ma = delirium_ma.merge(all_pats, how='left', indicator=True)
# NOTE(review): mixing the string 'na' with the integer flag inside np.where
# presumably yields a string-typed column ('0' / '1' / 'na') -- confirm that
# downstream code expects strings rather than integers / NaN.
delirium_ma['24hr_metabolicacidosis'] = np.where(delirium_ma['_merge'] == 'left_only', 'na', delirium_ma['metabolicacidosis'])


In [None]:
# Keep just the stay ID and the final flag (drops the helper/indicator columns).
metacid_columns = ['patientunitstayid', '24hr_metabolicacidosis']
delirium_ma = delirium_ma[metacid_columns]
delirium_ma

Unnamed: 0,patientunitstayid,24hr_metabolicacidosis
0,242154,0
1,242290,0
2,242474,1
3,242505,0
4,242714,0
...,...,...
27058,3347353,0
27059,3347960,0
27060,3349086,0
27061,3349342,1


In [None]:
# merge
# Left join keeps every row of merged_df and attaches the metabolic acidosis flag.
merged_df = merged_df.merge(delirium_ma, how='left', on='patientunitstayid')
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,history_Rheumatic,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma,24hr_AKI,24hr_surgical,24hr_dialysis,24hr_metabolicacidosis
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,1.0,True,False,False,False,False,0,0,0
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,False,1.0,True,False,False,False,True,0,0,0
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,False,1.0,False,True,False,False,False,0,0,1
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,False,1.0,True,False,False,False,False,0,0,0
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,False,1.0,False,True,False,False,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,False,1.0,True,False,False,False,False,0,0,0
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,False,1.0,False,False,False,True,False,0,0,0
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,False,1.0,False,True,False,False,False,0,0,0
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,False,1.0,False,False,False,True,False,0,0,1


### VentilationBinary24Hours

In [None]:
#%% Load in data.

# Three sources of ventilation evidence: the APACHE APS 'vent' flag,
# physical-exam text, and treatment strings.
apache, phys, treat = (
    pd.read_csv(eicu_path + file_name, usecols=columns)
    for file_name, columns in (
        ("apacheApsVar.csv", ['patientunitstayid','vent']),
        ("physicalExam.csv", ['patientunitstayid','physicalexamoffset',
                              'physicalexamtext']),
        ("treatment.csv", ['patientunitstayid','treatmentoffset',
                           'treatmentstring']),
    )
)

In [None]:
#%% Get feature and save it off.
cohort_ids = comp['patientunitstayid']

# APACHE: cohort stays whose 'vent' flag is 1 (no offset column is loaded for it).
apache = apache[apache['patientunitstayid'].isin(cohort_ids)
                & (apache['vent'] == 1)]

# Physical exam: cohort stays with a 'ventilated' entry charted up to 24 h (1440 min).
phys = phys[phys['patientunitstayid'].isin(cohort_ids)
            & (phys['physicalexamoffset'] <= 1440)
            & (phys['physicalexamtext'] == 'ventilated')]

# Treatment: cohort rows charted up to 24 h, split by ventilation type.
treat = treat[treat['patientunitstayid'].isin(cohort_ids)
              & (treat['treatmentoffset'] <= 1440)]
treatment_text = treat['treatmentstring']
mech = treat[treatment_text.str.contains('mechanical ventilation')]
noninv = treat[treatment_text.str.contains('non-invasive ventilation')]

#Just get patientunitstayids that had ventilation in first 24 hours.
vent_ids = pd.concat(
    [frame[['patientunitstayid']] for frame in (phys, mech, noninv, apache)]
).drop_duplicates()

In [None]:
# True when the stay appears among the ventilated IDs collected above.
# NOTE(review): stored as bool, whereas 24hr_surgical / 24hr_dialysis are 0/1
# integers -- confirm the mixed encoding is acceptable downstream.
was_vented = merged_df['patientunitstayid'].isin(vent_ids['patientunitstayid'])
merged_df['24hr_vented'] = was_vented
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,urgentadmission,Dx_Medical,Dx_Neurology/neurosurgery,Dx_Surgery,Dx_Trauma,24hr_AKI,24hr_surgical,24hr_dialysis,24hr_metabolicacidosis,24hr_vented
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,1.0,True,False,False,False,False,0,0,0,True
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,1.0,True,False,False,False,True,0,0,0,False
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,1.0,False,True,False,False,False,0,0,1,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,1.0,True,False,False,False,False,0,0,0,False
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,1.0,False,True,False,False,False,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,1.0,True,False,False,False,False,0,0,0,False
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,1.0,False,False,False,True,False,0,0,0,False
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,1.0,False,True,False,False,False,0,0,0,False
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,1.0,False,False,False,True,False,0,0,1,True


### TemperatureFirst24Hours

In [None]:
#%%Get temp data.
start_time = time.time()

# Only these nurseCharting columns are needed; skipping the rest keeps each
# chunk small.
temp_columns = ['patientunitstayid', 'nursingchartoffset',
                'nursingchartcelltypevallabel', 'nursingchartcelltypevalname',
                'nursingchartvalue']
cohort_ids = comp['patientunitstayid']

# Filter every chunk down to the rows of interest and concatenate once at the
# end (growing a DataFrame inside the loop re-copies all earlier rows on every
# iteration, and seeding it with an empty frame degrades the column dtypes).
temp_chunks = []
for chunk in pd.read_csv(eicu_path+"nurseCharting.csv",
                         usecols=temp_columns,
                         chunksize=1000000):
    wanted = (
        (chunk['nursingchartcelltypevallabel']=='Temperature')
        #Only keep celsius data, discarding location and F temperature data.
        & (chunk['nursingchartcelltypevalname']=='Temperature (C)')
        & chunk['patientunitstayid'].isin(cohort_ids)
        #Get first 24 hour data only (0..1440 minutes, both ends included).
        & chunk['nursingchartoffset'].between(0, 1440)
    )
    #%% Discard columns I don't care about.
    temp_chunks.append(chunk.loc[wanted, ['patientunitstayid','nursingchartvalue']])

if temp_chunks:
    temper_data = pd.concat(temp_chunks)
else:
    # No chunks were read: fall back to an empty frame with the same columns.
    temper_data = pd.DataFrame(columns=['patientunitstayid','nursingchartvalue'])

#%%Figure out how much data is in C or F
# (disabled exploration; it was run on the rows before the Celsius-only filter)
# celsius = temper_data[temper_data['nursingchartcelltypevalname']=='Temperature (C)']
# fahrenheit = temper_data[temper_data['nursingchartcelltypevalname']=='Temperature (F)']
# celsius = celsius[['patientunitstayid','nursingchartoffset']]
# fahrenheit = fahrenheit[['patientunitstayid','nursingchartoffset']]
# merged = celsius.merge(fahrenheit,how='outer',on=['patientunitstayid','nursingchartoffset'],indicator=True)
# overlap = merged[merged['_merge']=='both']
#Most (99.9394%) of the C/F data is overlapping

#Convert nursingchartvalue data to numbers.
# (astype raises if a value is not parseable as a float.)
temper_data['nursingchartvalue'] = temper_data['nursingchartvalue'].astype(float)


temper_data

Unnamed: 0,patientunitstayid,nursingchartvalue
18071135,242154,36.61404
18071266,242154,37.16964
18072460,242154,37.16964
18073188,242154,36.39180
18073256,242154,36.72516
...,...,...
151431749,3353077,36.60000
151431936,3353077,36.80000
151432131,3353077,36.80000
151432703,3353077,36.40000


In [None]:
#%%Put it in order. Get the min, max, and mean.
per_stay = temper_data.groupby(by=['patientunitstayid'])
minimum, maximum, mean = per_stay.min(), per_stay.max(), per_stay.mean()

#Put all the data together, and save it off.
# Start from the cohort IDs so every stay keeps a row (NaN where no
# temperature was charted), then attach one statistic at a time.
temp = comp[['patientunitstayid']]
for stat_frame, stat_name in ((minimum, '24hr_minimum_temp'),
                              (maximum, '24hr_maximum_temp'),
                              (mean, '24hr_mean_temp')):
    temp = temp.merge(
        stat_frame.rename(columns={'nursingchartvalue': stat_name}),
        how='left', on=['patientunitstayid'])

# Report the time elapsed since start_time was taken in the extraction cell.
end_time = time.time()
total_time = end_time - start_time
print("Total time taken: {:.2f} seconds".format(total_time))

Total time taken: 464.68 seconds


In [None]:
# Attach the three first-24h temperature statistics to the feature table.
merged_df = merged_df.merge(temp, how='left', on='patientunitstayid')
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,Dx_Surgery,Dx_Trauma,24hr_AKI,24hr_surgical,24hr_dialysis,24hr_metabolicacidosis,24hr_vented,24hr_minimum_temp,24hr_maximum_temp,24hr_mean_temp
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,False,False,False,0,0,0,True,36.39180,37.16964,36.814056
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,False,False,True,0,0,0,False,36.78072,37.11408,36.928880
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,False,False,False,0,0,1,False,33.90000,37.55856,36.403583
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,False,False,False,0,0,0,False,36.50292,36.78072,36.653726
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,False,False,False,0,0,0,False,36.28068,37.55856,36.919620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,False,False,False,0,0,0,False,36.30000,36.90000,36.614286
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,False,True,False,0,0,0,False,36.50000,37.10000,36.680000
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,False,False,False,0,0,0,False,36.80000,37.00000,36.900000
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,False,True,False,0,0,1,True,36.60000,39.50000,37.900000


### MeanGCSFirst24hrs

- The Colab runtime crashes when this feature is computed in this notebook, so the precomputed feature file is imported instead.

In [None]:
# Precomputed first-24h mean GCS scores (built by the copied cells further below).
gcs = pd.read_csv(f"{feature_path}first_24hr_GCS_feature.csv")
gcs

Unnamed: 0,patientunitstayid,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal
0,242154,5.8,4.111111,2.200000,12.111111
1,242290,6.0,5.000000,3.800000,14.800000
2,242474,6.0,4.727273,3.636364,14.363636
3,242505,6.0,4.909091,3.454545,14.363636
4,242714,6.0,4.000000,4.000000,14.000000
...,...,...,...,...,...
27058,3347353,,,,14.166667
27059,3347960,,,,12.250000
27060,3349086,,,,13.833333
27061,3349342,,,,7.000000


In [None]:
# Attach the first-24h mean GCS columns to the feature table.
merged_df = merged_df.merge(gcs, how='left', on='patientunitstayid')
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,24hr_dialysis,24hr_metabolicacidosis,24hr_vented,24hr_minimum_temp,24hr_maximum_temp,24hr_mean_temp,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,0,0,True,36.39180,37.16964,36.814056,5.8,4.111111,2.200000,12.111111
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,0,0,False,36.78072,37.11408,36.928880,6.0,5.000000,3.800000,14.800000
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,0,1,False,33.90000,37.55856,36.403583,6.0,4.727273,3.636364,14.363636
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,0,0,False,36.50292,36.78072,36.653726,6.0,4.909091,3.454545,14.363636
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,0,0,False,36.28068,37.55856,36.919620,6.0,4.000000,4.000000,14.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,0,0,False,36.30000,36.90000,36.614286,,,,14.166667
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,0,0,False,36.50000,37.10000,36.680000,,,,12.250000
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,0,0,False,36.80000,37.00000,36.900000,,,,13.833333
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,0,1,True,36.60000,39.50000,37.900000,,,,7.000000




---



#### separate notebook (copied)

- The cells below were run in a separate notebook and are copied here for reference:

In [None]:
#Just get GCS data.
# Header-only read (nrows=0): yields an empty frame carrying the five
# nurseCharting columns used for GCS.
gcs_columns = ['patientunitstayid','nursingchartoffset',
               'nursingchartcelltypevallabel',
               'nursingchartcelltypevalname',
               'nursingchartvalue']
GCS_data = pd.read_csv(eicu_path+"nurseCharting.csv",
                       usecols=gcs_columns,
                       nrows=0)
print(comp.shape)
GCS_data.head()

(27063, 3)


Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue


In [None]:
keep_list = ['Glasgow coma score','Score (Glasgow Coma Scale)']
gcs_columns = ['patientunitstayid','nursingchartoffset',
               'nursingchartcelltypevallabel',
               'nursingchartcelltypevalname',
               'nursingchartvalue']

# nurseCharting is very large (over 150 million rows), so it is read in chunks
# and each chunk is reduced to the GCS rows before the next one is loaded.
# Peak memory stays at one chunk plus the kept rows instead of the whole table.
gcs_chunks = [
    chunk[chunk['nursingchartcelltypevallabel'].isin(keep_list)]
    for chunk in pd.read_csv(eicu_path+"nurseCharting.csv",
                             usecols=gcs_columns,
                             chunksize=1000000)
]
# The original row index of the CSV is preserved by the concatenation.
nurse_GCS = pd.concat(gcs_chunks) if gcs_chunks else pd.DataFrame(columns=gcs_columns)
nurse_GCS

Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue
300,141924,2109,Glasgow coma score,GCS Total,15
480,141924,1894,Glasgow coma score,GCS Total,15
636,141924,424,Glasgow coma score,GCS Total,15
991,141944,484,Glasgow coma score,GCS Total,15
1022,141944,2389,Glasgow coma score,GCS Total,14
...,...,...,...,...,...
151603668,141924,7150,Glasgow coma score,GCS Total,15
151603858,141924,5269,Glasgow coma score,GCS Total,15
151603906,141924,2384,Glasgow coma score,GCS Total,15
151603938,141924,649,Glasgow coma score,GCS Total,15


In [None]:
#Only keep GCS data for patients we care about.
GCS_data = nurse_GCS[nurse_GCS['patientunitstayid'].isin(comp['patientunitstayid'])]

# NOTE(review): the counts below are taken over nurse_GCS (all stays), not the
# cohort-filtered GCS_data -- confirm that is what should be reported.
gcs_offsets = nurse_GCS['nursingchartoffset']
print("~24hr:", (gcs_offsets <= 1440).sum())
print("~admission:", (gcs_offsets < 0).sum())
print("adm~24hr:", ((gcs_offsets >= 0) & (gcs_offsets <= 1440)).sum())

~24hr: 3879906
~admission: 551338
adm~24hr: 3328568


In [None]:
#%%Process data.
#Drop data outside the first 24 hour of ICU stay for each patient.
in_first_day = (GCS_data['nursingchartoffset'] >= 0) & (GCS_data['nursingchartoffset'] <= 1440)
GCS_data = GCS_data.loc[in_first_day].drop(columns='nursingchartoffset')

#Make the data all numeric.
# Values that cannot be parsed become NaN (errors='coerce').
GCS_data = GCS_data.assign(
    patientunitstayid=pd.to_numeric(GCS_data['patientunitstayid'], errors='coerce'),
    nursingchartvalue=pd.to_numeric(GCS_data['nursingchartvalue'], errors='coerce'),
)

#Split out data for the different parts, keeping only the columns we care about.
total_list = ['GCS Total','Value']
component = GCS_data['nursingchartcelltypevalname']
score_columns = ['patientunitstayid','nursingchartvalue']
motor_data = GCS_data.loc[component == 'Motor', score_columns]
verbal_data = GCS_data.loc[component == 'Verbal', score_columns]
eyes_data = GCS_data.loc[component == 'Eyes', score_columns]
total_data = GCS_data.loc[component.isin(total_list), score_columns]


#%% Get mean GCS for each part of the score, and each patient stay.

# One mean per stay for each component, each under its own feature name.
mean_motor = (motor_data.groupby('patientunitstayid', as_index=False).mean()
              .rename(columns={'nursingchartvalue':'24hrMeanMotor'}))
mean_verbal = (verbal_data.groupby('patientunitstayid', as_index=False).mean()
               .rename(columns={'nursingchartvalue':'24hrMeanVerbal'}))
mean_eyes = (eyes_data.groupby('patientunitstayid', as_index=False).mean()
             .rename(columns={'nursingchartvalue':'24hrMeanEyes'}))
mean_total = (total_data.groupby('patientunitstayid', as_index=False).mean()
              .rename(columns={'nursingchartvalue':'24hrMeanTotal'}))

# Left-join onto the cohort IDs so every stay keeps a row (NaN where a
# component was never charted in the window).
temp = comp[['patientunitstayid']]
for component_means in (mean_motor, mean_verbal, mean_eyes, mean_total):
    temp = temp.merge(component_means, how='left', on='patientunitstayid')

In [None]:
# Display the per-stay first-24h mean GCS table built in the previous cell.
temp

Unnamed: 0,patientunitstayid,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal
0,242154,5.8,4.111111,2.200000,12.111111
1,242290,6.0,5.000000,3.800000,14.800000
2,242474,6.0,4.727273,3.636364,14.363636
3,242505,6.0,4.909091,3.454545,14.363636
4,242714,6.0,4.000000,4.000000,14.000000
...,...,...,...,...,...
27058,3347353,,,,14.166667
27059,3347960,,,,12.250000
27060,3349086,,,,13.833333
27061,3349342,,,,7.000000


### RASSComaFirst24hrs

- The Colab runtime crashes when this feature is computed in this notebook, so the precomputed feature file is imported instead.

In [None]:
# Load the precomputed first-24h RASS / coma feature file and shorten the RASS
# column names to the '24hr_' prefix.
# NOTE(review): 'First24hrComa' is left un-renamed -- confirm that is intended.
rass_renames = {
    'First24hrMinRASS': '24hr_MinRASS',
    'First24hrMeanRASS': '24hr_MeanRASS',
    'First24hrMaxRASS': '24hr_MaxRASS',
}
rasscoma = (
    pd.read_csv(feature_path+'first_24hr_rass_and_coma_feature.csv')
    .rename(columns=rass_renames)
)

rasscoma

Unnamed: 0,patientunitstayid,First24hrComa,24hr_MinRASS,24hr_MeanRASS,24hr_MaxRASS
0,242154,0.0,-3.0,-2.272727,-2.0
1,242290,,,,
2,242474,0.0,-1.0,-0.400000,0.0
3,242505,,,,
4,242714,,,,
...,...,...,...,...,...
27058,3347353,,,,
27059,3347960,,,,
27060,3349086,,,,
27061,3349342,,,,


In [None]:
# Attach the first-24h coma flag and RASS statistics to the feature table.
merged_df = merged_df.merge(rasscoma, how='left', on='patientunitstayid')
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,24hr_maximum_temp,24hr_mean_temp,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal,First24hrComa,24hr_MinRASS,24hr_MeanRASS,24hr_MaxRASS
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,37.16964,36.814056,5.8,4.111111,2.200000,12.111111,0.0,-3.0,-2.272727,-2.0
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,37.11408,36.928880,6.0,5.000000,3.800000,14.800000,,,,
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,37.55856,36.403583,6.0,4.727273,3.636364,14.363636,0.0,-1.0,-0.400000,0.0
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,36.78072,36.653726,6.0,4.909091,3.454545,14.363636,,,,
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,37.55856,36.919620,6.0,4.000000,4.000000,14.000000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,36.90000,36.614286,,,,14.166667,,,,
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,37.10000,36.680000,,,,12.250000,,,,
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,37.00000,36.900000,,,,13.833333,,,,
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,39.50000,37.900000,,,,7.000000,,,,




---



#### separate notebook (copied)

- The cells below were run in a separate notebook and are copied here for reference:

In [None]:
# Size check: reads only the ID column of nurseCharting to get its row count.
pd.read_csv(eicu_path+"nurseCharting.csv", usecols=['patientunitstayid']).shape

(151604232, 1)

In [None]:
keep_list = ['RASS','SEDATION SCORE','Sedation Scale/Score/Goal']
rass_columns = ['patientunitstayid','nursingchartoffset',
                'nursingchartcelltypevallabel',
                'nursingchartcelltypevalname',
                'nursingchartvalue']

# nurseCharting is very large (over 150 million rows), so it is read in chunks
# and each chunk is reduced to the sedation/RASS rows before the next one is
# loaded. Peak memory stays at one chunk plus the kept rows instead of the
# whole table. (For a quick partial run, pass nrows=... to read_csv.)
rass_chunks = [
    chunk[chunk['nursingchartcelltypevallabel'].isin(keep_list)]
    for chunk in pd.read_csv(eicu_path+"nurseCharting.csv",
                             usecols=rass_columns,
                             chunksize=1000000)
]
# The original row index of the CSV is preserved by the concatenation.
rass_data = pd.concat(rass_chunks) if rass_chunks else pd.DataFrame(columns=rass_columns)
rass_data

Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue
18070682,242154,6604,Sedation Scale/Score/Goal,Sedation Score,-1
18070731,242154,7680,Sedation Scale/Score/Goal,Sedation Score,-2
18070758,242154,3999,Sedation Scale/Score/Goal,Sedation Scale,RASS
18070763,242154,5162,Sedation Scale/Score/Goal,Sedation Scale,RASS
18070826,242154,6604,Sedation Scale/Score/Goal,Sedation Scale,RASS
...,...,...,...,...,...
147501799,3246235,913,Sedation Scale/Score/Goal,Sedation Score,0
147501839,3246235,1823,Sedation Scale/Score/Goal,Sedation Score,0
147501843,3246235,1823,Sedation Scale/Score/Goal,Sedation Scale,RASS
147501847,3246235,2041,Sedation Scale/Score/Goal,Sedation Scale,RASS


In [None]:
# Keep an independent backup of the filtered rows so the slow CSV read above
# does not have to be repeated.
rass_data_copy = rass_data.copy()

In [None]:
# Restore the working frame from the backup. A copy is taken (rather than
# binding the same object) so that later in-place edits to rass_data cannot
# alter rass_data_copy.
rass_data = rass_data_copy.copy()
rass_data

Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue
18070682,242154,6604,Sedation Scale/Score/Goal,Sedation Score,-1
18070731,242154,7680,Sedation Scale/Score/Goal,Sedation Score,-2
18070758,242154,3999,Sedation Scale/Score/Goal,Sedation Scale,RASS
18070763,242154,5162,Sedation Scale/Score/Goal,Sedation Scale,RASS
18070826,242154,6604,Sedation Scale/Score/Goal,Sedation Scale,RASS
...,...,...,...,...,...
147501799,3246235,913,Sedation Scale/Score/Goal,Sedation Score,0
147501839,3246235,1823,Sedation Scale/Score/Goal,Sedation Score,0
147501843,3246235,1823,Sedation Scale/Score/Goal,Sedation Scale,RASS
147501847,3246235,2041,Sedation Scale/Score/Goal,Sedation Scale,RASS


In [None]:
# Merge the sedation score and scale data together to see what was worth keeping.
#%% Clean up and combine the RASS data.
label = rass_data['nursingchartcelltypevallabel']

#Get the 'RASS' data.
rass = rass_data[label == 'RASS']
#Get the 'SEDATION SCORE' data.
caps_score = rass_data[label == 'SEDATION SCORE']

#Get the other data: rows under the generic 'Sedation Scale/Score/Goal' label.
generic = rass_data[label == 'Sedation Scale/Score/Goal']
generic_name = generic['nursingchartcelltypevalname']

# 'Sedation Scale' rows name the scale that was used; drop the data that isn't
# RASS and keep only the (stay, offset) keys.
used_rass_scale = (generic_name == 'Sedation Scale') & (generic['nursingchartvalue'] == 'RASS')
scale = generic.loc[used_rass_scale, ['patientunitstayid','nursingchartoffset']]

#Get the scores, and only keep the ones that were RASS (i.e. that share a
#stay and offset with a RASS scale entry).
# NOTE(review): if scale holds repeated (stay, offset) keys the inner merge
# would repeat score rows -- confirm whether that can happen.
score = generic[generic_name == 'Sedation Score'].merge(
    scale, on=['patientunitstayid','nursingchartoffset'], how='inner')

In [None]:
print(rass.shape)
rass.head()

(344272, 5)


Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue
67910511,1553444,150,RASS,Value,-4
67910563,1553444,18,RASS,Value,-3
67910564,1553444,2070,RASS,Value,1
67910565,1553444,10562,RASS,Value,0
67910568,1553444,570,RASS,Value,-2


In [None]:
print(caps_score.shape)
caps_score.head()

(264608, 5)


Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue
85117010,1964009,1194,SEDATION SCORE,Value,-2
85117190,1964009,2672,SEDATION SCORE,Value,-2
85119274,1964192,1743,SEDATION SCORE,Value,0
85119322,1964192,3123,SEDATION SCORE,Value,-1
85119337,1964192,6003,SEDATION SCORE,Value,-3


In [None]:
print(scale.shape)
scale.head()

(1398122, 2)


Unnamed: 0,patientunitstayid,nursingchartoffset
18070758,242154,3999
18070763,242154,5162
18070826,242154,6604
18070843,242154,6370
18070883,242154,6034


In [None]:
print(score.shape)
score.head()

(1377665, 5)


Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartcelltypevallabel,nursingchartcelltypevalname,nursingchartvalue
0,242154,6604,Sedation Scale/Score/Goal,Sedation Score,-1
1,242154,7680,Sedation Scale/Score/Goal,Sedation Score,-2
2,242154,6370,Sedation Scale/Score/Goal,Sedation Score,-1
3,242154,5162,Sedation Scale/Score/Goal,Sedation Score,-2
4,242154,6034,Sedation Scale/Score/Goal,Sedation Score,-2


In [None]:
#%%Process data.
#Stack the 3 sources of RASS data and keep only the id, offset and value columns.
wanted_cols = ['patientunitstayid', 'nursingchartoffset','nursingchartvalue']
rass_data = pd.concat([rass,caps_score,score], axis=0)[wanted_cols]

#Keep rows for the stays we care about that also fall inside the first
#24 hours of the ICU stay (offsets 0 to 1440, both ends included).
in_cohort = rass_data['patientunitstayid'].isin(comp['patientunitstayid'])
in_first_24h = rass_data['nursingchartoffset'].between(0, 1440)
rass_data = rass_data[in_cohort & in_first_24h]

rass_data

Unnamed: 0,patientunitstayid,nursingchartoffset,nursingchartvalue
6,242154,1350,-2
14,242154,871,-2
18,242154,527,-3
22,242154,1243,-2
23,242154,995,-2
...,...,...,...
1376801,3160544,274,-1
1376802,3160544,94,-2
1376803,3160544,994,0
1376903,3160572,4,0


In [None]:
#Make the data all numeric (unparseable entries become NaN) and drop offset.
#A new frame is built with .assign instead of writing columns into the
#filtered rass_data, which avoids pandas' SettingWithCopyWarning.
rass_data = rass_data[['patientunitstayid','nursingchartvalue']].assign(
    patientunitstayid=lambda df: pd.to_numeric(df['patientunitstayid'], errors='coerce'),
    nursingchartvalue=lambda df: pd.to_numeric(df['nursingchartvalue'], errors='coerce'))

#%% Get if each patient had coma (RASS of -4/-5)
#Any patients that had RASS data start off marked as no coma.
had_rass = rass_data['patientunitstayid'].drop_duplicates()
coma = rass_data[rass_data['nursingchartvalue']<=-4]
coma = coma['patientunitstayid'].drop_duplicates()
#.assign returns a new frame, so the helper columns are never written into an
#object that another name (e.g. comp_copy) may also be bound to.
comp = comp.assign(had_rass=comp['patientunitstayid'].isin(had_rass),
                   had_coma=comp['patientunitstayid'].isin(coma))

def coma_feature(has_rass,has_coma):
    """Encode the coma flag for a single stay.

    Returns NaN when ``has_rass`` equals False, 1 when ``has_coma`` equals
    True, and 0 in every other case.
    """
    if has_rass == False:
        return np.nan
    return 1 if has_coma == True else 0

#Coma flag per stay: NaN without RASS data, otherwise 1/0 taken from had_coma.
comp['First24hrComa'] = [coma_feature(has_rass, has_coma)
                         for has_rass, has_coma in zip(comp['had_rass'], comp['had_coma'])]

comp = comp[['patientunitstayid','First24hrComa']]

#%% Get each patient's min/mean/max RASS score in the first 24 hours.

#One grouping, reused for the three aggregates.
rass_by_stay = rass_data.groupby('patientunitstayid')
min_rass = rass_by_stay.min().reset_index().rename(
    columns={'nursingchartvalue':'First24hrMinRASS'})
mean_rass = rass_by_stay.mean().reset_index().rename(
    columns={'nursingchartvalue':'First24hrMeanRASS'})
max_rass = rass_by_stay.max().reset_index().rename(
    columns={'nursingchartvalue':'First24hrMaxRASS'})

#Left-join each aggregate so stays without RASS data keep NaN.
for rass_stat in (min_rass, mean_rass, max_rass):
    comp = comp.merge(rass_stat,on='patientunitstayid',how='left')

In [None]:
comp

Unnamed: 0,patientunitstayid,First24hrComa,First24hrMinRASS,First24hrMeanRASS,First24hrMaxRASS
0,242154,0.0,-3.0,-2.272727,-2.0
1,242290,,,,
2,242474,0.0,-1.0,-0.400000,0.0
3,242505,,,,
4,242714,,,,
...,...,...,...,...,...
27058,3347353,,,,
27059,3347960,,,,
27060,3349086,,,,
27061,3349342,,,,


In [None]:
#Save the first-24-hour RASS / coma features. The path is relative, so the
#file lands in the notebook's current working directory.
comp.to_csv('first_24hr_rass_and_coma_feature_1004.csv',index=False)

##### Feature exploration

- Explore data to figure out if RASS, SEDATION SCORE, and Sedation Scale/Score/Goal are equivalent

In [None]:
# 1) Explore RASS
# Read from the raw snapshot: by this point rass_data has been reduced to the
# id/value columns and no longer carries 'nursingchartcelltypevallabel'.
rass = rass_data_copy[rass_data_copy['nursingchartcelltypevallabel']=='RASS']
# .assign builds a new frame, so no value is set on a slice of the snapshot.
rass = rass.assign(nursingchartvalue=pd.to_numeric(rass['nursingchartvalue']))
rass['nursingchartvalue'].value_counts().sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


-5      9988
-4      8343
-3     15082
-2     31240
-1     91823
 0    158014
 1     23085
 2      5249
 3      1223
 4       225
Name: nursingchartvalue, dtype: int64

In [None]:
# 2) Explore SEDATION SCORE
# Read from the raw snapshot: by this point rass_data has been reduced to the
# id/value columns and no longer carries 'nursingchartcelltypevallabel'.
caps_sedat = rass_data_copy[rass_data_copy['nursingchartcelltypevallabel']=='SEDATION SCORE']
# .assign builds a new frame, so no value is set on a slice of the snapshot.
caps_sedat = caps_sedat.assign(nursingchartvalue=pd.to_numeric(caps_sedat['nursingchartvalue']))
caps_sedat['nursingchartvalue'].value_counts().sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


-5    15149
-4    11928
-3    17577
-2    38797
-1    62253
 0    96182
 1    16333
 2     4905
 3     1264
 4      220
Name: nursingchartvalue, dtype: int64

In [None]:
# 3) Explore Sedation Scale/Score/Goal
# Read from the raw snapshot: by this point rass_data has been reduced to the
# id/value columns and no longer carries the label / name columns used here.
low_sedat = rass_data_copy[rass_data_copy['nursingchartcelltypevallabel']=='Sedation Scale/Score/Goal']
low_sedat_counts = low_sedat['nursingchartvalue'].value_counts().sort_index()
print(low_sedat_counts, "\n")

# Value counts for each of the three row types stored under this label.
low_scale = low_sedat[low_sedat['nursingchartcelltypevalname']=='Sedation Scale']
low_scale_counts = low_scale['nursingchartvalue'].value_counts().sort_index()
print('Sedation Scale','\n',low_scale_counts, "\n")

low_score = low_sedat[low_sedat['nursingchartcelltypevalname']=='Sedation Score']
low_score_counts = low_score['nursingchartvalue'].value_counts().sort_index()
print('Sedation Score','\n',low_score_counts, "\n")

low_goal = low_sedat[low_sedat['nursingchartcelltypevalname']=='Sedation Goal']
low_goal_counts = low_goal['nursingchartvalue'].value_counts().sort_index()
print('Sedation Goal','\n',low_goal_counts)

-1         328330
-2         234613
-3         112648
-4          62456
-5          62127
0         1028204
00             35
01              3
1           82865
2           33396
3           24016
4           35012
5            5109
6            1404
7              95
MAAS        14682
RASS      1398122
Ramsay       6462
SAS         52010
Name: nursingchartvalue, dtype: int64 

Sedation Scale 
 MAAS        14682
RASS      1398122
Ramsay       6462
SAS         52010
Name: nursingchartvalue, dtype: int64 

Sedation Score 
 -1    288282
-2    155733
-3     87170
-4     51608
-5     52744
0     637211
00        12
01         3
1      82055
2      32961
3      22062
4      34362
5       5074
6       1257
7         95
Name: nursingchartvalue, dtype: int64 

Sedation Goal 
 -1     40048
-2     78880
-3     25478
-4     10848
-5      9383
0     390993
00        23
1        810
2        435
3       1954
4        650
5         35
6        147
Name: nursingchartvalue, dtype: int64


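- Under the `Sedation Scale/Score/Goal` label each assessment is split across rows: a `Sedation Scale` row names the scale that was used (RASS, SAS, MAAS or Ramsay) and a `Sedation Score` row holds the numeric value. A score therefore counts as RASS only when the `Sedation Scale` row for the same stay and offset says RASS. `Sedation Goal` rows are not used for the features.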

### Infection24Hours

In [None]:
#Get diagnosis information.
diag = pd.read_csv(eicu_path+'diagnosis.csv',
                   usecols=['patientunitstayid','diagnosisoffset',
                            'diagnosisstring','icd9code'])

#%% Get infection info.
#Only keep the stays that had delirium testing.
diag = diag[diag['patientunitstayid'].isin(comp['patientunitstayid'])]

#Only keep diagnoses done in the first 24 hours of the ICU stay or earlier.
diag = diag[diag['diagnosisoffset']<=1440]

#Make it all lowercase. Only the two text columns are touched: the id and
#offset columns are used numerically, and DataFrame.applymap (deprecated in
#recent pandas) would call the function on every cell of those as well.
#Non-string cells (e.g. NaN) pass through unchanged.
lower_if_str = lambda s: s.lower() if type(s) == str else s
diag = diag.assign(diagnosisstring=diag['diagnosisstring'].map(lower_if_str),
                   icd9code=diag['icd9code'].map(lower_if_str))

In [None]:
#Count the retained diagnosis rows: everything up to 24 hours, and the subset
#with a negative offset (charted before ICU admission).
diag_offsets = diag['diagnosisoffset']
print("~24hr:",(diag_offsets<=1440).sum())
print("~admission:",(diag_offsets<0).sum())

~24hr: 156989
~admission: 2441


In [None]:
diag

Unnamed: 0,patientunitstayid,diagnosisoffset,diagnosisstring,icd9code
73403,242154,204,pulmonary|respiratory failure|acute respirator...,"518.81, j96.00"
73405,242154,204,pulmonary|pulmonary infections|pneumonia|aspir...,"507.0, j69.0"
73413,242154,687,pulmonary|respiratory failure|acute respirator...,"518.81, j96.00"
73414,242154,687,pulmonary|pulmonary infections|pneumonia|aspir...,"507.0, j69.0"
73421,242290,165,infectious diseases|systemic/other infections|...,995.90
...,...,...,...,...
2710066,3353077,95,neurologic|cns mass lesions|cerebral mass of u...,784.2
2710067,3353077,1404,neurologic|misc|headache,r51
2710071,3353077,1404,neurologic|misc|headache,r51
2710074,3353077,1404,neurologic|cns mass lesions|brain abscess,"324.0, g06.0"


In [None]:
def shorten_icd9(icd9):
    """Return the first code of an ``icd9code`` entry as a float, or NaN.

    The entry may hold several comma-separated codes (e.g. ``"518.81, j96.00"``);
    only the first one is considered.

    Parameters
    ----------
    icd9 : str or any
        Raw ``icd9code`` cell. Anything that is not a string (NaN, None, ...)
        is treated as missing.

    Returns
    -------
    float
        The first code as a float, or ``np.nan`` when the entry is missing,
        empty, contains a letter in either case, or is otherwise not a
        plain number.
    """
    if not isinstance(icd9, str):
        return np.nan
    #Get the first part separated by commas
    first_code = icd9.split(',')[0].strip()
    #Check if it's empty or has letters in it. If so, get rid of it.
    #(The letter check is case-independent, so it does not rely on the caller
    #having lower-cased the column first.)
    if not first_code or any(ch.isalpha() for ch in first_code):
        return np.nan
    try:
        return float(first_code)
    except ValueError:
        #Digits mixed with stray punctuation etc. - not a usable code.
        return np.nan

#Numeric form of the first ICD9 code of every diagnosis row (NaN if none).
diag = diag.assign(icd9=diag['icd9code'].map(shorten_icd9))

def _first_row_as_floats(code_table):
    """Return the first row of a code table as a list of floats, skipping blank or non-numeric cells."""
    codes = []
    for cell in code_table.iloc[0].tolist():
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        if not np.isnan(value):
            codes.append(value)
    return codes

#Load lists of icd9 codes to look for.
#The codes are converted to floats because diag['icd9'] is a float column:
#matching floats against a list of strings with isin() only works if pandas
#silently converts the strings, which current versions do not do. Cells are
#read as text and parsed with float(), the same way shorten_icd9 parses the
#diagnosis codes, so exact codes compare equal.
rounded_codes = pd.read_csv(feature_path+'ICD9_codes_rounded.csv',header=None,dtype=str)
rounded_codes = _first_row_as_floats(rounded_codes)
exact_codes = pd.read_csv(feature_path+'ICD9_codes_exact.csv',header=None,dtype=str)
exact_codes = _first_row_as_floats(exact_codes)

#Keep the rows that have ICD9 codes related to infections.
rounded_code_stays = diag[np.floor(diag['icd9']).isin(rounded_codes)]
exact_code_stays = diag[diag['icd9'].isin(exact_codes)]

#Keep the rows where the diagnosis string contains these words
search_terms_list = ['infection','infectious']
has_search_term = diag['diagnosisstring'].str.contains(
    '|'.join(search_terms_list),na=False)
#Drop the rows where diagnosis string contains "non-infectious"
is_non_infectious = diag['diagnosisstring'].str.contains('non-infectious',na=False)
string_stays = diag[has_search_term & ~is_non_infectious]

#Combine all the stays.
all_stays = (pd.concat([rounded_code_stays,exact_code_stays,string_stays])
             .drop_duplicates()
             .sort_values(['patientunitstayid','diagnosisoffset']))

#Just get the stay IDs
all_stays = all_stays[['patientunitstayid']].drop_duplicates()

In [None]:
#Create a column for the infection feature.
#The flag is built in a new frame instead of being written into comp, so
#re-running this cell cannot add '24hr_infection' to the frame comp_copy
#refers to.
infection = comp[['patientunitstayid']].assign(
    **{'24hr_infection': comp['patientunitstayid'].isin(all_stays['patientunitstayid'])})
#Reset comp to the original id/label list; copy so the backup stays untouched.
comp = comp_copy.copy()
infection

Unnamed: 0,patientunitstayid,24hr_infection
0,242154,True
1,242290,True
2,242474,False
3,242505,True
4,242714,True
...,...,...
27058,3347353,False
27059,3347960,True
27060,3349086,False
27061,3349342,False


In [None]:
#Attach the infection flag. A '24hr_infection' column left by an earlier run
#of this cell is dropped first; otherwise the merge would create
#24hr_infection_x / 24hr_infection_y instead of a single column.
merged_df = pd.merge(merged_df.drop(columns=['24hr_infection'], errors='ignore'),
                     infection, on='patientunitstayid', how='left')
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,24hr_mean_temp,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal,First24hrComa,24hr_MinRASS,24hr_MeanRASS,24hr_MaxRASS,24hr_infection
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,36.814056,5.8,4.111111,2.200000,12.111111,0.0,-3.0,-2.272727,-2.0,True
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,36.928880,6.0,5.000000,3.800000,14.800000,,,,,True
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,36.403583,6.0,4.727273,3.636364,14.363636,0.0,-1.0,-0.400000,0.0,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,36.653726,6.0,4.909091,3.454545,14.363636,,,,,True
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,36.919620,6.0,4.000000,4.000000,14.000000,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,36.614286,,,,14.166667,,,,,False
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,36.680000,,,,12.250000,,,,,True
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,36.900000,,,,13.833333,,,,,False
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,37.900000,,,,7.000000,,,,,False


### Final feature list + df extraction

In [None]:
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,24hr_mean_temp,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal,First24hrComa,24hr_MinRASS,24hr_MeanRASS,24hr_MaxRASS,24hr_infection
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,36.814056,5.8,4.111111,2.200000,12.111111,0.0,-3.0,-2.272727,-2.0,True
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,36.928880,6.0,5.000000,3.800000,14.800000,,,,,True
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,36.403583,6.0,4.727273,3.636364,14.363636,0.0,-1.0,-0.400000,0.0,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,36.653726,6.0,4.909091,3.454545,14.363636,,,,,True
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,36.919620,6.0,4.000000,4.000000,14.000000,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,36.614286,,,,14.166667,,,,,False
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,36.680000,,,,12.250000,,,,,True
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,36.900000,,,,13.833333,,,,,False
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,37.900000,,,,7.000000,,,,,False


In [None]:
merged_df.columns

Index(['patientunitstayid', 'Class', 'final_offset', 'apachescore', 'gender',
       'age', 'ethnicity', 'apacheadmissiondx', 'admissionheight',
       'hospitaladmitsource', 'unittype', 'unitadmitsource', 'unitvisitnumber',
       'unitstaytype', 'admissionweight', 'numbedscategory', 'teachingstatus',
       'region', 'history_AICD', 'history_Angina', 'history_Arrythmia',
       'history_CHF', 'history_CABG', 'history_Hypertension', 'history_MI',
       'history_Pacemaker', 'history_PVD', 'history_PCI', 'history_PulmEmb',
       'history_HeartTransp', 'history_ValveDis', 'history_VenThromb',
       'history_Cushing', 'history_Hypercalcemia', 'history_hyperthyroid',
       'history_hypothyroid', 'history_diabetes', 'history_Steroid Use',
       'history_Cirrhosis', 'history_Hypersplenism', 'history_PUD',
       'history_LiverTransp', 'history_AplasticAnemia', 'history_Chemotherapy',
       'history_RadiationTherapy', 'history_Cancer',
       'history_ClottingDisorder', 'history_Hemolyt

In [None]:
# write the merged feature table into the feature folder (row index not written)
merged_csv_path = feature_path+'complete_merged_features.csv'
merged_df.to_csv(merged_csv_path,index=False)

In [None]:
# read the saved file back to confirm the write worked;
# note that merged_df now refers to the reloaded table
saved_csv_path = feature_path+'complete_merged_features.csv'
merged_df = pd.read_csv(saved_csv_path)
merged_df

Unnamed: 0,patientunitstayid,Class,final_offset,apachescore,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,...,24hr_mean_temp,24hrMeanMotor,24hrMeanVerbal,24hrMeanEyes,24hrMeanTotal,First24hrComa,24hr_MinRASS,24hr_MeanRASS,24hr_MaxRASS,24hr_infection
0,242154,1,5640.0,77.0,Female,46.0,Caucasian,"Sepsis, pulmonary",157.48,Direct Admit,...,36.814056,5.8,4.111111,2.200000,12.111111,0.0,-3.0,-2.272727,-2.0,True
1,242290,0,1362.0,77.0,Female,75.0,Other/Unknown,"Sepsis, unknown",165.10,Direct Admit,...,36.928880,6.0,5.000000,3.800000,14.800000,,,,,True
2,242474,0,807.0,64.0,Male,52.0,Native American,Drug withdrawal,177.80,,...,36.403583,6.0,4.727273,3.636364,14.363636,0.0,-1.0,-0.400000,0.0,False
3,242505,1,5900.0,78.0,Female,90.0,Caucasian,"Sepsis, cutaneous/soft tissue",165.10,Emergency Department,...,36.653726,6.0,4.909091,3.454545,14.363636,,,,,True
4,242714,0,2712.0,-1.0,Male,78.0,Caucasian,"Hematoma, subdural",182.90,Direct Admit,...,36.919620,6.0,4.000000,4.000000,14.000000,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058,3347353,1,4.0,74.0,Male,67.0,Caucasian,"Encephalopathy, hepatic",172.70,Emergency Department,...,36.614286,,,,14.166667,,,,,False
27059,3347960,1,651.0,61.0,Female,88.0,Caucasian,Pelvis/hip trauma,170.20,Floor,...,36.680000,,,,12.250000,,,,,True
27060,3349086,1,574.0,66.0,Male,68.0,Caucasian,"CVA, cerebrovascular accident/stroke",188.00,Direct Admit,...,36.900000,,,,13.833333,,,,,False
27061,3349342,1,10390.0,25.0,Male,34.0,African American,Head only trauma,167.60,Emergency Department,...,37.900000,,,,7.000000,,,,,False


In [None]:
# save off the list of column names as a one-column table (written under the header "0")
column_names = pd.DataFrame({0: merged_df.columns.to_list()})
column_names.to_csv(feature_path+'featurelist_dynamic.csv',index=False)