## Importing the dataset

In [1]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this is not necessarily the interpreter backing the kernel — confirm if it matters.
!python -V

Python 3.6.3 :: Anaconda custom (64-bit)


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Display the installed pandas version so the environment is recorded alongside the outputs.
pd.__version__

  (fname, cnt))
  (fname, cnt))


'0.22.0'

In [4]:
# Load the raw extract; skipinitialspace strips blanks that follow each delimiter.
data = pd.read_csv('../data/cardiac_data.csv', skipinitialspace=True)

# Discard the leading column of the file.
# NOTE(review): the original comment called this the patient id column, but a
# `patientid` column is still present afterwards — confirm what the first column holds.
data = data.drop(columns=data.columns[0])

In [5]:
# (rows, columns) of the loaded frame, after the leading column was dropped.
data.shape

(48165, 168)

In [6]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,ïsiteid,hospitalclassificationid,hospitalclassification,publicprivateid,publicprivate,patientid,icuadmityyyymm,icuadmitfinyr,icuadmityyyy,icuadmitmonth,...,anzrodisincluded,anzrodissmr,anzrodriskofdeath,apache3isincluded,apache3issmr,apache3riskofdeath,apache3score,apache2score,infectedall1,majordiag
0,47,1,Rural / Regional,4,Public,2,201710,2017-18,2017,10,...,1,1,0.032667,1,1,0.080221,61.0,19.0,0,med_other
1,47,1,Rural / Regional,4,Public,230,201710,2017-18,2017,10,...,1,1,0.215668,1,0,0.130564,53.0,25.0,0,cardiacarrest
2,108,3,Tertiary,4,Public,5838590P120,201709,2017-18,2017,9,...,1,1,0.652234,1,1,0.632856,109.0,30.0,0,cardiacarrest
3,178,3,Tertiary,4,Public,2270096,201709,2017-18,2017,9,...,1,1,0.156578,1,1,0.337196,75.0,15.0,0,cardiacarrest
4,223,3,Tertiary,4,Public,3032270,201709,2017-18,2017,9,...,1,1,0.102439,1,1,0.057196,39.0,10.0,0,cardiacarrest


In [7]:
# Summarise every column (dtype and non-null count); max_cols is raised to the
# full column count so the listing is not collapsed.
data.info(max_cols=len(data.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48165 entries, 0 to 48164
Data columns (total 168 columns):
ïsiteid                      48165 non-null int64
hospitalclassificationid     48165 non-null int64
hospitalclassification       48165 non-null object
publicprivateid              48165 non-null int64
publicprivate                48165 non-null object
patientid                    48165 non-null object
icuadmityyyymm               48165 non-null int64
icuadmitfinyr                48165 non-null object
icuadmityyyy                 48165 non-null int64
icuadmitmonth                48165 non-null int64
icuadmitweekday              48165 non-null int64
icuadmithour                 48165 non-null int64
icu_ad_dtm                   48165 non-null object
icudisyyyy                   48148 non-null float64
icudismonth                  48148 non-null float64
icudisweekday                48148 non-null float64
icudishour                   48148 non-null float64
icu_ds_dtm                 

## Check patient dropoff for consort diagram

In [10]:
# Starting cohort: a row qualifies as a cardiac-arrest admission when any of the
# three indicators matches (ap3diag 102, ap3_subcode 102.01, or cardarrest 1).
data_cardarr = data.loc[
    data['ap3diag'].eq(102)
    | data['ap3_subcode'].eq(102.01)
    | data['cardarrest'].eq(1)
]
data_cardarr.shape

(48165, 168)

Drop patients who have a treatment limitation. Since most cases have no treatment limitation, NA values are treated as no limitation.

In [11]:
# Remember the row count so the drop-off can be reported for the consort diagram.
previous_shape = len(data_cardarr)

# Keep rows whose treat_lmt is 1 or missing (per the original note, 1 means
# "no treatment limitation"); every other value is filtered out.
data_cardarr = data_cardarr.loc[data_cardarr['treat_lmt'].isin([1, np.nan])]

print('Rows removed: ', previous_shape - len(data_cardarr))
print('New shape: ', data_cardarr.shape)

Rows removed:  4667
New shape:  (43498, 168)


Drop patients whose ICU outcome is a transfer to another ICU. Rows with a missing (NA) ICU outcome are kept.

In [12]:
# Row count before filtering, used to report how many rows this step removes.
previous_shape = len(data_cardarr)

# Retain ICU outcomes 2, 3, 6 and missing values. Per the original note:
# 2 = died, 3 = survived, 6 = transferred to another hospital.
data_cardarr = data_cardarr.loc[data_cardarr['icu_outcm'].isin([2, 3, 6, np.nan])]

print('Rows removed: ', previous_shape - len(data_cardarr))
print('New shape: ', data_cardarr.shape)

Rows removed:  1642
New shape:  (41856, 168)


Drop readmissions

In [13]:
# Row count before filtering, used to report how many rows this step removes.
previous_shape = data_cardarr.shape[0]

# Build the mask from data_cardarr itself. The previous version compared
# `data.readmitted` (the unfiltered frame), whose index no longer matches
# data_cardarr; pandas then has to reindex the boolean key and emits
# "Boolean Series key will be reindexed to match DataFrame index".
# Note: `== 0` is False for missing values, so rows with a missing `readmitted`
# are dropped here, unlike the filters that list np.nan explicitly.
data_cardarr = data_cardarr[
    (data_cardarr.readmitted == 0) # Include only first-time admissions
]

print('Rows removed: ', (previous_shape - data_cardarr.shape[0]))
print('New shape: ', data_cardarr.shape)

Rows removed:  2069
New shape:  (39787, 168)


  after removing the cwd from sys.path.


Include only first-time admissions

In [14]:
# Row count before filtering, used to report how many rows this step removes.
previous_shape = data_cardarr.shape[0]

# Build the mask from data_cardarr itself. The previous version used
# `data.admepisode` (the unfiltered frame), whose index no longer matches
# data_cardarr; pandas then has to reindex the boolean key and emits
# "Boolean Series key will be reindexed to match DataFrame index".
# Rows with admepisode 0, 1 or a missing value are kept.
data_cardarr = data_cardarr[
    (data_cardarr.admepisode.isin([0,1,np.nan])) # Include first-time admissions only
]

print('Rows removed: ', (previous_shape - data_cardarr.shape[0]))
print('New shape: ', data_cardarr.shape)

Rows removed:  44
New shape:  (39743, 168)


  after removing the cwd from sys.path.


## Select rows of interest

**Check for inconsistency**  
*check `icu_outcm` and `died_icu`*  
`icu_outcm`: 2 - died; 3 - survived  
`died_icu`: 0 - survived; 1 - died

In [15]:
# Cross-tabulate the two ICU outcome encodings to check that they agree.
pd.crosstab(index=data_cardarr['icu_outcm'], columns=data_cardarr['died_icu'])

died_icu,0.0,1.0
icu_outcm,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,0,15458
3.0,23167,0
6.0,868,0


In [16]:
# ICU mortality broken down by admission episode value.
pd.crosstab(index=data_cardarr['admepisode'], columns=data_cardarr['died_icu'])

died_icu,0.0,1.0
admepisode,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22667,15447
1,1368,11


*check `hosp_outcm` and `died_hosp`*  
`hosp_outcm`: 2 - died; others - survived  
`died_hosp`: 0 - survived; 1 - died

In [17]:
# Cross-tabulate the two hospital outcome encodings to check that they agree.
pd.crosstab(index=data_cardarr['hosp_outcm'], columns=data_cardarr['died_hosp'])

died_hosp,0.0,1.0
hosp_outcm,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,8,0
2.0,0,18023
3.0,14666,0
4.0,2602,0
5.0,167,0
6.0,4095,0
7.0,22,0
9.0,3,0


*check `died_icu` and `died_hosp`*  

In [18]:
# Compare ICU death against hospital death; a patient flagged as dying in the ICU
# but surviving the hospital stay is an inconsistent record.
pd.crosstab(index=data_cardarr['died_icu'], columns=data_cardarr['died_hosp'])

died_hosp,0.0,1.0
died_icu,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,21393,2500
1.0,13,15442


An inconsistency was found: 13 patients are recorded as having died in the ICU but as having survived the hospital stay.

In [19]:
# Same table as row-wise proportions: each died_icu row sums to 1.
pd.crosstab(
    index=data_cardarr['died_icu'],
    columns=data_cardarr['died_hosp'],
    normalize='index',
)

died_hosp,0.0,1.0
died_icu,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.895367,0.104633
1.0,0.000841,0.999159


**Even among cardiac arrest patients who survived the ICU, about 10.5% still died before leaving hospital.**

Now remove those inconsistent records.

In [20]:
# Row count before filtering, used to report how many rows this step removes.
previous_shape = len(data_cardarr)

# Keep every row except those with died_icu == 1 together with died_hosp == 0
# (expressed as the De Morgan equivalent of that exclusion).
data_cardarr = data_cardarr.loc[
    (data_cardarr['died_icu'] != 1) | (data_cardarr['died_hosp'] != 0)
]

print('Rows removed: ', previous_shape - len(data_cardarr))
print('New shape: ', data_cardarr.shape)

Rows removed:  13
New shape:  (39730, 168)


Remove rows that don't have sex recorded.

In [21]:
# Row count before filtering, used to report how many rows this step removes.
previous_shape = len(data_cardarr)

# Keep only the rows where `sex` is present.
data_cardarr = data_cardarr.loc[data_cardarr['sex'].notna()]

print('Rows removed: ', previous_shape - len(data_cardarr))
print('New shape: ', data_cardarr.shape)

Rows removed:  7
New shape:  (39723, 168)


## Select relevant variables (candidates)

**Target variable**  
This model predicts mortality at the time of discharge from hospital, so `died_hosp` is the target variable; `hosp_outcm`, `icu_outcm` and `died_icu` are left out of the candidate features.  

In [22]:
# Column to predict, kept as a one-element list so it can be concatenated
# with the feature-name lists when selecting columns.
target_var = ['died_hosp']

**Pick relevant features**  
Some variables are obviously irrelevant. First pick a rough set of candidates based on two subsets:
1. Biological background info
1. Highest and lowest physiology in first 24 hours of ICU stay 

then to refine the list through EDA.  

In [23]:
# List every available column name to choose candidate features from.
data_cardarr.columns.values

array(['ïsiteid', 'hospitalclassificationid', 'hospitalclassification',
       'publicprivateid', 'publicprivate', 'patientid', 'icuadmityyyymm',
       'icuadmitfinyr', 'icuadmityyyy', 'icuadmitmonth',
       'icuadmitweekday', 'icuadmithour', 'icu_ad_dtm', 'icudisyyyy',
       'icudismonth', 'icudisweekday', 'icudishour', 'icu_ds_dtm',
       'hosp_ad_dtm', 'hosp_ds_dtm', 'hosp_srce', 'hosp_outcm',
       'hosp_hrs', 'sex', 'prior_icu_ad_dtm', 'prior_icu_ds_dtm',
       'icu_srce', 'icu_hrs', 'pre_icu_hrs', 'icu_ds_dec_dtm',
       'discharge_delay_hrs', 'readmission_lag_hrs', 'icu_outcm',
       'readmitted', 'admepisode', 'died_icu', 'died_hosp', 'age',
       'elect', 'chr_resp', 'chr_cvs', 'chr_liv', 'chr_ren', 'immundis',
       'immunrx', 'aids', 'hepfail', 'lymphoma', 'metast', 'leukaem',
       'immunsup', 'cirrhos', 'iddm', 'caretype', 'weight', 'height',
       'preg_stat', 'thrombpro', 'emg_rsp_adm', 'treat_lmt', 'cardarrest',
       'resparrest', 'indigenous', 'arf', 'int

In [36]:
# Background candidate features, in the order they appear in the selected frame.
biobackground_vars = [
    'sex', 'age',
    'chr_resp', 'chr_cvs', 'chr_liv', 'chr_ren',
    'immundis', 'immunrx', 'aids', 'hepfail', 'lymphoma', 'metast', 'leukaem',
    'immunsup', 'cirrhos', 'iddm',
    'weight', 'height', 'preg_stat', 'elect',
    'pre_icu_hrs', 'intubated', 'ventilated', 'arf',
    'gcsverb', 'gcsmotor', 'gcseye', 'gcs',
    'urineop',
]

In [37]:
# Every physiology measurement contributes two columns, "<stem>hi" followed by
# "<stem>lo" (highest and lowest value in the first 24 hours of the ICU stay).
twentyfour_hr_phys_vars = [
    stem + extreme
    for stem in ('temp', 'hr', 'rr', 'systolic', 'diastolic', 'map', 'na', 'k',
                 'hco3', 'creat', 'hct', 'hmgn', 'wcc', 'plat', 'gluc')
    for extreme in ('hi', 'lo')
]

In [38]:
# Additional score / risk-of-death columns carried along with the candidates.
# NOTE(review): presumably kept for benchmarking rather than as model inputs — confirm.
extra_vars = ['apache3riskofdeath', 'anzrodriskofdeath', 'apache3score']

In [39]:
# Restrict the cohort to the candidate columns: background features, 24-hour
# physiology, the extra score columns and, last, the target.
data_candidate = data_cardarr[
    [*biobackground_vars, *twentyfour_hr_phys_vars, *extra_vars, *target_var]
]
data_candidate.shape

(39723, 63)

## Split into training and test sets
Before doing any EDA on the data, it is best practice to split the data into training and test sets, and to explore only the training set.  
But first, check for missing values in the target variable `died_hosp`.

In [40]:
# Number of rows with no recorded value for the target.
data_candidate['died_hosp'].isna().sum()

157

In [41]:
# The same count expressed as a percentage of all candidate rows.
data_candidate['died_hosp'].isna().sum() / len(data_candidate) * 100

0.3952370163381416

There are 157 instances (about 0.4%) with a missing value for the target variable. We'll drop these cases.

In [42]:
# Row count before filtering, used to report how many rows this step removes.
previous_shape = len(data_candidate)

# Work on a copy so data_candidate itself stays untouched, then drop the rows
# whose target value is missing.
data_candidate_clean = data_candidate.copy().dropna(subset=['died_hosp'])

print('Rows removed: ', previous_shape - len(data_candidate_clean))
print('New shape: ', data_candidate_clean.shape)

Rows removed:  157
New shape:  (39566, 63)


Now split the data into training and test sets (90-10 split).

In [43]:
# Hold out 10% of the rows as the test set. Stratifying on the target keeps the
# class proportions equal across the splits, and the fixed seed makes the split
# reproducible.
data_train, data_test = train_test_split(
    data_candidate_clean,
    test_size=0.1,
    random_state=42,
    stratify=data_candidate_clean['died_hosp'],
)

Let's check to make sure the output variable is correctly stratified across train/test sets

In [44]:
# Share of each target class in the full cleaned data and in both splits,
# shown side by side in a fixed column order.
pd.DataFrame({
    label: frame['died_hosp'].value_counts() / len(frame)
    for label, frame in (
        ('original', data_candidate_clean),
        ('train', data_train),
        ('test', data_test),
    )
})[['original', 'train', 'test']]

Unnamed: 0,original,train,test
0.0,0.544584,0.544581,0.544604
1.0,0.455416,0.455419,0.455396


In [45]:
# Rows in each split, and the fraction of all cleaned rows held out for testing.
(
    data_train.shape[0],
    data_test.shape[0],
    data_test.shape[0] / data_candidate_clean.shape[0],
)

(35609, 3957, 0.100010109690138)

In [46]:
# Shapes of the training and test frames, one per line.
print(data_train.shape, data_test.shape, sep='\n')

(35609, 63)
(3957, 63)


So the split preserves the class proportions of the target and is in a 90-10 ratio.  

## Save the datasets to file

In [47]:
# Each output file name embeds the number of rows written; the index is not saved.

# Filtered cardiac-arrest cohort with all of its columns.
data_cardarr.to_csv(
    '../data/cardiac_data_{}_rows.csv'.format(len(data_cardarr)),
    index=False,
)

# Candidate columns (background + 24-hour physiology) with the target present.
data_candidate_clean.to_csv(
    '../data/cardiac_data_background_and_24hr_phys_{}_rows.csv'.format(len(data_candidate_clean)),
    index=False,
)

# Training split.
data_train.to_csv(
    '../data/cardiac_data_train_{}_rows.csv'.format(len(data_train)),
    index=False,
)

# Test split.
data_test.to_csv(
    '../data/cardiac_data_test_{}_rows.csv'.format(len(data_test)),
    index=False,
)