# Generate Dataset for diseases

In [1]:
from getpass import getpass
from pathlib import Path

import pandas as pd

import mimic_pipeline.utils as utils

In [2]:
# Ask for the database credentials at run time so they are never stored in the notebook source.
user = input("Enter your username: ")
# getpass hides the typed password; input() would echo it on screen.
password = getpass("Enter your password: ")
# Loader bound to the `eicu` database and schema; later cells index it by table name.
loader = utils.DataBaseLoader(user=user, password=password, dbname='eicu', schema='eicu')

In [3]:
# Read the eICU cohort table from disk, report its dimensions, and preview the first rows.
eicu_df = pd.read_csv(filepath_or_buffer="data/eICU-union-excluded-cmo.csv")
print(eicu_df.shape)
eicu_df.head(n=5)

(106228, 56)


Unnamed: 0,uniquepid,patientunitstayid,apache_iv_prob,apache_iva_prob,oasis_prob,sapsii_prob,hospital_expire_flag,preiculos,age,gcs_min,...,pao2fio2_vent_min,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max
0,002-10034,157016,0.022028,0.025522,0.008392,0.004584,0,3331,23,,...,,1,,,,,,,,
1,002-10063,218742,0.051189,0.032654,0.042509,0.052195,0,2,69,13.0,...,,1,,,,,,,,
2,002-10066,214497,0.061772,0.049788,0.137099,0.680887,0,18,42,8.0,...,,1,,,,,,,,
3,002-10079,151179,0.868431,0.6852,0.077479,0.460662,1,1,59,,...,,1,91.0,91.0,26.0,26.0,7.23,7.23,82.64,82.64
4,002-1010,174826,0.034445,0.027754,0.02293,0.071716,0,34,64,,...,,1,,,,,,,,


In [4]:
# Fetch the `disease_flag` table through the loader's item access.
# NOTE(review): presumably this reads from the eICU database the loader was configured with — confirm in mimic_pipeline.utils.
disease_df = loader['disease_flag']

In [5]:
# Dimensions of the disease-flag table, for comparison with the cohort size.
disease_df.shape

(173109, 5)

In [6]:
# Preview the first rows of the disease-flag table.
disease_df.head()

Unnamed: 0,patientunitstayid,sepsis,ami,heart_failure,akf
0,141168,0,0,1,0
1,141203,0,0,0,0
2,141227,1,0,0,0
3,141229,0,0,1,0
4,141266,0,1,1,1


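Get disease-specific flags: attach the `ami`, `heart_failure`, `akf`, and `sepsis` indicator columns to the eICU cohort by `patientunitstayid`.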

In [7]:
# Left-join the four disease indicators onto the cohort by ICU stay id.
# A left join keeps every cohort row; stays without a match in disease_df get NaN flags.
whole_flag_df = pd.merge(
    eicu_df,
    disease_df[['patientunitstayid', 'ami', 'heart_failure', 'akf', 'sepsis']],
    how='left',
    on='patientunitstayid',
)
whole_flag_df.shape

(106228, 60)

In [8]:
# Number of distinct ICU stays after the merge; it should equal the row count if the join duplicated nothing.
whole_flag_df['patientunitstayid'].nunique(dropna=False)

106228

In [9]:
# Sanity check: list the distinct values each disease flag takes in the source flag table.
for label in ('ami', 'sepsis', 'akf', 'heart_failure'):
    print(disease_df.loc[:, label].unique())

[0 1]
[0 1]
[0 1]
[1 0]


In [10]:
# Same check on the merged frame, plus the number of stays whose flag is missing after the left join.
for label in ('ami', 'sepsis', 'akf', 'heart_failure'):
    print(whole_flag_df.loc[:, label].unique())
    print(whole_flag_df.loc[:, label].isnull().sum())

[ 0. nan  1.]
3657
[ 0. nan  1.]
3657
[ 0. nan  1.]
3657
[ 0. nan  1.]
3657


## Acute Myocardial Infarction (AMI)

In [11]:
# Keep only the stays flagged with AMI; rows whose flag is 0 or NaN are excluded by the comparison.
ami_df = whole_flag_df.loc[whole_flag_df['ami'].eq(1)]
print(ami_df.shape)
# Share of the full merged cohort that carries the AMI flag.
print(f"Percentage of patients: {len(ami_df)/len(whole_flag_df)*100:.1f}%")
# -1 outcomes are recoded to 0 before averaging — presumably -1 marks an unknown outcome; TODO confirm.
print(f"Mortality Rate: {ami_df['hospital_expire_flag'].replace({-1: 0}).mean()*100:.1f}%")

(6501, 60)
Percentage of patients: 6.1%
Mortality Rate: 7.8%


In [12]:
# Guard: the AMI subset must be non-empty and contain only rows with ami == 1.
# An element-wise check avoids relying on NumPy array truthiness, which raises ValueError
# (not AssertionError) when more than one distinct value is present.
assert not ami_df.empty and ami_df['ami'].eq(1).all(), "ami_df must be non-empty and contain only ami == 1 rows"
# Remove all four disease indicators so the exported table holds only ids, scores, outcome and features.
# NOTE: ami_df is reassigned here, so re-running this cell alone fails with KeyError on 'ami';
# re-run the filtering cell first.
ami_df = ami_df.drop(columns=['heart_failure', 'akf', 'sepsis', 'ami'])
ami_df.head()

Unnamed: 0,uniquepid,patientunitstayid,apache_iv_prob,apache_iva_prob,oasis_prob,sapsii_prob,hospital_expire_flag,preiculos,age,gcs_min,...,pao2fio2_vent_min,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max
53,002-11869,181829,0.010858,0.010636,0.013897,0.015163,0,84,53,15.0,...,,1,,,,,,,,
73,002-12548,234190,0.529404,0.465268,0.188911,0.437547,1,1,75,,...,148.75,1,215.0,119.0,56.0,35.0,7.15,7.43,148.5,431.7
78,002-12747,181665,0.021882,0.019303,0.017861,0.019976,0,0,50,15.0,...,,1,,,,,,,,
86,002-12928,157416,0.029328,0.017637,0.054187,0.064649,0,1,60,,...,,1,,,,,,,,
89,002-12983,216223,0.083992,0.067442,0.037612,0.087706,0,15,69,15.0,...,,1,,,,,,,,


In [13]:
# DataFrame.to_csv does not create missing parent folders, so make sure the output directory exists.
Path("data/eicu-disease").mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the exported file.
ami_df.to_csv("data/eicu-disease/ami-union-features-id-no-cmo.csv", index=False)
ami_df.head()

Unnamed: 0,uniquepid,patientunitstayid,apache_iv_prob,apache_iva_prob,oasis_prob,sapsii_prob,hospital_expire_flag,preiculos,age,gcs_min,...,pao2fio2_vent_min,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max
53,002-11869,181829,0.010858,0.010636,0.013897,0.015163,0,84,53,15.0,...,,1,,,,,,,,
73,002-12548,234190,0.529404,0.465268,0.188911,0.437547,1,1,75,,...,148.75,1,215.0,119.0,56.0,35.0,7.15,7.43,148.5,431.7
78,002-12747,181665,0.021882,0.019303,0.017861,0.019976,0,0,50,15.0,...,,1,,,,,,,,
86,002-12928,157416,0.029328,0.017637,0.054187,0.064649,0,1,60,,...,,1,,,,,,,,
89,002-12983,216223,0.083992,0.067442,0.037612,0.087706,0,15,69,15.0,...,,1,,,,,,,,


## Sepsis/Septicemia

In [14]:
# Keep only the stays flagged with sepsis; rows whose flag is 0 or NaN are excluded by the comparison.
sepsis_df = whole_flag_df.loc[whole_flag_df['sepsis'].eq(1)]
print(sepsis_df.shape)
# Share of the full merged cohort that carries the sepsis flag.
print(f"Percentage of patients: {len(sepsis_df)/len(whole_flag_df)*100:.1f}%")
# -1 outcomes are recoded to 0 before averaging — presumably -1 marks an unknown outcome; TODO confirm.
print(f"Mortality Rate: {sepsis_df['hospital_expire_flag'].replace({-1: 0}).mean()*100:.1f}%")

(13254, 60)
Percentage of patients: 12.5%
Mortality Rate: 16.6%


In [15]:
# Guard: the sepsis subset must be non-empty and contain only rows with sepsis == 1.
# An element-wise check avoids relying on NumPy array truthiness, which raises ValueError
# (not AssertionError) when more than one distinct value is present.
assert not sepsis_df.empty and sepsis_df['sepsis'].eq(1).all(), "sepsis_df must be non-empty and contain only sepsis == 1 rows"
# Remove all four disease indicators so the exported table holds only ids, scores, outcome and features.
# NOTE: sepsis_df is reassigned here, so re-running this cell alone fails with KeyError on 'sepsis';
# re-run the filtering cell first.
sepsis_df = sepsis_df.drop(columns=['heart_failure', 'akf', 'ami', 'sepsis'])
sepsis_df.head()

Unnamed: 0,uniquepid,patientunitstayid,apache_iv_prob,apache_iva_prob,oasis_prob,sapsii_prob,hospital_expire_flag,preiculos,age,gcs_min,...,pao2fio2_vent_min,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max
3,002-10079,151179,0.868431,0.6852,0.077479,0.460662,1,1,59,,...,,1,91.0,91.0,26.0,26.0,7.23,7.23,82.64,82.64
10,002-10317,241683,0.261525,0.211771,0.037612,0.196366,0,16,40,9.0,...,,1,68.0,68.0,27.0,27.0,7.48,7.48,154.55,154.55
18,002-10549,215978,0.151991,0.144364,0.068843,0.18102,0,0,66,13.0,...,,1,102.0,102.0,31.0,31.0,7.4,7.4,223.5,223.5
25,002-10882,205928,0.125635,0.086084,0.061106,0.064649,0,3257,56,,...,,1,,,,,,,,
37,002-11201,207434,0.127025,0.099782,0.042509,0.196366,0,12,86,15.0,...,,1,64.0,64.0,42.0,42.0,7.45,7.45,607.0,607.0


In [16]:
# DataFrame.to_csv does not create missing parent folders, so make sure the output directory exists.
Path("data/eicu-disease").mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the exported file.
sepsis_df.to_csv("data/eicu-disease/sepsis-union-features-id-no-cmo.csv", index=False)

## Heart Failure

In [17]:
# Keep only the stays flagged with heart failure; rows whose flag is 0 or NaN are excluded by the comparison.
heart_failure_df = whole_flag_df.loc[whole_flag_df['heart_failure'].eq(1)]
print(heart_failure_df.shape)
# Share of the full merged cohort that carries the heart-failure flag.
print(f"Percentage of patients: {len(heart_failure_df)/len(whole_flag_df)*100:.1f}%")
# -1 outcomes are recoded to 0 before averaging — presumably -1 marks an unknown outcome; TODO confirm.
print(f"Mortality Rate: {heart_failure_df['hospital_expire_flag'].replace({-1: 0}).mean()*100:.1f}%")

(8358, 60)
Percentage of patients: 7.9%
Mortality Rate: 11.9%


In [18]:
# Guard: the heart-failure subset must be non-empty and contain only rows with heart_failure == 1.
# An element-wise check avoids relying on NumPy array truthiness, which raises ValueError
# (not AssertionError) when more than one distinct value is present.
assert not heart_failure_df.empty and heart_failure_df['heart_failure'].eq(1).all(), "heart_failure_df must be non-empty and contain only heart_failure == 1 rows"
# Remove all four disease indicators so the exported table holds only ids, scores, outcome and features.
# NOTE: heart_failure_df is reassigned here, so re-running this cell alone fails with KeyError on
# 'heart_failure'; re-run the filtering cell first.
heart_failure_df = heart_failure_df.drop(columns=['sepsis', 'akf', 'ami', 'heart_failure'])
heart_failure_df.head()

Unnamed: 0,uniquepid,patientunitstayid,apache_iv_prob,apache_iva_prob,oasis_prob,sapsii_prob,hospital_expire_flag,preiculos,age,gcs_min,...,pao2fio2_vent_min,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max
38,002-1127,200581,0.030225,0.016302,0.008392,0.022778,0,672,77,14.0,...,,0,,,,,,,,
73,002-12548,234190,0.529404,0.465268,0.188911,0.437547,1,1,75,,...,148.75,1,215.0,119.0,56.0,35.0,7.15,7.43,148.5,431.7
74,002-12588,213211,0.032468,0.036961,0.054187,0.029295,0,4,44,15.0,...,,1,,,,,,,,
83,002-12876,236992,0.083175,0.075452,0.061106,0.196366,0,1,81,15.0,...,,1,71.0,71.0,43.0,43.0,7.43,7.43,99.9,99.9
96,002-13226,180654,0.182531,0.104143,0.152892,0.266087,0,1,81,15.0,...,,1,,,,,,,,


In [19]:
# DataFrame.to_csv does not create missing parent folders, so make sure the output directory exists.
Path("data/eicu-disease").mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the exported file.
heart_failure_df.to_csv("data/eicu-disease/heart_failure-union-features-id-no-cmo.csv", index=False)

## Acute Kidney Failure

In [20]:
# Keep only the stays flagged with acute kidney failure; rows whose flag is 0 or NaN are excluded by the comparison.
akf_df = whole_flag_df.loc[whole_flag_df['akf'].eq(1)]
print(akf_df.shape)
# Share of the full merged cohort that carries the AKF flag.
print(f"Percentage of patients: {len(akf_df)/len(whole_flag_df)*100:.1f}%")
# -1 outcomes are recoded to 0 before averaging — presumably -1 marks an unknown outcome; TODO confirm.
print(f"Mortality Rate: {akf_df['hospital_expire_flag'].replace({-1: 0}).mean()*100:.1f}%")

(10877, 60)
Percentage of patients: 10.2%
Mortality Rate: 17.3%


In [21]:
# Guard: the AKF subset must be non-empty and contain only rows with akf == 1.
# An element-wise check avoids relying on NumPy array truthiness, which raises ValueError
# (not AssertionError) when more than one distinct value is present.
assert not akf_df.empty and akf_df['akf'].eq(1).all(), "akf_df must be non-empty and contain only akf == 1 rows"
# Remove all four disease indicators so the exported table holds only ids, scores, outcome and features.
# NOTE: akf_df is reassigned here, so re-running this cell alone fails with KeyError on 'akf';
# re-run the filtering cell first.
akf_df = akf_df.drop(columns=['sepsis', 'heart_failure', 'ami', 'akf'])
akf_df.head()

Unnamed: 0,uniquepid,patientunitstayid,apache_iv_prob,apache_iva_prob,oasis_prob,sapsii_prob,hospital_expire_flag,preiculos,age,gcs_min,...,pao2fio2_vent_min,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max
18,002-10549,215978,0.151991,0.144364,0.068843,0.18102,0,0,66,13.0,...,,1,102.0,102.0,31.0,31.0,7.4,7.4,223.5,223.5
24,002-10804,223303,0.007191,0.005387,0.042509,0.019976,0,5,32,14.0,...,,1,,,,,,,,
25,002-10882,205928,0.125635,0.086084,0.061106,0.064649,0,3257,56,,...,,1,,,,,,,,
27,002-10997,173902,0.008308,0.017539,0.009522,0.019976,0,114,64,15.0,...,,0,233.0,206.0,40.0,36.0,7.33,7.34,444.0,470.0
43,002-1146,237899,0.484411,0.469543,0.061106,0.285486,1,9498,54,,...,208.0,1,265.0,90.0,62.0,34.0,7.11,7.41,190.5,396.0


In [22]:
# DataFrame.to_csv does not create missing parent folders, so make sure the output directory exists.
Path("data/eicu-disease").mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the exported file.
akf_df.to_csv("data/eicu-disease/akf-union-features-id-no-cmo.csv", index=False)