# Datasets for disease-specific patients

In [1]:
import getpass

import numpy as np
import pandas as pd

import mimic_pipeline.utils as utils

In [2]:
# Ask for the database credentials at run time instead of hardcoding them.
user = input("Enter your username: ")
# getpass does not echo the typed password, so it cannot end up in the saved
# notebook output the way a value typed into input() would.
password = getpass.getpass("Enter your password: ")
# Build the project loader used further down to fetch the 'disease_flag' data.
loader = utils.DataBaseLoader(user=user, password=password)

In [3]:
# Read the train and test feature tables and stack them into a single frame.
train = pd.read_csv("data/TRAIN-union-features-id-excluded-cmo.csv")
test = pd.read_csv("data/TEST-union-features-id-excluded-cmo.csv")
# Both inputs come back from read_csv with an index starting at 0, so a plain
# concat would leave duplicate index labels; ignore_index=True renumbers the
# rows 0..n-1 so label-based lookups on `whole` stay unambiguous.
whole = pd.concat([train, test], axis=0, ignore_index=True)
whole.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
0,88280,198866,254097,4777.0,43.0,15.0,0,3980.0,68.0,118.0,...,2,,,,,,,,,-1
1,97666,158220,287837,1.0,50.0,15.0,0,1050.0,77.0,120.0,...,1,,,,,,,,,-1
2,6688,189787,231393,238.0,38.0,15.0,1,2000.0,101.0,116.0,...,0,458.0,139.0,52.0,34.0,7.21,7.47,284.0,284.0,-1
3,90992,181692,207539,286.0,80.0,15.0,0,1145.0,46.0,76.0,...,0,168.0,80.0,41.0,27.0,7.36,7.42,,,-1
4,29446,121959,278581,1.0,75.0,13.0,0,3040.0,67.0,92.0,...,1,,,,,,,,,-1


In [4]:
# Dimensions (rows, columns) of the concatenated train + test frame.
whole.shape

(29265, 53)

In [5]:
# Look up the 'disease_flag' entry on the loader. The next cell reads the
# columns hadm_id, ami, heart_failure, akf and sepsis from the result.
# NOTE(review): presumably this runs a database query — confirm against
# utils.DataBaseLoader.
disease_df = loader['disease_flag']

Get the disease-specific flags and attach them to every row by `hadm_id`.

In [6]:
# Attach the four disease flags to every row of `whole`, matching on hadm_id.
flag_columns = ['ami', 'heart_failure', 'akf', 'sepsis']
whole_flag_df = whole.merge(disease_df[['hadm_id'] + flag_columns], on='hadm_id', how='left')
# A left merge silently duplicates rows when the right-hand table holds more
# than one row for a key, so verify that the row count is unchanged.
assert len(whole_flag_df) == len(whole), "merge on hadm_id changed the number of rows"
print(whole_flag_df.shape)
whole_flag_df.head()

(29265, 57)


Unnamed: 0,subject_id,hadm_id,icustay_id,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,...,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag,ami,heart_failure,akf,sepsis
0,88280,198866,254097,4777.0,43.0,15.0,0,3980.0,68.0,118.0,...,,,,,,-1,0,0,0,0
1,97666,158220,287837,1.0,50.0,15.0,0,1050.0,77.0,120.0,...,,,,,,-1,0,1,1,0
2,6688,189787,231393,238.0,38.0,15.0,1,2000.0,101.0,116.0,...,34.0,7.21,7.47,284.0,284.0,-1,0,0,0,0
3,90992,181692,207539,286.0,80.0,15.0,0,1145.0,46.0,76.0,...,27.0,7.36,7.42,,,-1,0,0,0,0
4,29446,121959,278581,1.0,75.0,13.0,0,3040.0,67.0,92.0,...,,,,,,-1,0,0,0,0


Check that stays are unique (each `hadm_id` should appear in exactly one row).

In [7]:
# Count distinct hadm_id values (missing values included, matching what
# len(.unique()) reports) and fail loudly if any id occurs more than once,
# instead of relying on a visual comparison with the frame's row count.
n_unique_hadm = whole_flag_df['hadm_id'].nunique(dropna=False)
assert n_unique_hadm == len(whole_flag_df), "duplicate hadm_id rows found in whole_flag_df"
n_unique_hadm

29265

In [8]:
# For each disease flag, show the distinct values present and how many
# entries are missing (rows left unmatched by the left merge would be NaN).
for flag_name in ('ami', 'sepsis', 'akf', 'heart_failure'):
    flag_values = whole_flag_df[flag_name]
    print(flag_values.unique())
    print(flag_values.isna().sum())

[0 1]
0
[0 1]
0
[0 1]
0
[0 1]
0


## Acute Myocardial Infarction (AMI)

In [9]:
# Keep the rows whose AMI flag is set and summarise the resulting cohort.
ami_mask = whole_flag_df['ami'].eq(1)
ami_df = whole_flag_df.loc[ami_mask]
print(ami_df.shape)

ami_pct = len(ami_df) / len(whole_flag_df) * 100
print(f"Percentage of patients: {ami_pct:.1f}%")

# -1 is remapped to 0 so the column mean reads as a rate
# (presumably -1 marks patients who survived — TODO confirm).
ami_mortality_pct = ami_df['hospital_expire_flag'].replace({-1: 0}).mean() * 100
print(f"Mortality Rate: {ami_mortality_pct:.1f}%")

(3494, 57)
Percentage of patients: 11.9%
Mortality Rate: 9.0%


In [10]:
# Sanity check: the cohort is non-empty and every row has ami == 1.
# A scalar boolean is asserted on purpose. Comparing the array returned by
# .unique() with a list produces an array, and asserting on an array raises
# ValueError (not AssertionError) when it has several elements and is
# ambiguous when it is empty.
assert not ami_df.empty and ami_df['ami'].eq(1).all(), "ami_df must only contain rows with ami == 1"
# Remove all four disease-flag columns from the cohort frame.
ami_df = ami_df.drop(columns=['heart_failure', 'akf', 'sepsis', 'ami'])
ami_df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
14,80029,187592,203031,2.0,73.0,15.0,1,770.0,72.0,99.0,...,2,54.0,54.0,51.0,51.0,7.35,7.35,,,-1
27,46550,131324,248697,1.0,86.0,15.0,1,2827.0,55.0,81.0,...,1,350.0,121.0,52.0,36.0,7.37,7.52,,,-1
31,57342,113484,237507,325.0,67.0,13.0,0,3430.0,88.0,107.0,...,1,33.0,33.0,49.0,49.0,7.41,7.41,,,-1
36,3182,192079,299813,528.0,75.0,14.0,0,840.0,51.0,78.0,...,1,161.0,39.0,53.0,45.0,7.25,7.37,,,1
43,98959,116420,279690,1.0,77.0,15.0,0,2050.0,77.0,111.0,...,1,,,,,,,,,-1


In [11]:
# Save the AMI cohort while it still carries the identifier columns ...
ami_df.to_csv("data/mimic-disease/ami-union-features-id-excluded-cmo.csv", index=False)
# ... then strip the identifiers to obtain the id-free version of the frame.
ami_id_columns = ['subject_id', 'hadm_id', 'icustay_id']
ami_df = ami_df.drop(columns=ami_id_columns)
ami_df.head()

Unnamed: 0,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,meanbp_min,meanbp_max,resprate_min,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
14,2.0,73.0,15.0,1,770.0,72.0,99.0,47.0,99.0,12.0,...,2,54.0,54.0,51.0,51.0,7.35,7.35,,,-1
27,1.0,86.0,15.0,1,2827.0,55.0,81.0,55.0,92.0,13.0,...,1,350.0,121.0,52.0,36.0,7.37,7.52,,,-1
31,325.0,67.0,13.0,0,3430.0,88.0,107.0,40.0,108.0,17.0,...,1,33.0,33.0,49.0,49.0,7.41,7.41,,,-1
36,528.0,75.0,14.0,0,840.0,51.0,78.0,48.333302,125.333,13.0,...,1,161.0,39.0,53.0,45.0,7.25,7.37,,,1
43,1.0,77.0,15.0,0,2050.0,77.0,111.0,63.0,136.0,13.0,...,1,,,,,,,,,-1


In [12]:
# Write the AMI cohort after subject_id, hadm_id and icustay_id were dropped;
# index=False keeps the DataFrame index out of the CSV.
ami_df.to_csv("data/mimic-disease/ami-union-features-excluded-cmo.csv", index=False)

## Sepsis/septicemia

In [13]:
# Keep the rows whose sepsis flag is set and summarise the resulting cohort.
sepsis_mask = whole_flag_df['sepsis'].eq(1)
sepsis_df = whole_flag_df.loc[sepsis_mask]
print(sepsis_df.shape)

sepsis_pct = len(sepsis_df) / len(whole_flag_df) * 100
print(f"Percentage of patients: {sepsis_pct:.1f}%")

# -1 is remapped to 0 so the column mean reads as a rate
# (presumably -1 marks patients who survived — TODO confirm).
sepsis_mortality_pct = sepsis_df['hospital_expire_flag'].replace({-1: 0}).mean() * 100
print(f"Mortality Rate: {sepsis_mortality_pct:.1f}%")

(3192, 57)
Percentage of patients: 10.9%
Mortality Rate: 22.9%


In [14]:
# Sanity check: the cohort is non-empty and every row has sepsis == 1.
# A scalar boolean is asserted on purpose. Comparing the array returned by
# .unique() with a list produces an array, and asserting on an array raises
# ValueError (not AssertionError) when it has several elements and is
# ambiguous when it is empty.
assert not sepsis_df.empty and sepsis_df['sepsis'].eq(1).all(), "sepsis_df must only contain rows with sepsis == 1"
# Remove all four disease-flag columns from the cohort frame.
sepsis_df = sepsis_df.drop(columns=['heart_failure', 'akf', 'ami', 'sepsis'])
sepsis_df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
6,57795,108833,223402,285.0,46.0,10.0,0,4190.0,61.0,75.0,...,1,76.0,76.0,43.0,43.0,7.37,7.37,,,-1
10,24711,155303,295466,2.0,80.0,15.0,1,1045.0,54.0,126.0,...,1,203.0,52.0,38.0,37.0,7.52,7.53,629.0,629.0,-1
15,82512,169761,259199,2780.0,71.0,15.0,1,800.0,78.0,97.0,...,1,405.0,127.0,45.0,32.0,7.43,7.53,,,1
20,4588,153334,298426,1.0,37.0,15.0,1,1608.0,100.0,131.0,...,1,158.0,64.0,54.0,25.0,7.09,7.47,398.0,555.0,-1
25,74937,146708,269115,2.0,50.0,14.0,1,450.0,111.0,156.0,...,1,217.0,44.0,47.0,30.0,6.98,7.35,469.0,645.0,1


In [15]:
# Save the sepsis cohort while it still carries the identifier columns ...
sepsis_df.to_csv("data/mimic-disease/sepsis-union-features-id-excluded-cmo.csv", index=False)
# ... then strip the identifiers to obtain the id-free version of the frame.
sepsis_id_columns = ['subject_id', 'hadm_id', 'icustay_id']
sepsis_df = sepsis_df.drop(columns=sepsis_id_columns)
sepsis_df.head()

Unnamed: 0,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,meanbp_min,meanbp_max,resprate_min,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
6,285.0,46.0,10.0,0,4190.0,61.0,75.0,52.0,86.0,10.0,...,1,76.0,76.0,43.0,43.0,7.37,7.37,,,-1
10,2.0,80.0,15.0,1,1045.0,54.0,126.0,55.666698,92.0,10.0,...,1,203.0,52.0,38.0,37.0,7.52,7.53,629.0,629.0,-1
15,2780.0,71.0,15.0,1,800.0,78.0,97.0,53.0,84.0,3.0,...,1,405.0,127.0,45.0,32.0,7.43,7.53,,,1
20,1.0,37.0,15.0,1,1608.0,100.0,131.0,57.0,91.0,7.0,...,1,158.0,64.0,54.0,25.0,7.09,7.47,398.0,555.0,-1
25,2.0,50.0,14.0,1,450.0,111.0,156.0,17.0,179.0,4.0,...,1,217.0,44.0,47.0,30.0,6.98,7.35,469.0,645.0,1


In [16]:
# Write the sepsis cohort after subject_id, hadm_id and icustay_id were
# dropped; index=False keeps the DataFrame index out of the CSV.
sepsis_df.to_csv("data/mimic-disease/sepsis-union-features-excluded-cmo.csv", index=False)

## Heart Failure

In [17]:
# Keep the rows whose heart-failure flag is set and summarise the cohort.
heart_failure_mask = whole_flag_df['heart_failure'].eq(1)
heart_failure_df = whole_flag_df.loc[heart_failure_mask]
print(heart_failure_df.shape)

heart_failure_pct = len(heart_failure_df) / len(whole_flag_df) * 100
print(f"Percentage of patients: {heart_failure_pct:.1f}%")

# -1 is remapped to 0 so the column mean reads as a rate
# (presumably -1 marks patients who survived — TODO confirm).
heart_failure_mortality_pct = heart_failure_df['hospital_expire_flag'].replace({-1: 0}).mean() * 100
print(f"Mortality Rate: {heart_failure_mortality_pct:.1f}%")

(6708, 57)
Percentage of patients: 22.9%
Mortality Rate: 10.4%


In [18]:
# Sanity check: the cohort is non-empty and every row has heart_failure == 1.
# A scalar boolean is asserted on purpose. Comparing the array returned by
# .unique() with a list produces an array, and asserting on an array raises
# ValueError (not AssertionError) when it has several elements and is
# ambiguous when it is empty.
assert not heart_failure_df.empty and heart_failure_df['heart_failure'].eq(1).all(), "heart_failure_df must only contain rows with heart_failure == 1"
# Remove all four disease-flag columns from the cohort frame.
heart_failure_df = heart_failure_df.drop(columns=['sepsis', 'akf', 'ami', 'heart_failure'])
heart_failure_df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
1,97666,158220,287837,1.0,50.0,15.0,0,1050.0,77.0,120.0,...,1,,,,,,,,,-1
5,72666,186402,266459,1.0,83.0,14.0,0,545.0,60.0,76.0,...,1,,,,,,,,,-1
11,22157,198698,215724,1074.0,78.0,14.0,0,442.0,100.0,119.0,...,1,28.0,28.0,52.0,52.0,7.31,7.31,,,-1
14,80029,187592,203031,2.0,73.0,15.0,1,770.0,72.0,99.0,...,2,54.0,54.0,51.0,51.0,7.35,7.35,,,-1
15,82512,169761,259199,2780.0,71.0,15.0,1,800.0,78.0,97.0,...,1,405.0,127.0,45.0,32.0,7.43,7.53,,,1


In [19]:
# Save the heart-failure cohort while it still carries the identifier columns ...
heart_failure_df.to_csv("data/mimic-disease/heart_failure-union-features-id-excluded-cmo.csv", index=False)
# ... then strip the identifiers to obtain the id-free version of the frame.
heart_failure_id_columns = ['subject_id', 'hadm_id', 'icustay_id']
heart_failure_df = heart_failure_df.drop(columns=heart_failure_id_columns)
heart_failure_df.head()

Unnamed: 0,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,meanbp_min,meanbp_max,resprate_min,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
1,1.0,50.0,15.0,0,1050.0,77.0,120.0,56.0,119.0,9.0,...,1,,,,,,,,,-1
5,1.0,83.0,14.0,0,545.0,60.0,76.0,46.0,151.0,14.0,...,1,,,,,,,,,-1
11,1074.0,78.0,14.0,0,442.0,100.0,119.0,52.0,111.333,11.0,...,1,28.0,28.0,52.0,52.0,7.31,7.31,,,-1
14,2.0,73.0,15.0,1,770.0,72.0,99.0,47.0,99.0,12.0,...,2,54.0,54.0,51.0,51.0,7.35,7.35,,,-1
15,2780.0,71.0,15.0,1,800.0,78.0,97.0,53.0,84.0,3.0,...,1,405.0,127.0,45.0,32.0,7.43,7.53,,,1


In [20]:
# Write the heart-failure cohort after subject_id, hadm_id and icustay_id were
# dropped; index=False keeps the DataFrame index out of the CSV.
heart_failure_df.to_csv("data/mimic-disease/heart_failure-union-features-excluded-cmo.csv", index=False)

## Acute Kidney Failure

In [21]:
# Keep the rows whose acute-kidney-failure flag is set and summarise the cohort.
akf_mask = whole_flag_df['akf'].eq(1)
akf_df = whole_flag_df.loc[akf_mask]
print(akf_df.shape)

akf_pct = len(akf_df) / len(whole_flag_df) * 100
print(f"Percentage of patients: {akf_pct:.1f}%")

# -1 is remapped to 0 so the column mean reads as a rate
# (presumably -1 marks patients who survived — TODO confirm).
akf_mortality_pct = akf_df['hospital_expire_flag'].replace({-1: 0}).mean() * 100
print(f"Mortality Rate: {akf_mortality_pct:.1f}%")

(5999, 57)
Percentage of patients: 20.5%
Mortality Rate: 17.7%


In [22]:
# Sanity check: the cohort is non-empty and every row has akf == 1.
# A scalar boolean is asserted on purpose. Comparing the array returned by
# .unique() with a list produces an array, and asserting on an array raises
# ValueError (not AssertionError) when it has several elements and is
# ambiguous when it is empty.
assert not akf_df.empty and akf_df['akf'].eq(1).all(), "akf_df must only contain rows with akf == 1"
# Remove all four disease-flag columns from the cohort frame.
akf_df = akf_df.drop(columns=['sepsis', 'heart_failure', 'ami', 'akf'])
akf_df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
1,97666,158220,287837,1.0,50.0,15.0,0,1050.0,77.0,120.0,...,1,,,,,,,,,-1
5,72666,186402,266459,1.0,83.0,14.0,0,545.0,60.0,76.0,...,1,,,,,,,,,-1
11,22157,198698,215724,1074.0,78.0,14.0,0,442.0,100.0,119.0,...,1,28.0,28.0,52.0,52.0,7.31,7.31,,,-1
15,82512,169761,259199,2780.0,71.0,15.0,1,800.0,78.0,97.0,...,1,405.0,127.0,45.0,32.0,7.43,7.53,,,1
25,74937,146708,269115,2.0,50.0,14.0,1,450.0,111.0,156.0,...,1,217.0,44.0,47.0,30.0,6.98,7.35,469.0,645.0,1


In [23]:
# Save the acute-kidney-failure cohort while it still carries the identifier columns ...
akf_df.to_csv("data/mimic-disease/akf-union-features-id-excluded-cmo.csv", index=False)
# ... then strip the identifiers to obtain the id-free version of the frame.
akf_id_columns = ['subject_id', 'hadm_id', 'icustay_id']
akf_df = akf_df.drop(columns=akf_id_columns)
akf_df.head()

Unnamed: 0,preiculos,age,gcs_min,mechvent,urineoutput,heartrate_min,heartrate_max,meanbp_min,meanbp_max,resprate_min,...,admissiontype,pao2_max,pao2_min,paco2_max,paco2_min,ph_min,ph_max,aado2_min,aado2_max,hospital_expire_flag
1,1.0,50.0,15.0,0,1050.0,77.0,120.0,56.0,119.0,9.0,...,1,,,,,,,,,-1
5,1.0,83.0,14.0,0,545.0,60.0,76.0,46.0,151.0,14.0,...,1,,,,,,,,,-1
11,1074.0,78.0,14.0,0,442.0,100.0,119.0,52.0,111.333,11.0,...,1,28.0,28.0,52.0,52.0,7.31,7.31,,,-1
15,2780.0,71.0,15.0,1,800.0,78.0,97.0,53.0,84.0,3.0,...,1,405.0,127.0,45.0,32.0,7.43,7.53,,,1
25,2.0,50.0,14.0,1,450.0,111.0,156.0,17.0,179.0,4.0,...,1,217.0,44.0,47.0,30.0,6.98,7.35,469.0,645.0,1


In [24]:
# Write the acute-kidney-failure cohort after subject_id, hadm_id and
# icustay_id were dropped; index=False keeps the DataFrame index out of the CSV.
akf_df.to_csv("data/mimic-disease/akf-union-features-excluded-cmo.csv", index=False)