This notebook extracts the eICU lab data and reformats it to use a unified naming. It then extracts the labels of interest: death and length of stay.
To begin, download the data and update the `PATH` variable below.

In [1]:
# Folder containing the raw csv files read below (lab.csv, patient.csv, hospital.csv).
# Keep the trailing slash: file names are appended by plain string concatenation.
PATH = '/home/vincent/Desktop/Cambridge/Data/eICU/'

In [2]:
import os

import numpy as np
import pandas as pd

# Lab data

### Extract lab data

In [3]:
# Read only the columns used downstream and discard rows with a missing field
lab_columns = ['patientunitstayid', 'labresultoffset', 'labname', 'labresult']
labs = pd.read_csv(PATH + 'lab.csv', usecols = lab_columns).dropna()
# Change to days (the offset is divided by 60 * 24)
labs['labresultoffset'] = labs['labresultoffset'] / (60 * 24)

In [4]:
# Preview of the long-format lab table (one row per measurement)
labs

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresult
0,141168,1.406944,fibrinogen,177.00
1,141168,0.786806,PT - INR,2.50
2,141168,1.406944,magnesium,2.00
3,141168,0.786806,PT,26.60
4,141168,1.486806,pH,7.20
...,...,...,...,...
39132526,3353263,-0.004861,WBC x 1000,6.40
39132527,3353263,1.203472,RBC,4.67
39132528,3353263,-0.004861,-monos,10.00
39132529,3353263,1.203472,WBC x 1000,6.60


# Labels

Read the patients' outcome and extract the temporal information for labelling the data

In [5]:
# Load the patient table and the hospital table from the data folder
admissions, hospitals = (pd.read_csv(PATH + file_name)
                         for file_name in ('patient.csv', 'hospital.csv'))

In [6]:
# Ensure only one hospital stay per patient (take the first one with first ICU visit).
# `groupby(...).first()` returns the first *non-null* value of each column, so a
# missing field in the first stay would be silently filled with the value of a
# later stay. Keeping the first sorted row of each patient guarantees that every
# column (stay id, outcome, offsets, ...) describes the same stay.
admissions = (admissions.sort_values(['hospitaldischargeyear', 'unitvisitnumber'], ascending = True)
                        .dropna(subset = ['uniquepid'])  # groupby also ignored missing patient ids
                        .drop_duplicates(subset = 'uniquepid', keep = 'first')
                        .set_index('uniquepid')
                        .sort_index())  # same ordering as the grouped result

In [7]:
# Preview of the selected stays, indexed by uniquepid
admissions

Unnamed: 0_level_0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,unitadmittime24,unitadmitsource,unitvisitnumber,unitstaytype,admissionweight,dischargeweight,unitdischargetime24,unitdischargeoffset,unitdischargelocation,unitdischargestatus
uniquepid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002-10009,224606,193705,Female,76,Caucasian,71,87,"GI perforation/rupture, surgery for",160.0,00:44:00,...,03:43:00,Operating Room,1,admit,,56.9,01:29:00,4186,Floor,Alive
002-10018,204602,178200,Female,29,Caucasian,66,90,"Cardiovascular medical, other",162.6,18:02:00,...,02:56:00,Floor,1,admit,88.5,91.7,22:52:00,1196,Floor,Alive
002-10034,157016,141169,Female,23,Caucasian,63,95,"GI medical, other",162.6,14:36:00,...,22:07:00,Floor,1,admit,63.5,63.5,19:39:00,4172,Home,Alive
002-10050,211144,183274,Female,67,Caucasian,71,87,Aortic valve replacement (isolated),160.0,22:19:00,...,23:26:00,Operating Room,1,admit,86.2,89.3,19:45:00,5539,Floor,Alive
002-10052,151900,137239,Female,66,Caucasian,73,97,"Sepsis, pulmonary",165.1,10:02:00,...,10:25:00,Emergency Department,1,admit,86.8,79.0,20:05:00,3460,Floor,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
035-9957,3351785,2741786,Male,74,Caucasian,458,1109,Head only trauma,182.9,02:17:00,...,08:27:00,Emergency Department,1,admit,75.7,75.7,21:15:00,2208,Step-Down Unit (SDU),Alive
035-9959,3340321,2731423,Male,44,Caucasian,458,1109,"Infarction, acute myocardial (MI)",185.4,21:57:00,...,21:57:00,Direct Admit,1,admit,130.6,130.6,14:30:00,3873,Home,Alive
035-996,3345874,2736458,Male,55,African American,458,1109,Rhythm disturbance (conduction defect),190.5,19:56:00,...,21:09:00,Emergency Department,1,admit,165.9,169.2,19:10:00,2761,Other Hospital,Alive
035-9966,3352628,2742533,Male,60,African American,458,1106,"Apnea-sleep; surgery for (i.e., UPPP - uvulopa...",170.1,10:28:00,...,14:49:00,Operating Room,1,admit,120.2,120.2,22:01:00,1872,Telemetry,Alive


In [8]:
# Join on hospitals (get the hospital status and regions)
# NOTE: default inner join on the `hospitalid` column; the previous index is not kept
admissions = pd.merge(admissions, hospitals, on = 'hospitalid')

In [9]:
# Focus only on adults
# The "> 89" category is mapped to 100 so that the column can be cast to float
# NOTE(review): the strict comparison also drops 18-year-old patients - confirm this is intended
age_in_years = admissions['age'].replace({"> 89": '100'}).astype(float)
admissions = admissions[age_in_years > 18]

In [10]:
# Match MIMIC format: binary in-hospital death and length of stay
# NOTE(review): a missing discharge status is labelled as not dead - confirm
admissions = admissions.assign(
    Death = admissions['hospitaldischargestatus'] == 'Expired',
    LOS = admissions['hospitaldischargeoffset'] / (60 * 24), # Change to days
)

In [11]:
# Each stay identifier must appear only once: it is used as index further down
assert admissions['patientunitstayid'].is_unique, \
    "Different patients have the same HADM_ID, might be a problem for the rest of the code"

# Transformation labs

In [12]:
# Remove unnecessary patientunitstayid: keep the labs of the selected stays only
in_cohort = labs['patientunitstayid'].isin(admissions['patientunitstayid'])
labs = labs.loc[in_cohort]

In [13]:
# First rows of the lab table after restriction to the selected stays
labs.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresult
137,141178,-0.194444,salicylate,2.3
138,141178,-0.194444,RDW,11.9
139,141178,-0.194444,total protein,7.4
140,141178,-0.194444,MCV,94.8
142,141178,-0.194444,-polys,45.0


# Reformat

Clean reformatting of the dataframe for saving

In [14]:
# Rename the columns to the unified naming and fix the column order.
# `rename` is used without `inplace`: `labs` is a filtered selection of the raw
# table and renaming it in place raises pandas' SettingWithCopyWarning.
labs = labs.rename(columns = {"patientunitstayid": "Patient",
                              "labresult": "Value",
                              "labname": "Lab",
                              "labresultoffset": "Time"})
labs = labs.reset_index(drop = True)[['Patient', 'Time', 'Lab', 'Value']]
labs.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Patient,Time,Lab,Value
0,141178,-0.194444,salicylate,2.3
1,141178,-0.194444,RDW,11.9
2,141178,-0.194444,total protein,7.4
3,141178,-0.194444,MCV,94.8
4,141178,-0.194444,-polys,45.0


In [15]:
# Index the outcomes by stay identifier, named "Patient" as in the lab table
admissions = admissions.rename(columns = {"patientunitstayid": "Patient"}).set_index("Patient")
admissions.head()

Unnamed: 0_level_0,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,hospitaladmitoffset,...,dischargeweight,unitdischargetime24,unitdischargeoffset,unitdischargelocation,unitdischargestatus,numbedscategory,teachingstatus,region,Death,LOS
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
224606,193705,Female,76,Caucasian,71,87,"GI perforation/rupture, surgery for",160.0,00:44:00,-179,...,56.9,01:29:00,4186,Floor,Alive,100 - 249,f,Midwest,False,5.682639
211144,183274,Female,67,Caucasian,71,87,Aortic valve replacement (isolated),160.0,22:19:00,-67,...,89.3,19:45:00,5539,Floor,Alive,100 - 249,f,Midwest,False,6.734028
204935,178462,Female,83,Caucasian,71,87,"CHF, congestive heart failure",162.6,08:41:00,-2156,...,86.5,08:40:00,723,Floor,Alive,100 - 249,f,Midwest,True,1.10625
217838,188447,Female,57,Caucasian,71,113,"Hip replacement, total (non-traumatic)",154.9,20:47:03,-1,...,96.1,21:37:00,49,Step-Down Unit (SDU),Alive,100 - 249,f,Midwest,False,2.977778
172414,153075,Female,48,Caucasian,71,113,Drug withdrawal,172.7,00:01:00,-1,...,63.4,22:42:00,1360,Floor,Alive,100 - 249,f,Midwest,False,1.727778


In [16]:
# Save the labs and outcomes of all the selected stays (before the first-day selection).
# The output folder is created if needed so that the cell also runs on a fresh checkout.
os.makedirs('data/eicu', exist_ok = True)
labs.to_csv('data/eicu/labs_all.csv', index = False)
admissions.to_csv('data/eicu/outcomes_all.csv')

# Selection First days

In [17]:
# Keep the stays lasting at least one day and their labs measured before day one
admissions = admissions[admissions['LOS'] >= 1]
before_day_one = labs['Time'] < 1
kept_stay = labs['Patient'].isin(admissions.index)
labs = labs[before_day_one & kept_stay] # Select first day of data

# Cleaning labs

In [18]:
# Remove duplicates: same test multiple times at the same time
# (keep = False discards every copy, not only the repetitions)
repeated = labs.duplicated(subset = ['Patient', 'Time', 'Lab'], keep = False)
labs = labs[~repeated]

In [19]:
# Pivot to have tests as columns: one row per (Patient, Time)
labs = labs.set_index(['Patient', 'Time', 'Lab']).unstack('Lab')

In [20]:
# Keep labs that more than 5% of the patients have measured more than once
# New subselection => 6.7.2021
# NOTE(review): the counts still include the labs with a negative Time, removed afterwards
measures_per_patient = labs.groupby('Patient').count()
share_of_patients = (measures_per_patient > 1).mean()
kept_columns = labs.columns[share_of_patients > 0.05]
labs = labs[kept_columns]
labs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Lab,-basos,-eos,-lymphs,-monos,-polys,ALT (SGPT),AST (SGOT),BUN,Base Deficit,Base Excess,...,pH,paCO2,paO2,phosphate,platelets x 1000,potassium,sodium,total bilirubin,total protein,troponin - I
Patient,Time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
141178,-0.194444,0.0,3.0,45.0,7.0,45.0,52.0,40.0,11.0,,,...,,,,,273.0,3.6,146.0,0.4,7.4,
141194,-0.640972,0.0,1.0,6.0,8.0,85.0,19.0,15.0,41.0,,,...,,,,,298.0,4.6,134.0,0.4,8.0,
141194,-0.638194,,,,,,,,,,,...,,,,,,,,,,
141194,-0.609722,,,,,,,,,,,...,,,,,,,,,,
141194,-0.1375,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# Keep only labs measured after admission (Time >= 0); with the earlier `Time < 1`
# selection, this leaves the first 24 hours after admission
# Justification: medical process prior to admission might be really different
# New subselection => 6.7.2021
after_admission = labs.index.get_level_values('Time') >= 0
labs = labs.loc[after_admission]
labs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Lab,-basos,-eos,-lymphs,-monos,-polys,ALT (SGPT),AST (SGOT),BUN,Base Deficit,Base Excess,...,pH,paCO2,paO2,phosphate,platelets x 1000,potassium,sodium,total bilirubin,total protein,troponin - I
Patient,Time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
141194,0.215278,,,,,,,,,,,...,,,,,,,,,,
141194,0.216667,0.0,0.0,1.0,1.0,98.0,20.0,24.0,31.0,,,...,,,,,233.0,4.0,135.0,0.4,6.9,
141194,0.461806,,,,,,,,,,,...,,,,,,,,,,
141194,0.463194,,,,,,,,,,,...,,,,,,,,,,
141194,0.529167,,,,,,,,30.0,,,...,,,,,,3.7,134.0,,,


In [22]:
# Remove empty lines: keep the rows with at least one measured lab
has_measure = labs.notna().any(axis = 1)
labs = labs[has_measure]

# Clean outcomes

In [23]:
# Remove patients with no labs: keep the outcomes of the patients left in `labs`
patients_with_labs = labs.index.unique(level = 'Patient')
admissions = admissions.loc[patients_with_labs]

In [24]:
# Remaining length of stay (in days) after the last row of each patient in `labs`:
# LOS is repeated for every lab row, the row's Time is subtracted (positionally, the
# two sequences follow the order of `labs`), and the last value per patient is kept.
# NOTE(review): this is the time left after the *latest* lab only if rows are sorted
# by Time within a patient - presumably guaranteed by the pivot, confirm.
admissions['Remaining'] = (admissions.LOS.loc[labs.index.get_level_values(0)] - labs.index.get_level_values(1)).groupby('Patient').last()

# Save

Rename columns and save all the data and labels

In [25]:
# Save the first-day labs (indexed by Patient and Time) and the matching outcomes.
# The output folder is created if needed so that the cell also runs on a fresh checkout.
os.makedirs('data/eicu', exist_ok = True)
labs.to_csv('data/eicu/labs_first_day.csv')
admissions.to_csv('data/eicu/outcomes_first_day.csv')