This notebook extracts the lab data and reformats them under a unified naming scheme. It then extracts the labels of interest: death and length of stay.

This code relies on [`MIMIC_extract`](https://github.com/MLforHealth/MIMIC_Extract) matching dictionary and the [MIMIC III](https://physionet.org/content/mimiciii/1.4/) data.
To begin, download the data and update the following variable `PATH`.

In [None]:
# Folder containing the MIMIC-III CSV files and `itemid_to_variable_map.csv`.
# File names are appended to this string as-is, so keep the trailing slash.
PATH = '/home/vincent/Desktop/Cambridge/Data/Mimic/' #'/home/vjemj2/rds/hpc-work/data/mimic/'

In [None]:
import numpy as np
import pandas as pd

# Lab data

### Mapping labs variables

In [None]:
# Dictionary from raw ITEMID to harmonised variable names, indexed by ITEMID
mapping = pd.read_csv(
    PATH + 'itemid_to_variable_map.csv',
    dtype = {'ITEMID': int},
    index_col = 'ITEMID',
)

In [None]:
# Preview the first rows only instead of rendering the whole dictionary
mapping.head()

In [None]:
# Reload the dictionary so that this cell can be re-run on its own, then keep
# only the usable entries: a LEVEL2 name, at least one recorded event and a
# 'ready' status.
mapping = pd.read_csv(PATH + 'itemid_to_variable_map.csv', index_col = 'ITEMID', dtype = {'ITEMID': int})
# read_csv turns empty cells into NaN and `NaN != ''` evaluates to True, so a
# missing LEVEL2 has to be excluded explicitly with notna().
has_level2 = mapping['LEVEL2'].notna() & (mapping['LEVEL2'] != '')
mapping = mapping[has_level2 &
                  (mapping['COUNT'] > 0) &
                  (mapping['STATUS'] == 'ready')]

In [None]:
# Preview the dictionary after filtering
mapping.head()

### Extract lab data

In [None]:
# Load the lab events, parsing CHARTTIME as a timestamp
labs = pd.read_csv(
    PATH + 'LABEVENTS.csv',
    parse_dates = ['CHARTTIME'],
)

In [None]:
# Preview the first rows only instead of rendering the whole event table
labs.head()

In [None]:
# Select the events whose ITEMID appears in the dictionary, restricted to the
# columns used below. The explicit .copy() makes `labs` an independent frame,
# so adding a column afterwards does not write into a slice of the raw table
# (which triggers pandas' SettingWithCopyWarning).
labs = labs.loc[labs['ITEMID'].isin(mapping.index),
                ['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM']].copy()
# Replace the raw ITEMID with its harmonised LEVEL1 name
labs['Lab'] = mapping['LEVEL1'].loc[labs['ITEMID']].values

In [None]:
# Preview the selected events with their new `Lab` column
labs.head()

# Labels

Read the patients' outcome and extract the temporal information for labelling the data

In [None]:
# Gender and date of birth per SUBJECT_ID, with DOB parsed as a date
genderAge = pd.read_csv(
    PATH + 'PATIENTS.csv',
    parse_dates = ['DOB'],
    usecols = ['SUBJECT_ID', 'GENDER', 'DOB'],
)

In [None]:
# Load the admission table (admission, discharge and death times parsed as
# dates), then attach gender and date of birth through SUBJECT_ID.
admissions = pd.merge(
    pd.read_csv(
        PATH + 'ADMISSIONS.csv',
        parse_dates = ['ADMITTIME', 'DISCHTIME', 'DEATHTIME'],
        usecols = ['SUBJECT_ID', 'HADM_ID', 'ADMISSION_TYPE', 'HOSPITAL_EXPIRE_FLAG',
                   'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'INSURANCE', 'DIAGNOSIS'],
    ),
    genderAge,
    on = 'SUBJECT_ID',
)

In [None]:
# Focus only on adults - Update 12.10.2021
# Age is computed only for rows where both dates are known.
removed_nan = admissions[['ADMITTIME', 'DOB']].dropna()
# The subtraction is done on Python datetime objects rather than on the pandas
# columns. NOTE(review): presumably this avoids overflowing nanosecond
# timedeltas on very distant dates of birth - confirm. np.asarray keeps this
# working whether to_pydatetime() returns an ndarray or a Series.
elapsed = np.asarray(removed_nan['ADMITTIME'].dt.to_pydatetime()) \
    - np.asarray(removed_nan['DOB'].dt.to_pydatetime())
admissions['AGE'] = np.nan
# A single .loc call writes into `admissions` itself. The former chained form
# `admissions['AGE'][index] = ...` assigns to a temporary object, which pandas
# warns about and which has no effect under copy-on-write.
admissions.loc[removed_nan.index, 'AGE'] = [delta.days for delta in elapsed]
# Days -> years
admissions['AGE'] /= 365

admissions = admissions[admissions['AGE'] > 18]

In [None]:
# Focus on last visits (as space between visit might change process)
# Admissions are ordered by ADMITTIME first, so that the row kept for each
# patient is the most recent stay and not merely the last row met in the file
# (stable sort; rows without ADMITTIME are placed first so they are never
# preferred). `keep` is passed by keyword, as recent pandas versions require.
# sort_index() then restores the original row order.
admissions = (admissions.sort_values('ADMITTIME', kind = 'mergesort', na_position = 'first')
                        .drop_duplicates(subset = 'SUBJECT_ID', keep = 'last')
                        .sort_index())

In [None]:
# Express discharge and death relative to admission. Both columns hold
# timedeltas at this stage.
admissions['LOS'] = admissions['DISCHTIME'].sub(admissions['ADMITTIME'])
admissions['Death'] = admissions['DEATHTIME'].sub(admissions['ADMITTIME'])

In [None]:
# Add temporal information
# Weekday of admission (0 = Monday ... 6 = Sunday), with days running from
# 8 am to 8 am so that the weekend spans Saturday 8 am to Monday 8 am.
# The 8 hours have to be SUBTRACTED for that: Saturday 08:00 maps to Saturday
# 00:00 and Monday 07:59 to Sunday 23:59. Adding them would instead make the
# weekend run from Friday 4 pm to Sunday 4 pm.
admissions['Day'] = (admissions['ADMITTIME'] - pd.to_timedelta('8 hours')).dt.weekday

In [None]:
# Create index for easier search: rows are looked up by SUBJECT_ID below.
# Note: re-running this cell on its own fails, since SUBJECT_ID is no longer
# a column once it has become the index.
admissions = admissions.set_index('SUBJECT_ID')
admissions.head()

In [None]:
# Sanity check: no HADM_ID may appear on more than one remaining row
assert admissions['HADM_ID'].is_unique, \
    "Different patients have the same HADM_ID, might be a problem for the rest of the code"

# Transformation labs

In [None]:
# Drop the events whose HADM_ID is not one of the selected admissions
labs = labs.loc[labs['HADM_ID'].isin(admissions['HADM_ID'])]

In [None]:
# Event time relative to the admission time of the patient. The admission
# times are looked up per event through the SUBJECT_ID index, and both sides
# are plain arrays so the subtraction is positional.
labs['Time'] = (labs['CHARTTIME'].to_numpy()
                - admissions.loc[labs['SUBJECT_ID'], 'ADMITTIME'].to_numpy())

In [None]:
# Preview the events with their new `Time` column
labs.head()

# Reformat

Clean reformatting of the dataframes before saving

In [None]:
# Give the columns their final names, renumber the rows from 0 and keep only
# the four columns that are saved.
labs = labs.rename(columns = {"SUBJECT_ID": "Patient", "VALUENUM": "Value"}).reset_index(drop = True)
labs = labs[['Patient', 'Time', 'Lab', 'Value']]
labs.head()

In [None]:
# Name the index "Patient", matching the column name used in `labs`
admissions = admissions.rename_axis(index = "Patient")
admissions.head()

In [None]:
# Save the full (unfiltered by time) lab events and outcomes.
# Paths are relative to the working directory; the `data/` folder is expected
# to exist already. `admissions` keeps its index (Patient) in the file.
labs.to_csv('data/labs_all.csv', index = False)
admissions.to_csv('data/outcomes_all.csv')

# Selection First days

In [None]:
# Keep the stays lasting at least one day, and the events of those patients
# recorded less than one day after admission.
first_day = pd.Timedelta(days = 1)
admissions = admissions.loc[admissions['LOS'] >= first_day]
labs = labs.loc[labs['Patient'].isin(admissions.index) & (labs['Time'] < first_day)]

# Cleaning labs

In [None]:
# Remove duplicates: when the same test appears several times at the same time
# for a patient, every one of those rows is discarded (keep = False).
labs = labs[~labs.duplicated(subset = ['Patient', 'Time', 'Lab'], keep = False).to_numpy()]

In [None]:
# Pivot to have tests as columns, indexed by (Patient, Time)
labs = labs.pivot(index = ['Patient', 'Time'], columns = 'Lab')

# Change index to have days
# set_levels() replaces the *unique* values of a level; each row keeps its
# integer code pointing into them. The conversion must therefore be applied to
# `levels[1]`. Passing the per-row values from get_level_values(1) either
# fails the uniqueness check or maps rows to the wrong times. `level` is given
# by keyword, as recent pandas versions require.
labs.index = labs.index.set_levels(labs.index.levels[1].total_seconds() / (3600. * 24), level = 1)

In [None]:
# Keep patients with at least two measures during the 24 hours of admission
# New update for 24 selection => 1.7.2021
# `Time` is expressed in days at this point, so the 24-hour bound is 1 (not 24).
# The count is taken on the Time column only.
# NOTE(review): times before admission (negative) are counted here as well,
# while they are discarded in a later cell - confirm this is intended.
measure_times = labs.index.to_frame(index = False)
# Despite its name, `one_measure` flags the patients with MORE than one measure
one_measure = measure_times['Time'].lt(1).groupby(measure_times['Patient']).sum() > 1
labs = labs[labs.index.get_level_values(0).isin(one_measure[one_measure].index)]

In [None]:
# Keep the labs for which more than 5% of the patients have more than one
# measurement
# New subselection => 6.7.2021
measures_per_patient = labs.groupby('Patient').count()
frequent_labs = (measures_per_patient > 1).mean() > 0.05
labs = labs[labs.columns[frequent_labs]]
labs.head()

In [None]:
# Discard the measures taken before admission (negative time), keeping only
# those of the first 24 hours after admission.
# Justification: medical process prior to admission might be really different
# New subselection => 6.7.2021
labs = labs.loc[labs.index.get_level_values('Time') >= 0]
labs.head()

In [None]:
# Drop the (Patient, Time) rows in which every lab value is missing
labs = labs.dropna(axis = 'index', how = 'all')

# Clean outcomes

In [None]:
# Keep only the patients that still have at least one row in `labs`
admissions = admissions.loc[labs.index.unique(level = 0)]

In [None]:
# Convert both durations from timedeltas to days
seconds_per_day = 24 * 60 * 60
for duration in ['LOS', 'Death']:
    admissions[duration] = admissions[duration].dt.total_seconds() / seconds_per_day

# Remaining stay: length of stay minus the measure time, taken on the last
# row of each patient in `labs`.
patient_of_row = labs.index.get_level_values(0)
time_of_row = labs.index.get_level_values(1)
admissions['Remaining'] = (admissions['LOS'].loc[patient_of_row] - time_of_row).groupby('Patient').last()

# Save

Rename columns and save all the data and labels

In [None]:
# Save the first-day lab table and the matching outcomes.
# Paths are relative to the working directory; the `data/` folder is expected
# to exist already. Both frames keep their index in the file.
labs.to_csv('data/labs_first_day.csv')
admissions.to_csv('data/outcomes_first_day.csv')