This notebook extracts the lab data and reformats them to use a unified naming. It then extracts the labels of interest: death and length of stay.

This code relies on [`MIMIC_extract`](https://github.com/MLforHealth/MIMIC_Extract) matching dictionary and the [MIMIC III](https://physionet.org/content/mimiciii/1.4/) data.
To begin, download the data and update the following variable `PATH`.

In [None]:
# Folder holding the MIMIC-III csv files and `itemid_to_variable_map.csv`.
# Must end with a path separator: file names are appended with `PATH + '<file>.csv'`.
PATH = '/home/vincent/Desktop/Cambridge/Data/Mimic/' #'/home/vjemj2/rds/hpc-work/data/mimic/'

In [1]:
import os
import sys
sys.path.append('../')
import numpy as np
import pandas as pd

# Lab data

### Mapping labs variables

In [None]:
# Load the ITEMID -> variable-name dictionary, indexed by ITEMID.
mapping = pd.read_csv(PATH + 'itemid_to_variable_map.csv', index_col = 'ITEMID', dtype = {'ITEMID': int})

# `read_csv` loads blank cells as NaN and `NaN != ''` evaluates to True, so the
# string comparison alone never drops items without a LEVEL2 name: test notna() too.
has_level2 = mapping['LEVEL2'].notna() & (mapping['LEVEL2'] != '')

# Keep only named items that are observed at least once and flagged as 'ready'.
mapping = mapping[has_level2 &
                  (mapping['COUNT'] > 0) &
                  (mapping['STATUS'] == 'ready')
                 ]

In [None]:
# Preview the filtered item-to-variable mapping
mapping.head()

### Extract lab data

In [None]:
# Only load the columns used downstream: LABEVENTS is large and the remaining
# columns are discarded by the selection step anyway.
labs = pd.read_csv(PATH + 'LABEVENTS.csv',
                   usecols = ['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM'],
                   parse_dates = ['CHARTTIME'])

In [None]:
# Select data and replace itemid with standard format
# Take an explicit copy of the filtered rows/columns: adding a column to a
# chained slice would otherwise raise SettingWithCopyWarning.
labs = labs.loc[labs.ITEMID.isin(mapping.index),
                ['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM']].copy()
# Look up the LEVEL1 name of each event's ITEMID (positional assignment via .values)
labs['Lab'] = mapping['LEVEL1'].loc[labs['ITEMID']].values

In [None]:
# Preview the selected lab events with their mapped `Lab` name
labs.head()

# Labels

Read the patients' outcome and extract the temporal information for labelling the data

In [None]:
# Patient-level demographics: gender and date of birth (parsed as datetime)
genderAge = pd.read_csv(
    PATH + 'PATIENTS.csv',
    usecols = ['SUBJECT_ID', 'GENDER', 'DOB'],
    parse_dates = ['DOB'],
)

In [None]:
# Admission-level information; the three time columns are parsed as datetimes
admission_dates = ['ADMITTIME', 'DISCHTIME', 'DEATHTIME']
admission_columns = ['SUBJECT_ID', 'HADM_ID', 'ADMISSION_TYPE', 'HOSPITAL_EXPIRE_FLAG',
                     'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'INSURANCE', 'DIAGNOSIS']
admissions = pd.read_csv(PATH + 'ADMISSIONS.csv', usecols = admission_columns, parse_dates = admission_dates)

# Attach gender and date of birth to every admission of the patient
admissions = pd.merge(admissions, genderAge, on = 'SUBJECT_ID')

In [None]:
# Focus only on adults - Update 12.10.2021
removed_nan = admissions[['ADMITTIME', 'DOB']].dropna()
admissions['AGE'] = np.nan
# Age in days is computed on Python datetimes rather than on the pandas columns
# NOTE(review): presumably to avoid timedelta64[ns] overflow on very old (shifted) DOB - confirm
age_in_days = [delta.days for delta in (removed_nan.ADMITTIME.dt.to_pydatetime() - removed_nan.DOB.dt.to_pydatetime())]
# Single .loc assignment: the chained form `admissions['AGE'][idx] = ...` may write
# to a temporary copy (SettingWithCopyWarning, no effect under Copy-on-Write)
admissions.loc[removed_nan.index, 'AGE'] = age_in_days
admissions.AGE /= 365

# Rows with a missing age compare False and are dropped as well
admissions = admissions[admissions.AGE > 18]

In [None]:
# Focus on last visits (as space between visit might change process)
# Sort by admission time first so that "last" is the most recent admission rather
# than the last row in file order; `keep` is passed by keyword as pandas >= 2
# no longer accepts it positionally. `sort_index` restores the original row order.
admissions = admissions.sort_values('ADMITTIME', kind = 'stable')\
                       .drop_duplicates('SUBJECT_ID', keep = 'last')\
                       .sort_index()

In [None]:
# Change absolute times to durations since admission
# (both columns are timedeltas at this point, not numbers of hours)
admissions['Death'] = admissions['DEATHTIME'] - admissions['ADMITTIME']
admissions['LOS'] = admissions['DISCHTIME'] - admissions['ADMITTIME']

In [None]:
# Index admissions by patient so that later lookups can use `.loc[SUBJECT_ID]`
admissions = admissions.set_index('SUBJECT_ID', drop = True)
admissions.head()

In [None]:
# Sanity check: each remaining row must refer to a distinct hospital admission
assert admissions.HADM_ID.is_unique, \
    "Different patients have the same HADM_ID, might be a problem for the rest of the code"

# Transformation labs

In [None]:
# Remove unnecessary HADM_ID: keep only events of the selected admissions
in_cohort = labs.HADM_ID.isin(admissions.HADM_ID)
labs = labs[in_cohort]

In [None]:
# Change time event to time since admission
# Admission time of each event's patient (admissions is indexed by SUBJECT_ID)
admit_time = admissions.ADMITTIME.loc[labs.SUBJECT_ID].values
# `assign` returns a new frame: assigning a column on the filtered slice in place
# would raise SettingWithCopyWarning.
labs = labs.assign(Time = labs.CHARTTIME.values - admit_time)

In [None]:
# Preview the lab events with their `Time` since admission
labs.head()

# Reformat

Clean reformatting of the dataframe for saving

In [None]:
# Use generic column names and keep only the columns needed for saving
new_names = {"SUBJECT_ID": "Patient", "VALUENUM": "Value"}
labs = labs.rename(columns = new_names)
labs = labs.reset_index(drop = True)
labs = labs[['Patient', 'Time', 'Lab', 'Value']]
labs.head()

In [None]:
# Name the index like the `Patient` column of `labs`
admissions = admissions.rename_axis(index = "Patient")
admissions.head()

# Selection First day

In [None]:
# Observation window (in days) used for the selection and in the saved file names
day = 1
first_day = pd.to_timedelta(day, unit = 'D')

# Keep patients who stayed at least the whole window
admissions = admissions[admissions.LOS >= first_day]

# Keep events strictly inside the window (both bounds excluded) for these patients
in_window = labs.Time.gt(pd.Timedelta(0)) & labs.Time.lt(first_day)
labs = labs[in_window & labs.Patient.isin(admissions.index)]

# Cleaning labs

In [None]:
# Remove duplicates: same test multiple time at the same time
# keep = False flags every occurrence, so all conflicting measurements are dropped
conflicting = labs.duplicated(subset = ['Patient', 'Time', 'Lab'], keep = False)
labs = labs[~conflicting]

In [None]:
# Pivot to have test as columns (one row per patient and time)
labs = labs.pivot(index = ['Patient', 'Time'], columns = 'Lab')

# Change index to have days
time_in_days = labs.index.levels[1].total_seconds() / (3600. * 24)
# `level` must be passed by keyword: pandas >= 2 rejects it as a positional argument
labs.index = labs.index.set_levels(time_in_days, level = 1)

In [None]:
# Remove empty lines: rows in which every lab is missing
labs = labs.dropna(axis = 0, how = 'all')

In [None]:
# Display the pivoted table: one row per (Patient, Time), one column per lab
labs

# Clean outcomes

In [None]:
# Remove patients with no labs (admissions also takes the patient order of labs)
patients_with_labs = labs.index.get_level_values(0).unique()
admissions = admissions.loc[patients_with_labs]

In [None]:
# Express time to death in days (NaN when no death time is recorded)
seconds_per_day = 24 * 60 * 60
admissions['Death'] = admissions['Death'].dt.total_seconds().div(seconds_per_day)

# Analyze available labs

Subselect features to use: find the largest subset of patients and features shared by at least 30 000 patients.

In [None]:
from FeatureAnalysis.analysis.eclat import eclat
from FeatureAnalysis.analysis.rendering import buildGraph

In [None]:
# Availability mask per patient and lab: count the non-missing values of each lab,
# then encode "never observed" as NaN and "observed at least once" as 1.
missing_data = labs.groupby('Patient').count()
missing_data[missing_data <= 0] = np.nan
missing_data[0 < missing_data] = 1

In [None]:
# This algorithm is slow and takes close to 2 hours to run
# NOTE(review): minCount is presumably the minimum number of patients sharing a
# feature set (30 000, as stated in the text above) - confirm against eclat
features = eclat(missing_data, minCount = 30000)
print(features)

In [None]:
def next_feature(node):
    """
        Collects the names found by repeatedly following the first child.

        Starts at `node` and descends through `children[0]` until reaching a
        node without children.

        Args:
            node: Object exposing `name` and a sequence `children`.

        Returns:
            list: Names from `node` (included) down to the last first-child.
    """
    names = [node.name]
    current = node
    while len(current.children) != 0:
        current = current.children[0]
        names.append(current.name)
    return names

In [None]:
# Remove patients with no values: keep patients having at least one
# measurement for every selected lab
selected_features = next_feature(features.children[0])
counts_per_patient = labs[selected_features].groupby('Patient').count()
selection = (counts_per_patient > 0).all(axis = 1)
selection = selection.index[selection]

In [None]:
# Restrict labs to the selected patients and features, and admissions to the same patients
selected_features = next_feature(features.children[0])
labs_kept = labs.index.get_level_values('Patient').isin(selection)
labs = labs[labs_kept][selected_features]
admissions_kept = admissions.index.get_level_values('Patient').isin(selection)
admissions = admissions[admissions_kept]

# Save

Rename columns and save all the data and labels

In [None]:
# `to_csv` does not create missing folders: make sure the output folder exists
# so that the (slow) feature selection is not lost on a failed save
os.makedirs('data', exist_ok = True)
labs.to_csv('data/labs_{}_day.csv'.format(day))
admissions.to_csv('data/outcomes_{}_day.csv'.format(day))


-------

# Clinical Presence Evidence

Compute the number of observations for the different groups of interest.

In [None]:
import scipy.stats

In [None]:
# Number of observation times (rows of labs) per patient
test_count = labs.groupby('Patient').size()

# Group labels, indexed by patient
died = admissions.Death.notna()
death = died.map({False: 'Alive', True: 'Death'})

is_female = admissions.GENDER == 'F'
gender = is_female.map({False: 'Male', True: 'Female'})

# NOTE(review): exact match on 'WHITE' - any other ETHNICITY value (e.g. sub-categories
# or unknown/declined entries, if present) falls in 'Non White'; confirm this is intended
is_white = admissions.ETHNICITY == 'WHITE'
ethnicity = is_white.map({False: 'Non White', True: 'White'})

In [None]:
# Average test per outcome: mean, standard deviation and t-test between groups
by_outcome = test_count.groupby(death)
(
    by_outcome.mean(),
    by_outcome.std(),
    scipy.stats.ttest_ind(test_count[death == "Alive"], test_count[death == "Death"]),
)

In [None]:
# Average test per sex: mean, standard deviation and t-test between groups
by_sex = test_count.groupby(gender)
(
    by_sex.mean(),
    by_sex.std(),
    scipy.stats.ttest_ind(test_count[gender == "Female"], test_count[gender == "Male"]),
)

In [None]:
# Average test per ethnicity: mean, standard deviation and t-test between groups
by_ethnicity = test_count.groupby(ethnicity)
(
    by_ethnicity.mean(),
    by_ethnicity.std(),
    scipy.stats.ttest_ind(test_count[ethnicity == 'White'], test_count[ethnicity == 'Non White']),
)