## Extract basic patient information (adult patients with ICU stay longer than 24 hours)

In [1]:
import numpy as np
import dill
import pandas as pd

import datetime as dt
from tqdm import tqdm
from collections import defaultdict, Counter

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three source tables from the MIMIC-III export (gzip-compressed CSV).
ADMISSIONS = pd.read_csv('input/mimiciii/ADMISSIONS.csv.gz')
ICUSTAYS = pd.read_csv('input/mimiciii/ICUSTAYS.csv.gz')
PATIENTS = pd.read_csv('input/mimiciii/PATIENTS.csv.gz')

# Alternative inputs: uncomment these (and comment out the block above) to
# read the MIMIC-IV export instead.
# ADMISSIONS = pd.read_csv('input/mimiciv/admissions.csv.gz')
# ICUSTAYS = pd.read_csv('input/mimiciv/icustays.csv.gz')
# PATIENTS = pd.read_csv('input/mimiciv/patients.csv.gz')

# Upper-case every column name so the code below can address columns by one
# spelling regardless of the casing used in the files.
ADMISSIONS.columns = ADMISSIONS.columns.str.upper()
ICUSTAYS.columns = ICUSTAYS.columns.str.upper()
PATIENTS.columns = PATIENTS.columns.str.upper()

# Harmonise the column names the extractors expect ('RACE', 'ICUSTAY_ID').
# DataFrame.rename ignores keys that are not present, so each line is a no-op
# for a table that does not have the source column.
ADMISSIONS = ADMISSIONS.rename(columns={'ETHNICITY':'RACE'})
ICUSTAYS = ICUSTAYS.rename(columns={'STAY_ID':'ICUSTAY_ID'})

In [290]:
def MIMICiv(ADMISSIONS, ICUSTAYS, PATIENTS):
    """Build the adult, first-ICU-stay cohort from MIMIC-IV style tables.

    Parameters
    ----------
    ADMISSIONS : pandas.DataFrame
        Must contain SUBJECT_ID, HADM_ID, ADMITTIME, DISCHTIME, DEATHTIME,
        ADMISSION_TYPE, ADMISSION_LOCATION and RACE.
    ICUSTAYS : pandas.DataFrame
        Must contain HADM_ID, ICUSTAY_ID, FIRST_CAREUNIT, LAST_CAREUNIT,
        INTIME, OUTTIME and LOS.
    PATIENTS : pandas.DataFrame
        Must contain SUBJECT_ID, GENDER, ANCHOR_YEAR, ANCHOR_AGE and DOD.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per retained ICU stay with the selected demographic and
        admission columns plus AGE, FIRST_HADM and FIRST_ICU.  A row is kept
        when AGE >= 18, it belongs to the patient's earliest admission (among
        rows that have an ICU stay), it is the earliest ICU stay of that
        admission, and LOS > 1.  Rows tying for the earliest time are all
        kept.

    Intermediate frame shapes are printed after each filtering stage.
    """
    admissions = ADMISSIONS[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
                             'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'RACE']]
    icustays = ICUSTAYS[['HADM_ID', 'ICUSTAY_ID', 'FIRST_CAREUNIT', 'LAST_CAREUNIT',
                         'INTIME', 'OUTTIME', 'LOS']]
    # assign() returns a new frame, so the caller's PATIENTS table does not
    # silently gain a DOB column as a side effect of calling this function.
    patients = PATIENTS.assign(DOB=PATIENTS['ANCHOR_YEAR'] - PATIENTS['ANCHOR_AGE'])
    patients = patients[['SUBJECT_ID', 'GENDER', 'DOB', 'DOD']]

    # Left joins keep every patient, even those without admissions/ICU stays;
    # such rows are removed by the filters further down.
    df = pd.merge(patients, admissions, on='SUBJECT_ID', how='left')
    df = pd.merge(df, icustays, on='HADM_ID', how='left')
    print(df.shape)

    # Convert date columns to datetime
    date_columns = ['ADMITTIME', 'DISCHTIME', 'INTIME', 'OUTTIME']
    df[date_columns] = df[date_columns].apply(pd.to_datetime)

    # DOB is only a birth year here; treat it as January 1st of that year.
    df['DOB'] = pd.to_datetime(df['DOB'].astype(str) + '-01-01')

    # Age in whole years = floor(calendar-day difference / 365.242).
    # Each date is first turned into a float day offset from the epoch: this
    # cannot overflow the timedelta range for very distant dates, and it does
    # not depend on pandas inferring a timedelta dtype from object-dtype
    # arithmetic (which pandas >= 2.0 no longer does).
    epoch = pd.Timestamp('1970-01-01')
    one_day = pd.Timedelta(days=1)
    admit_day = (df['ADMITTIME'].dt.normalize() - epoch) / one_day
    birth_day = (df['DOB'].dt.normalize() - epoch) / one_day
    age = np.floor((admit_day - birth_day) / 365.242)
    # Ages above 89 are collapsed to 91; missing ages stay missing.
    age = age.mask(age > 89, 91)
    if not age.isna().any():
        # Keep an integer column when nothing is missing.
        age = age.astype('int64')
    df['AGE'] = age

    # Filter out patients under 18 (rows with a missing age are dropped too)
    df = df[df['AGE'] >= 18]
    print('AGE >= 18',df.shape)

    # Select specific columns for the final dataframe
    df = df[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ADMITTIME', 'INTIME', 'DISCHTIME', 'OUTTIME',
             'GENDER', 'DOB', 'DOD', 'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'RACE',
             'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'LOS', 'AGE']]

    # Order by patient and ICU admission time, then drop rows without an ICU stay
    df = df.sort_values(by=['SUBJECT_ID', 'INTIME'])
    df = df.dropna(subset=['INTIME'])

    # Flag rows of each patient's earliest admission
    earliest_admit = df.groupby('SUBJECT_ID')['ADMITTIME'].transform('min')
    df['FIRST_HADM'] = (df['ADMITTIME'] == earliest_admit).astype(int)

    # Flag the earliest ICU stay inside each (patient, admission) pair
    earliest_intime = df.groupby(['SUBJECT_ID', 'HADM_ID'])['INTIME'].transform('min')
    df['FIRST_ICU'] = (df['INTIME'] == earliest_intime).astype(int)

    # Keep only the first ICU stay of the first admission
    df = df[(df['FIRST_HADM'] == 1) & (df['FIRST_ICU'] == 1)]
    print('FIRST_HADM, FIRST_ICU',df.shape)

    # Keep rows with LOS greater than 1
    df = df[df['LOS'] > 1]
    print('LOS',df.shape)

    # Drop rows with missing critical columns
    df = df.dropna(subset=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ADMITTIME', 'INTIME', 'DOB'])
    print(df.shape)

    return df

In [291]:
def MIMICiii(ADMISSIONS, ICUSTAYS, PATIENTS):
    """Build the adult, first-ICU-stay cohort from MIMIC-III style tables.

    Parameters
    ----------
    ADMISSIONS : pandas.DataFrame
        Must contain SUBJECT_ID, HADM_ID, ADMITTIME, DISCHTIME, DEATHTIME,
        ADMISSION_TYPE, ADMISSION_LOCATION, RACE and HAS_CHARTEVENTS_DATA.
    ICUSTAYS : pandas.DataFrame
        Must contain HADM_ID, ICUSTAY_ID, DBSOURCE, FIRST_CAREUNIT,
        LAST_CAREUNIT, FIRST_WARDID, LAST_WARDID, INTIME, OUTTIME and LOS.
    PATIENTS : pandas.DataFrame
        Must contain SUBJECT_ID, GENDER, DOB and DOD.

    Returns
    -------
    pandas.DataFrame
        One row per retained ICU stay with the selected demographic and
        admission columns plus AGE, FIRST_HADM and FIRST_ICU.  A row is kept
        when AGE >= 18, HAS_CHARTEVENTS_DATA == 1, it belongs to the
        patient's earliest admission (among rows that have an ICU stay), it
        is the earliest ICU stay of that admission, and LOS > 1.  Rows tying
        for the earliest time are all kept.

    Intermediate frame shapes are printed after each filtering stage.
    """
    admissions = ADMISSIONS[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
                             'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'RACE',
                             'HAS_CHARTEVENTS_DATA']]
    icustays = ICUSTAYS[['HADM_ID', 'ICUSTAY_ID', 'DBSOURCE', 'FIRST_CAREUNIT', 'LAST_CAREUNIT',
                         'FIRST_WARDID', 'LAST_WARDID', 'INTIME', 'OUTTIME', 'LOS']]
    patients = PATIENTS[['SUBJECT_ID', 'GENDER', 'DOB', 'DOD']]

    # Step 1: Merge DataFrames (left joins keep patients without admissions
    # or ICU stays; such rows are removed by the filters further down)
    df = pd.merge(patients, admissions, on='SUBJECT_ID', how='left')
    df = pd.merge(df, icustays, on='HADM_ID', how='left')
    print(df.shape)

    # Step 2: Convert date columns to datetime format
    date_columns = ['DOB', 'ADMITTIME', 'DISCHTIME', 'INTIME', 'OUTTIME']
    df[date_columns] = df[date_columns].apply(pd.to_datetime)

    # Step 3: Calculate AGE = floor(calendar-day difference / 365.242).
    # Each date is first turned into a float day offset from the epoch: this
    # cannot overflow the timedelta range even when DOB lies centuries before
    # ADMITTIME, and it does not depend on pandas inferring a timedelta dtype
    # from object-dtype arithmetic (which pandas >= 2.0 no longer does).
    epoch = pd.Timestamp('1970-01-01')
    one_day = pd.Timedelta(days=1)
    admit_day = (df['ADMITTIME'].dt.normalize() - epoch) / one_day
    birth_day = (df['DOB'].dt.normalize() - epoch) / one_day
    age = np.floor((admit_day - birth_day) / 365.242)
    # Ages above 89 are collapsed to 91; missing ages stay missing.
    age = age.mask(age > 89, 91)
    if not age.isna().any():
        # Keep an integer column when nothing is missing.
        age = age.astype('int64')
    df['AGE'] = age

    # Step 4: Filter based on AGE, HAS_CHARTEVENTS_DATA
    df = df[df['AGE'] >= 18]
    df = df[df['HAS_CHARTEVENTS_DATA'] == 1]
    print('AGE >= 18',df.shape)

    # Step 5: Select relevant columns
    columns_to_keep = [
        'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'GENDER', 'ADMITTIME', 'INTIME', 'DISCHTIME', 'OUTTIME',
        'DOB', 'DOD', 'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'RACE', 'HAS_CHARTEVENTS_DATA',
        'DBSOURCE', 'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID', 'LOS', 'AGE'
    ]
    df = df[columns_to_keep]

    # Step 6: Sort by SUBJECT_ID and INTIME, and drop rows with missing INTIME
    df = df.sort_values(by=['SUBJECT_ID', 'INTIME'])
    df = df.dropna(subset=['INTIME'])

    # Step 7: Flag rows of each patient's earliest admission
    earliest_admit = df.groupby('SUBJECT_ID')['ADMITTIME'].transform('min')
    df['FIRST_HADM'] = (df['ADMITTIME'] == earliest_admit).astype(int)

    # Step 8: Flag the earliest ICU stay inside each (patient, admission) pair
    earliest_intime = df.groupby(['SUBJECT_ID', 'HADM_ID'])['INTIME'].transform('min')
    df['FIRST_ICU'] = (df['INTIME'] == earliest_intime).astype(int)

    # Step 9: Filter to keep only rows where both FIRST_HADM and FIRST_ICU are 1
    df = df[(df['FIRST_HADM'] == 1) & (df['FIRST_ICU'] == 1)]
    print('FIRST_HADM, FIRST_ICU',df.shape)

    # Step 10: Filter for LOS greater than 1
    df = df[df['LOS'] > 1]
    print('LOS',df.shape)

    # Drop rows with missing critical columns
    df = df.dropna(subset=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ADMITTIME', 'INTIME', 'DOB'])
    print(df.shape)

    return df

In [279]:
# Build the cohort from the tables loaded earlier in the notebook; the bare
# `df.shape` makes the notebook echo the final (rows, columns) tuple.
df = MIMICiii(ADMISSIONS,ICUSTAYS,PATIENTS)
df.shape

(62722, 21)
AGE >= 18 (52916, 22)
FIRST_HADM, FIRST_ICU (38470, 24)
LOS (32557, 24)
(32557, 24)


(32557, 24)

In [292]:
# Build the cohort with the MIMIC-IV extractor; `df.shape` echoes the result size.
# NOTE(review): MIMICiv reads PATIENTS['ANCHOR_YEAR'] / ['ANCHOR_AGE'], so this
# presumably requires the MIMIC-IV CSVs (the commented-out paths in the loading
# cell) to be loaded in place of the MIMIC-III ones first — confirm before re-running.
df = MIMICiv(ADMISSIONS,ICUSTAYS,PATIENTS)
df.shape

(696419, 17)
AGE >= 18 (555243, 18)
FIRST_HADM, FIRST_ICU (65366, 20)
LOS (51837, 20)
(51837, 20)


(51837, 20)