In [60]:
import pandas as pd
import numpy as np
import os

# Directory containing the MIMIC-III CSV tables. Every loader below builds its
# file path with os.path.join(data_dir, '<TABLE>.csv'); the path is relative,
# so it is resolved against the notebook's working directory.
data_dir = 'mimic-iii-clinical-database-1.4/' 

def get_all_patients():
    """Return the distinct (SUBJECT_ID, HADM_ID) pairs found in ADMISSIONS.csv.

    Returns:
        pandas.DataFrame with integer columns ``SUBJECT_ID`` and ``HADM_ID``
        (in that order), one row per unique pair.
    """
    id_columns = ['SUBJECT_ID', 'HADM_ID']
    admissions_path = os.path.join(data_dir, 'ADMISSIONS.csv')

    # Only the two identifier columns are loaded, both parsed as integers.
    id_pairs = pd.read_csv(admissions_path,
                           usecols=id_columns,
                           dtype={column: int for column in id_columns})

    return id_pairs.loc[:, id_columns].drop_duplicates()

# Extract demographic data
def get_demographics(hadm_ids):
    """Assemble demographic and length-of-stay features for the given admissions.

    Reads PATIENTS.csv, ADMISSIONS.csv and ICUSTAYS.csv from ``data_dir``,
    keeps the requested admissions and left-joins patients and ICU stays onto
    them. An admission with several ICU stays therefore yields several rows,
    and an admission without an ICU stay yields one row with missing ICU fields.

    Args:
        hadm_ids: List-like of HADM_ID values to keep.

    Returns:
        pandas.DataFrame with columns SUBJECT_ID, HADM_ID, ICUSTAY_ID, GENDER,
        ETHNICITY, AGE, LOS_HOSPITAL, LOS_ICU and HOSPITAL_EXPIRE_FLAG, with
        duplicate rows removed. AGE is the calendar-year difference between
        ADMITTIME and DOB (negative values become NaN, values above 89 become
        90). LOS_HOSPITAL and LOS_ICU are expressed in days.
    """
    patients = pd.read_csv(os.path.join(data_dir, 'PATIENTS.csv'),
                           parse_dates=['DOB'],
                           dtype={'SUBJECT_ID': int})

    admissions = pd.read_csv(os.path.join(data_dir, 'ADMISSIONS.csv'),
                             parse_dates=['ADMITTIME', 'DISCHTIME', 'DEATHTIME'],
                             dtype={'SUBJECT_ID': int, 'HADM_ID': int, 'HOSPITAL_EXPIRE_FLAG': int})

    icustays = pd.read_csv(os.path.join(data_dir, 'ICUSTAYS.csv'),
                           parse_dates=['INTIME', 'OUTTIME'],
                           dtype={'SUBJECT_ID': int, 'HADM_ID': int, 'ICUSTAY_ID': int})

    admissions = admissions[admissions['HADM_ID'].isin(hadm_ids)]
    icustays = icustays[icustays['HADM_ID'].isin(hadm_ids)]

    demographics = admissions.merge(patients, on='SUBJECT_ID', how='left')
    demographics = demographics.merge(icustays, on=['SUBJECT_ID', 'HADM_ID'], how='left')

    # Age is taken as a difference of calendar years rather than of timestamps:
    # de-identified patients have a DOB roughly 300 years before admission, and
    # subtracting such timestamps can overflow a nanosecond timedelta.
    birth_year = pd.to_datetime(demographics['DOB'], errors='coerce').dt.year
    admit_year = pd.to_datetime(demographics['ADMITTIME'], errors='coerce').dt.year
    age = admit_year - birth_year

    # Negative ages are data errors -> missing.
    age = age.mask(age < 0)
    # MIMIC-III shifts the DOB of patients older than 89 to exactly 300 years
    # before their first admission, so the year difference can be exactly 300.
    # Any value above 89 can only come from a shifted DOB; collapse them to 90.
    age = age.mask(age > 89, 90)
    demographics['AGE'] = age

    # Lengths of stay in days (86400 seconds per day).
    demographics['LOS_HOSPITAL'] = (demographics['DISCHTIME'] - demographics['ADMITTIME']).dt.total_seconds() / 86400
    demographics['LOS_ICU'] = (demographics['OUTTIME'] - demographics['INTIME']).dt.total_seconds() / 86400

    # Select required columns
    demographics = demographics[[
        'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'GENDER', 'ETHNICITY', 'AGE',
        'LOS_HOSPITAL', 'LOS_ICU', 'HOSPITAL_EXPIRE_FLAG'
    ]].drop_duplicates()

    return demographics

#  Extract vital signs from the first 24 hours of ICU stay
def get_vital_signs(icustay_ids):
    """Aggregate vital signs charted during the first 24 hours of each ICU stay.

    Streams CHARTEVENTS.csv from ``data_dir`` in chunks, keeps the rows that
    belong to the requested stays and to the vital-sign ITEMIDs listed below,
    and computes mean/min/max per stay and vital sign over the window
    0 <= hours since ICUSTAYS.INTIME <= 24.

    Temperature is reported in degrees Celsius: readings charted under the
    Fahrenheit ITEMIDs are converted before aggregation so that both scales
    can be pooled.

    Args:
        icustay_ids: List-like of ICUSTAY_ID values to keep.

    Returns:
        pandas.DataFrame with one row per ICU stay: column ``ICUSTAY_ID`` plus
        one column per statistic and vital sign named ``<stat>_<VitalSign>``
        (e.g. ``mean_HeartRate``, ``max_Temperature``).
    """
    vital_signs_items = {
        'HeartRate': [211, 220045],
        'SysBP': [51, 442, 455, 6701, 220179, 220050],
        'DiasBP': [8368, 8441, 8555, 220180, 220051],
        'MeanBP': [52, 6702, 443, 456, 220181, 225312],
        'RespRate': [618, 615, 220210, 224690],
        'Temperature': [223761, 678, 676, 223762],
        'SpO2': [646, 220277],
    }
    # Temperature ITEMIDs whose values are charted in Fahrenheit (D_ITEMS labels
    # "Temperature Fahrenheit" / "Temperature F"); 676 and 223762 are Celsius.
    fahrenheit_itemids = [223761, 678]

    item_ids = []
    item_labels = []
    for label, ids in vital_signs_items.items():
        item_ids.extend(ids)
        item_labels.extend([label] * len(ids))

    itemid_label_map = pd.DataFrame({'ITEMID': item_ids, 'VITAL_SIGN': item_labels})

    vital_signs_list = []

    # CHARTEVENTS is too large to load at once, so it is filtered chunk by chunk.
    chunksize = 10 ** 6
    for chunk in pd.read_csv(os.path.join(data_dir, 'CHARTEVENTS.csv'),
                             chunksize=chunksize,
                             parse_dates=['CHARTTIME'],
                             usecols=['ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM'],
                             dtype={'ICUSTAY_ID': float, 'ITEMID': int, 'VALUENUM': float}):
        chunk = chunk[chunk['ICUSTAY_ID'].isin(icustay_ids)]
        chunk = chunk[chunk['ITEMID'].isin(item_ids)]
        chunk = chunk.dropna(subset=['VALUENUM'])
        chunk = chunk.merge(itemid_label_map, on='ITEMID', how='left')

        # Bring Fahrenheit readings onto the Celsius scale; without this the
        # Temperature statistics would mix two units.
        in_fahrenheit = chunk['ITEMID'].isin(fahrenheit_itemids)
        chunk = chunk.assign(
            VALUENUM=chunk['VALUENUM'].mask(in_fahrenheit, (chunk['VALUENUM'] - 32) / 1.8)
        )

        vital_signs_list.append(chunk)

    # Concatenate all chunks
    if vital_signs_list:
        vital_signs = pd.concat(vital_signs_list, ignore_index=True)
    else:
        vital_signs = pd.DataFrame(columns=['ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VITAL_SIGN'])

    icustays = pd.read_csv(os.path.join(data_dir, 'ICUSTAYS.csv'),
                           usecols=['ICUSTAY_ID', 'INTIME'],
                           parse_dates=['INTIME'],
                           dtype={'ICUSTAY_ID': int})

    icustays = icustays[icustays['ICUSTAY_ID'].isin(icustay_ids)][['ICUSTAY_ID', 'INTIME']]

    vital_signs = vital_signs.merge(icustays, on='ICUSTAY_ID', how='left')

    # Hours elapsed between ICU admission and the charted measurement.
    vital_signs['HOURS_FROM_ICU_ADMIT'] = (vital_signs['CHARTTIME'] - vital_signs['INTIME']).dt.total_seconds() / 3600

    # Keep data from the first 24 hours
    vital_signs = vital_signs[(vital_signs['HOURS_FROM_ICU_ADMIT'] >= 0) & (vital_signs['HOURS_FROM_ICU_ADMIT'] <= 24)]

    # Aggregate per stay and vital sign, then spread to one wide row per stay.
    agg_vital_signs = vital_signs.groupby(['ICUSTAY_ID', 'VITAL_SIGN'])['VALUENUM'].agg(['mean', 'min', 'max']).reset_index()
    pivot_vital_signs = agg_vital_signs.pivot(index='ICUSTAY_ID', columns='VITAL_SIGN')
    # Flatten the (statistic, vital sign) column MultiIndex into "<stat>_<VitalSign>".
    pivot_vital_signs.columns = ['_'.join(col).strip() for col in pivot_vital_signs.columns.values]
    pivot_vital_signs = pivot_vital_signs.reset_index()

    return pivot_vital_signs

# Extract laboratory values from the first 24 hours after hospital admission
def get_lab_values(hadm_ids):
    """Aggregate laboratory results from the first 24 hours after hospital admission.

    Streams LABEVENTS.csv from ``data_dir`` in chunks, keeps the rows that
    belong to the requested admissions and to the lab ITEMIDs listed below,
    and computes mean/min/max per admission and lab test over the window
    0 <= hours since ADMISSIONS.ADMITTIME <= 24. Note that the window is
    anchored on hospital admission time, not on ICU admission time.

    Args:
        hadm_ids: List-like of HADM_ID values to keep.

    Returns:
        pandas.DataFrame with one row per admission: column ``HADM_ID`` plus
        one column per statistic and lab test named ``<stat>_<LabTest>``
        (e.g. ``mean_WBC``, ``max_Creatinine``).
    """
    lab_items = {
        # 51301 ("White Blood Cells") is mapped to WBC alongside 51300 in the
        # MIMIC code repository's first-day labs query; using 51300 alone
        # leaves that item out of the WBC statistics.
        'WBC': [51300, 51301],
        'Hemoglobin': [51222],
        'Hematocrit': [51221],
        'Platelets': [51265],
        'Glucose': [50931],
        'Creatinine': [50912],
        'BUN': [51006],
        'Sodium': [50983],
        'Potassium': [50971],
        'Chloride': [50902],
        'CO2': [50882],
    }

    item_ids = []
    item_labels = []
    for label, ids in lab_items.items():
        item_ids.extend(ids)
        item_labels.extend([label] * len(ids))

    itemid_label_map = pd.DataFrame({'ITEMID': item_ids, 'LAB_TEST': item_labels})

    lab_values_list = []

    # LABEVENTS is filtered chunk by chunk to bound memory use. HADM_ID is read
    # as float because the column contains missing values in the file.
    chunksize = 10 ** 6
    for chunk in pd.read_csv(os.path.join(data_dir, 'LABEVENTS.csv'),
                             chunksize=chunksize,
                             parse_dates=['CHARTTIME'],
                             usecols=['HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM'],
                             dtype={'HADM_ID': float, 'ITEMID': int, 'VALUENUM': float}):
        chunk = chunk[chunk['HADM_ID'].isin(hadm_ids)]
        chunk = chunk[chunk['ITEMID'].isin(item_ids)]
        chunk = chunk.dropna(subset=['VALUENUM'])
        chunk = chunk.merge(itemid_label_map, on='ITEMID', how='left')

        lab_values_list.append(chunk)

    # Concatenate all chunks
    if lab_values_list:
        lab_values = pd.concat(lab_values_list, ignore_index=True)
    else:
        lab_values = pd.DataFrame(columns=['HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'LAB_TEST'])

    admissions = pd.read_csv(os.path.join(data_dir, 'ADMISSIONS.csv'),
                             usecols=['HADM_ID', 'ADMITTIME'],
                             parse_dates=['ADMITTIME'],
                             dtype={'HADM_ID': int})

    admissions = admissions[admissions['HADM_ID'].isin(hadm_ids)][['HADM_ID', 'ADMITTIME']]

    lab_values = lab_values.merge(admissions, on='HADM_ID', how='left')

    # Hours elapsed between hospital admission and the lab measurement.
    lab_values['HOURS_FROM_ADMIT'] = (lab_values['CHARTTIME'] - lab_values['ADMITTIME']).dt.total_seconds() / 3600

    lab_values = lab_values[(lab_values['HOURS_FROM_ADMIT'] >= 0) & (lab_values['HOURS_FROM_ADMIT'] <= 24)]

    # Aggregate per admission and lab test, then spread to one wide row per admission.
    agg_lab_values = lab_values.groupby(['HADM_ID', 'LAB_TEST'])['VALUENUM'].agg(['mean', 'min', 'max']).reset_index()
    pivot_lab_values = agg_lab_values.pivot(index='HADM_ID', columns='LAB_TEST')
    # Flatten the (statistic, lab test) column MultiIndex into "<stat>_<LabTest>".
    pivot_lab_values.columns = ['_'.join(col).strip() for col in pivot_lab_values.columns.values]
    pivot_lab_values = pivot_lab_values.reset_index()

    return pivot_lab_values

# Extract interventions (mechanical ventilation, renal replacement therapy)
def get_interventions(icustay_ids, hadm_ids):
    """Flag mechanical ventilation and renal replacement therapy per ICU stay.

    Mechanical ventilation is assumed for a stay when any of ``mv_itemids`` is
    charted for it in CHARTEVENTS.csv. Renal replacement therapy (RRT) is
    assumed when the stay's hospital admission carries one of the RRT ICD-9
    procedure codes in PROCEDURES_ICD.csv; it is therefore an admission-level
    flag copied onto every ICU stay of that admission.

    Args:
        icustay_ids: List-like of ICUSTAY_ID values; one output row is produced
            per entry.
        hadm_ids: Accepted for interface compatibility; not used, because the
            admission of each stay is looked up in ICUSTAYS.csv.

    Returns:
        pandas.DataFrame with columns ICUSTAY_ID, MECHANICAL_VENTILATION and
        RENAL_REPLACEMENT_THERAPY (flags are 1 when present, 0 otherwise).
    """
    # Mechanical ventilation
    # Assuming presence of mechanical ventilation if certain ITEMIDs are charted
    mv_itemids = [720, 721, 722, 223848, 223849]

    mv_list = []

    chunksize = 10 ** 6
    for chunk in pd.read_csv(os.path.join(data_dir, 'CHARTEVENTS.csv'),
                             chunksize=chunksize,
                             usecols=['ICUSTAY_ID', 'ITEMID'],
                             dtype={'ICUSTAY_ID': float, 'ITEMID': int}):
        chunk = chunk[chunk['ICUSTAY_ID'].isin(icustay_ids)]
        chunk = chunk[chunk['ITEMID'].isin(mv_itemids)]

        mv_list.append(chunk[['ICUSTAY_ID']].drop_duplicates())

    if mv_list:
        mv_df = pd.concat(mv_list, ignore_index=True).drop_duplicates()
        mv_df['MECHANICAL_VENTILATION'] = 1
    else:
        # Typed empty frame: an object-dtype key column would make the merge
        # against the integer ICUSTAY_ID column below fail.
        mv_df = pd.DataFrame({'ICUSTAY_ID': pd.Series(dtype=float),
                              'MECHANICAL_VENTILATION': pd.Series(dtype=float)})

    # Renal replacement therapy (RRT)
    # Assuming presence of RRT if certain procedure codes are present
    procedures_icd = pd.read_csv(os.path.join(data_dir, 'PROCEDURES_ICD.csv'),
                                 dtype={'ICD9_CODE': str, 'HADM_ID': int})

    # ICD-9 codes for RRT
    rrt_icd9_codes = ['39.95', '54.98', '38.95']

    # MIMIC-III stores ICD-9 codes without the decimal point (e.g. '3995'), so a
    # literal comparison against the dotted form never matches. Dots are removed
    # on both sides so that either notation is recognised.
    rrt_codes_plain = [code.replace('.', '') for code in rrt_icd9_codes]
    recorded_codes = procedures_icd['ICD9_CODE'].str.replace('.', '', regex=False).str.strip()

    rrt_df = procedures_icd.loc[recorded_codes.isin(rrt_codes_plain), ['HADM_ID']].drop_duplicates()
    rrt_df = rrt_df.assign(RENAL_REPLACEMENT_THERAPY=1)

    # Merge interventions
    interventions = pd.DataFrame({'ICUSTAY_ID': icustay_ids})
    interventions = interventions.merge(mv_df, on='ICUSTAY_ID', how='left')
    interventions = interventions.fillna({'MECHANICAL_VENTILATION': 0})

    # Map ICUSTAY_ID to HADM_ID
    icustays = pd.read_csv(os.path.join(data_dir, 'ICUSTAYS.csv'),
                           dtype={'ICUSTAY_ID': int, 'HADM_ID': int})
    interventions = interventions.merge(icustays[['ICUSTAY_ID', 'HADM_ID']], on='ICUSTAY_ID', how='left')

    interventions = interventions.merge(rrt_df, on='HADM_ID', how='left')
    interventions = interventions.fillna({'RENAL_REPLACEMENT_THERAPY': 0})

    interventions = interventions[['ICUSTAY_ID', 'MECHANICAL_VENTILATION', 'RENAL_REPLACEMENT_THERAPY']]

    return interventions

# Step 6: Extract comorbidities (e.g., diabetes, malignant tumor)
def get_comorbidities(hadm_ids):
    """Build one indicator column per comorbidity for the given admissions.

    An admission is flagged when DIAGNOSES_ICD.csv lists, for that HADM_ID, an
    ICD9_CODE that exactly equals one of the codes configured for the
    comorbidity.

    Args:
        hadm_ids: List-like of HADM_ID values; one output row per entry.

    Returns:
        pandas.DataFrame with column HADM_ID followed by one column per
        comorbidity (DIABETES, MALIGNANT_TUMOR); missing values are filled
        with 0 across the whole frame.
    """
    # Exact ICD-9 diagnosis codes (no decimal point) counted for each flag.
    comorbidity_codes = {
        'DIABETES': ['25000', '25002', '25040', '25060', '25090'],
        'MALIGNANT_TUMOR': ['1400', '20936', '20970', '20974', '20975', '20979']
    }

    diagnoses_path = os.path.join(data_dir, 'DIAGNOSES_ICD.csv')
    diagnoses_icd = pd.read_csv(diagnoses_path,
                                dtype={'ICD9_CODE': str, 'HADM_ID': int})

    result = pd.DataFrame({'HADM_ID': hadm_ids})

    for flag_name, icd9_codes in comorbidity_codes.items():
        has_code = diagnoses_icd['ICD9_CODE'].isin(icd9_codes)
        # One row per admission carrying at least one matching code.
        flagged = (
            diagnoses_icd.loc[has_code, ['HADM_ID']]
            .drop_duplicates()
            .assign(**{flag_name: 1})
        )
        result = result.merge(flagged, how='left', on='HADM_ID')

    # Admissions without a match come out of the left joins as NaN -> 0.
    return result.fillna(0)

# Step 7: Merge all data into a single DataFrame
def merge_data(demographics, vital_signs, lab_values, interventions, comorbidities):
    """Left-join all feature tables onto the demographics table.

    Vital signs and interventions are joined on ICUSTAY_ID; lab values and
    comorbidities are joined on HADM_ID. Every row of ``demographics`` is kept.

    Missing values are handled per column type:
      * indicator columns (every non-key column of ``interventions`` and
        ``comorbidities``) are filled with 0, meaning "not recorded";
      * all other columns (identifiers, age, lengths of stay, vital-sign and
        lab statistics) are left as NaN. Filling those with 0 would turn a
        missing measurement into an impossible value such as a heart rate of 0.

    Args:
        demographics: DataFrame with at least ICUSTAY_ID and HADM_ID.
        vital_signs: DataFrame keyed by ICUSTAY_ID.
        lab_values: DataFrame keyed by HADM_ID.
        interventions: DataFrame keyed by ICUSTAY_ID with indicator columns.
        comorbidities: DataFrame keyed by HADM_ID with indicator columns.

    Returns:
        pandas.DataFrame with one row per row of ``demographics`` (provided the
        other tables have unique keys) and the columns of all inputs.
    """
    # Merge on ICUSTAY_ID and HADM_ID as appropriate
    data = demographics.merge(vital_signs, on='ICUSTAY_ID', how='left')
    data = data.merge(interventions, on='ICUSTAY_ID', how='left')
    data = data.merge(lab_values, on='HADM_ID', how='left')
    data = data.merge(comorbidities, on='HADM_ID', how='left')

    # Only binary flags have a meaningful "absent" value of 0.
    key_columns = ('ICUSTAY_ID', 'HADM_ID')
    indicator_columns = [
        column
        for column in list(interventions.columns) + list(comorbidities.columns)
        if column not in key_columns and column in data.columns
    ]
    if indicator_columns:
        data = data.fillna({column: 0 for column in indicator_columns})

    return data

def main(output_path='all_patients_data.csv'):
    """Run the full extraction pipeline and write the merged table to CSV.

    Steps: list all admissions, build demographics, derive the ICU stay ids
    present in the demographics table, extract vital signs, lab values,
    interventions and comorbidities, merge everything and save the result.

    Args:
        output_path: Destination CSV file. Defaults to 'all_patients_data.csv'
            in the current working directory.
    """
    all_patients = get_all_patients()
    hadm_ids = all_patients['HADM_ID'].unique()

    demographics = get_demographics(hadm_ids)
    # ICUSTAY_ID is missing for admissions without an ICU stay; drop those
    # before casting back to integer ids.
    icustay_ids = demographics['ICUSTAY_ID'].dropna().astype(int).unique()

    vital_signs = get_vital_signs(icustay_ids)

    lab_values = get_lab_values(hadm_ids)

    interventions = get_interventions(icustay_ids, hadm_ids)

    comorbidities = get_comorbidities(hadm_ids)

    final_data = merge_data(demographics, vital_signs, lab_values, interventions, comorbidities)

    final_data.to_csv(output_path, index=False)

    print(f"Data extraction completed. The data is saved to '{output_path}'.")


In [61]:
# Run the extraction pipeline defined above; it writes the merged table to a
# CSV file and prints a confirmation message.
main()

Data extraction completed. The data is saved to 'all_patients_data.csv'.


In [67]:
# Reload the table written by main() for inspection.
data=pd.read_csv("all_patients_data.csv")

# Second table, read from "cleaned_data.csv"; that file is not produced by any
# of the cells shown here.
# NOTE(review): "trail_data" may be a typo for "trial_data" - confirm before
# renaming, since a later cell refers to this name.
trail_data=pd.read_csv("cleaned_data.csv")

In [77]:
# (rows, columns) of the extracted table.
# NOTE(review): this cell's execution count (77) is higher than the next
# cell's (69), so the cells were run out of order - re-run top to bottom.
data.shape

(62722, 67)

In [69]:
# Column names of the table loaded from "cleaned_data.csv".
trail_data.columns

Index(['icustay_id', 'hadm_id', 'intime', 'outtime', 'dbsource',
       'suspected_infection_time_poe', 'suspected_infection_time_poe_days',
       'specimen_poe', 'positiveculture_poe', 'antibiotic_time_poe',
       ...
       'spo2_max', 'spo2_mean', 'glucose_min1', 'glucose_max1', 'glucose_mean',
       'rrt', 'subject_id', 'urineoutput', 'colloid_bolus',
       'crystalloid_bolus'],
      dtype='object', length=104)