# MIMIC Results

In [None]:
import os
import numpy as np
import pandas as pd

import psycopg2
from tqdm.notebook import tqdm
tqdm.pandas()
from typing import List, Optional

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Global figure styling: seaborn "white" theme, then override the font so text renders in a serif face.
sns.set_theme(style="white")
plt.rcParams.update({
    "font.family": "serif",
    # NOTE(review): "Computer Modern" must be available to matplotlib on this machine;
    # presumably matplotlib falls back to another serif font otherwise — confirm.
    "font.serif": ["Computer Modern"],
})

In [None]:
# Data location and Postgres credentials.
# Each value can be supplied through an environment variable so that secrets never have to be
# typed into (and saved with) the notebook; the literal placeholders remain as fallbacks and
# can still be edited in place.
MIMIC_FOLDER = os.environ.get("MIMIC_FOLDER", "") # YOUR FOLDER HERE
# Keyword arguments instead of a hand-built DSN string: psycopg2 takes care of quoting, so a
# username or password containing spaces or quotes cannot corrupt the connection string.
conn = psycopg2.connect(
    user=os.environ.get("PGUSER", "<YOUR USERNAME HERE>"),
    password=os.environ.get("PGPASSWORD", "<YOUR PASSWORD HERE>"),
    host="127.0.0.1",
)

def build_query(
    table_name: str,
    column_names: Optional[List[str]] = None,
    conditions: Optional[List[str]] = None,
    limit: Optional[int] = None
) -> str:
    """Assemble a simple ``SELECT ... from <table> [WHERE ...] [LIMIT n]`` statement.

    Args:
        table_name: Table (optionally schema-qualified) to select from.
        column_names: Columns to select; ``None`` or an empty list selects ``*``.
        conditions: SQL boolean expressions joined with ``AND``; ``None`` or an
            empty list omits the ``WHERE`` clause entirely.
        limit: Maximum number of rows; ``None`` omits the ``LIMIT`` clause
            (``0`` is a valid limit and is kept).

    Returns:
        The SQL text. All pieces are interpolated verbatim with no escaping, so
        only pass trusted, hand-written fragments.
    """
    # Truthiness (not `is not None`) so that empty lists fall back to the
    # defaults instead of producing "SELECT  from ..." or a dangling " WHERE ".
    col_str = ",".join(column_names) if column_names else "*"
    condition_str = (" WHERE " + " AND ".join(conditions)) if conditions else ""
    limit_str = f" LIMIT {limit}" if limit is not None else ""
    return f"SELECT {col_str} from {table_name}" + condition_str + limit_str


def run_query(query: str, preview: Optional[bool] = True, save_to: Optional[str] = None, **kwargs):
    """Run a SQL query on the notebook's Postgres connection and return a DataFrame.

    Args:
        query: SQL text to execute; it is echoed to stdout first.
        preview: When truthy, render the result with the notebook's ``display``.
        save_to: Optional CSV path; when given, the result is written there.
        **kwargs: Forwarded to ``DataFrame.to_csv`` (only used with ``save_to``).

    Returns:
        The query result as a ``pandas.DataFrame``.
    """
    print("EXECUTING QUERY:", query)
    df = pd.read_sql_query(query, conn)
    if preview:
        display(df)
    if save_to is not None:
        df.to_csv(save_to, **kwargs)
        # Announce the file only once the write has actually succeeded.
        print(f"Saved to {save_to}")
    return df
        

## List tables in Postgres

Check that the MIMIC-IV derived concept tables are present in the `mimic_derived` schema. If they are missing, follow the instructions in [the MIMIC repository](https://github.com/MIT-LCP/mimic-code/tree/main/mimic-iv/concepts) to generate the relevant Postgres tables.

In [None]:
# List every table Postgres knows about in the `mimic_derived` schema (via the pg_tables catalog);
# run_query echoes the SQL and displays the resulting frame.
CONCEPT_LIST_QUERY = build_query("pg_tables", conditions=["schemaname = 'mimic_derived'"])
run_query(CONCEPT_LIST_QUERY)

## Load data

In [None]:
# Resolve every raw MIMIC-IV v1.0 CSV against MIMIC_FOLDER so the notebook runs on any machine
# (the hosp/ files previously pointed at a hard-coded /data4 path).
MIMIC_IV_ROOT = f"{MIMIC_FOLDER}/mimiciv/1.0"

patients_df = pd.read_csv(f"{MIMIC_IV_ROOT}/core/patients.csv")
hadms = pd.read_csv(f"{MIMIC_IV_ROOT}/core/admissions.csv", low_memory=False)
labs_df = pd.read_csv(f"{MIMIC_IV_ROOT}/hosp/labevents.csv", low_memory=False)
micros_df = pd.read_csv(f"{MIMIC_IV_ROOT}/hosp/microbiologyevents.csv", low_memory=False)
lab_items_df = pd.read_csv(f"{MIMIC_IV_ROOT}/hosp/d_labitems.csv", low_memory=False)

# Attach each admission's patient gender with one vectorized lookup instead of scanning
# patients_df once per admission row.
# verify_integrity keeps the old "exactly one patient row per subject_id" guarantee for
# duplicates; the assert keeps it for missing patients (both used to fail inside `.item()`).
gender_by_subject = patients_df.set_index("subject_id", verify_integrity=True).gender
assert hadms.subject_id.isin(gender_by_subject.index).all(), "admission with no matching patient row"
hadms["gender"] = hadms.subject_id.map(gender_by_subject)


In [None]:
# Study population: admission ids (hadm_id) grouped by the recorded ethnicity label.
white_hadms = set(
    hadms.loc[hadms.ethnicity == "WHITE", "hadm_id"].dropna().tolist()
)
black_hadms = set(
    hadms.loc[hadms.ethnicity == "BLACK/AFRICAN AMERICAN", "hadm_id"].dropna().tolist()
)

### Summarize data

In [None]:
# Distinct admission counts: overall, then for each of the two study groups.
print("# of all admissions:", np.unique(hadms.hadm_id).size)
print(
    "# of Black patient admissions:",
    np.unique(hadms.loc[hadms.ethnicity == "BLACK/AFRICAN AMERICAN", "hadm_id"]).size,
)
print(
    "# of White patient admissions:",
    np.unique(hadms.loc[hadms.ethnicity == "WHITE", "hadm_id"]).size,
)

## CBC 

In [None]:
from statsmodels.stats.proportion import test_proportions_2indep 

def test_disparate_censorship(lab_df, white_hadms=white_hadms, black_hadms=black_hadms):
    """Compare how often a test appears among White vs. Black admissions.

    An admission counts as tested when its hadm_id occurs (non-null) in
    ``lab_df.hadm_id``. The two testing proportions are compared with a
    two-sided ``test_proportions_2indep`` on their difference (null value 0).

    Args:
        lab_df: Frame with an ``hadm_id`` column, one row per test record.
        white_hadms: Set of hadm_ids forming the White group (bound to the
            module-level set at definition time).
        black_hadms: Set of hadm_ids forming the Black group (likewise).

    Returns:
        ``(statistic, pval, rate0, rate1)`` where ``rate0`` / ``rate1`` are the
        tested fractions of the White / Black groups.
    """
    tested_hadms = set(lab_df.hadm_id.dropna().tolist())

    n_white, n_black = len(white_hadms), len(black_hadms)
    tested_white = len(tested_hadms & white_hadms)
    tested_black = len(tested_hadms & black_hadms)
    tested_total = tested_white + tested_black

    # Share of the whole study population (both groups) with at least one test record.
    coverage_pct = 100 * tested_total / (n_white + n_black)
    print(tested_total, f"({coverage_pct:.2f}%)", "admissions with tests found in study population")

    statistic, pval = test_proportions_2indep(
        tested_white, n_white,
        tested_black, n_black,
        value=0., compare='diff', alternative='two-sided',
    )
    rate0 = tested_white / n_white
    rate1 = tested_black / n_black
    print(f"P(T|WHITE) = {100*rate0:.2f}% ({tested_white}/{n_white})")
    print(f"P(T|BLACK) = {100*rate1:.2f}% ({tested_black}/{n_black})")
    print("z:", statistic, "p:", pval)
    return statistic, pval, rate0, rate1

In [None]:
# Fetch the whole derived complete_blood_count table; column_names=["*"] selects every column,
# which is also what build_query does when column_names is omitted.
cbc_df = run_query(build_query("mimic_derived.complete_blood_count", column_names=["*"]), preview=True)

In [None]:
# Any row in the CBC table counts as "tested" for its hadm_id; returns (z, p, White rate, Black rate).
z_cbc, p_cbc, w_cbc, b_cbc = test_disparate_censorship(cbc_df)

In [None]:
# Fetch the whole derived blood_differential table (all columns).
cbc_with_diff_df = run_query(build_query("mimic_derived.blood_differential", column_names=["*"]), preview=True)


In [None]:
# Keep only rows that carry at least one differential value: `wbc` is dropped first, then any
# row whose listed differential columns are all missing is discarded.
cbc_diff_corrected = (
    cbc_with_diff_df
    .drop(columns=["wbc"])
    .dropna(
        how="all",
        subset=[
            # absolute counts
            "basophils_abs", "eosinophils_abs", "lymphocytes_abs",
            "monocytes_abs", "neutrophils_abs",
            # same five cell types without the _abs suffix
            "basophils", "eosinophils", "lymphocytes", "monocytes", "neutrophils",
            # remaining differential columns
            "atypical_lymphocytes", "bands", "immature_granulocytes",
            "metamyelocytes", "nrbc",
        ],
    )
)
z_cbcdiff, p_cbcdiff, w_cbcdiff, b_cbcdiff = test_disparate_censorship(cbc_diff_corrected)

## Troponin T

In [None]:
# Fetch the whole derived cardiac_marker table (all columns); troponin_t is read from it below.
cardiac_df = run_query(build_query("mimic_derived.cardiac_marker", column_names=["*"]), preview=True)


In [None]:
# Only rows with a recorded troponin_t value count as a troponin T test.
z_trop, p_trop, w_trop, b_trop = test_disparate_censorship(cardiac_df.dropna(subset=["troponin_t"]))


## BNP

In [None]:
# BNP is taken straight from the raw labevents rows rather than from a derived table.
# NOTE(review): 50963 is presumably the BNP-related itemid in d_labitems — confirm against lab_items_df.
z_bnp, p_bnp, w_bnp, b_bnp = test_disparate_censorship(labs_df[labs_df.itemid == 50963]) # BNP


## D-dimer

In [None]:
# Fetch the whole derived coagulation table (all columns); d_dimer is read from it below.
clot_df = run_query(build_query("mimic_derived.coagulation", column_names=["*"]), preview=True)


In [None]:
# Only rows with a recorded d_dimer value count as a D-dimer test.
z_ddimer, p_ddimer, w_ddimer, b_ddimer = test_disparate_censorship(clot_df.dropna(subset=["d_dimer"]))


## ABG

In [None]:
# Fetch the whole derived bg (blood gas) table (all columns).
abg_df = run_query(build_query("mimic_derived.bg", column_names=["*"]), preview=True)


In [None]:
# A row counts as a blood-gas test when at least one of po2 / pco2 is recorded.
# NOTE(review): no filter on specimen type is applied here, so rows are presumably not
# restricted to arterial samples — confirm this matches the intended "ABG" definition.
z_abg, p_abg, w_abg, b_abg = test_disparate_censorship(abg_df.dropna(how="all", subset=["po2","pco2"]))

## BMP

In [None]:
# Fetch the whole derived chemistry table (all columns); the BMP analytes are filtered below.
bmp_df = run_query(build_query("mimic_derived.chemistry", column_names=["*"]), preview=True)


In [None]:
# Keep chemistry rows with at least one of the eight listed BMP analytes recorded;
# rows where all eight are missing are dropped.
bmp_df_corrected = bmp_df.dropna(how="all", subset=["bicarbonate","bun","calcium","chloride","creatinine","glucose","sodium","potassium"])

z_bmp, p_bmp, w_bmp, b_bmp = test_disparate_censorship(bmp_df_corrected)

## Blood culture orders

In [None]:
# Microbiology rows whose specimen type is exactly "BLOOD CULTURE" and that are linked to an admission.
blood_cultures_df = micros_df[(micros_df.spec_type_desc == "BLOOD CULTURE") & ~micros_df.hadm_id.isna()]


In [None]:
# Any blood-culture row counts as "tested" for its hadm_id.
z_bc, p_bc, w_bc, b_bc = test_disparate_censorship(blood_cultures_df)

## CXR

Note: after re-running this notebook and regenerating the CXR testing information, the CXR testing disparity no longer appears to be statistically significant. As a result, the figure generated here differs slightly from the one in the paper.

In [None]:
# created using notebook at https://github.com/MIT-LCP/mimic-cxr/blob/master/dcm/create-mimic-cxr-jpg-metadata.ipynb
# The path below is a placeholder: point it at your local copy of the metadata file.
# Later cells read the subject_id, StudyDate and StudyTime columns from this frame.
cxrs = pd.read_csv("/path/to/mimic-cxr-2.0.0-metadata.csv.gz")

In [None]:
from datetime import datetime
from functools import partial
import re


def datestr_to_time(date_str):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a ``datetime``."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(date_str, timestamp_format)
    return parsed

def select_hadm_id_by_time(record, admissions=None):
    """Return the hadm_id of the admission during which a CXR study was taken, or None.

    Matching is done in two steps:

    1. Keep the patient's admissions whose admit *date* <= study date <= discharge *date*.
    2. Only if several admissions survive step 1 (e.g. a discharge and a re-admission on
       the study day), break the tie by comparing full date+time stamps. The study is
       dropped (None) unless exactly one admission contains it.

    Args:
        record: Object exposing ``subject_id``, ``StudyDate`` (integer YYYYMMDD) and
            ``StudyTime`` (numeric HHMMSS, fractional part ignored), e.g. one row of ``cxrs``.
        admissions: Frame with ``subject_id``, ``hadm_id``, ``admittime`` and ``dischtime``
            (``'YYYY-MM-DD HH:MM:SS'`` strings). Defaults to the notebook-level ``hadms``.

    Returns:
        The matching hadm_id, or None when there is no match or the match is ambiguous.
    """
    def filter_nonnumerics(i, s):
        # "2150-01-03 18:00:05" -> 21500103 for i=0 (date part), 180005 for i=1 (time part)
        return int(''.join(re.findall(r'\d+', s.split()[i])))

    if admissions is None:
        admissions = hadms

    # Copy so the helper columns are written to a private frame rather than a slice of `admissions`.
    hadms_for_patient = admissions[admissions.subject_id == record.subject_id].copy()
    if not len(hadms_for_patient): return None

    hadms_for_patient["admittime_numeric"] = hadms_for_patient.admittime.apply(partial(filter_nonnumerics, 0))
    hadms_for_patient["dischtime_numeric"] = hadms_for_patient.dischtime.apply(partial(filter_nonnumerics, 0))

    on_stay_dates = (
        (hadms_for_patient.admittime_numeric <= record.StudyDate)
        & (record.StudyDate <= hadms_for_patient.dischtime_numeric)
    )
    candidate_records = hadms_for_patient[on_stay_dates]
    if not len(candidate_records): return None
    if len(candidate_records) > 1:
        # Tie-break on YYYYMMDDHHMMSS stamps. Comparing time of day alone is wrong for stays
        # that span several days, and the previous `a <= t & (t <= b)` expression was also
        # parsed as `a <= (t & (...))` because `&` binds tighter than `<=`.
        study_stamp = int(record.StudyDate) * 10**6 + int(record.StudyTime)
        admit_stamp = (candidate_records.admittime_numeric * 10**6
                       + candidate_records.admittime.apply(partial(filter_nonnumerics, 1)))
        disch_stamp = (candidate_records.dischtime_numeric * 10**6
                       + candidate_records.dischtime.apply(partial(filter_nonnumerics, 1)))

        candidate_records = candidate_records[(admit_stamp <= study_stamp) & (study_stamp <= disch_stamp)]
        if len(candidate_records) != 1: return None # not for an admission
    return candidate_records.hadm_id.item()

# Row-wise assignment of each study to an admission, with a tqdm progress bar (progress_apply
# is registered by tqdm.pandas() in the import cell). Every call filters `hadms` by subject_id,
# so the cost grows with len(cxrs) x len(hadms); expect this cell to be slow.
cxrs["hadm_id"] = cxrs.progress_apply(select_hadm_id_by_time, axis=1)


In [None]:
# Number of distinct hadm_id values assigned to studies; unmatched studies (None) contribute
# one extra "missing" value to this count if any exist.
len(cxrs.hadm_id.unique())

In [None]:
# Restrict the CXR comparison to admissions of patients whose anchor_year_group is 2011-2013 or 2014-2016.
# NOTE(review): presumably chosen to overlap the years covered by MIMIC-CXR — confirm.
cxr_cohort = pd.unique(patients_df[patients_df.anchor_year_group.isin(["2011 - 2013", '2014 - 2016'])].subject_id)
cxr_hadms = set(hadms[hadms.subject_id.isin(cxr_cohort)].hadm_id)
# Intersect with the ethnicity groups so the denominators only contain cohort admissions.
white_cxr_hadms = white_hadms & cxr_hadms
black_cxr_hadms = black_hadms & cxr_hadms

# Studies that were not matched to an admission have a missing hadm_id and are ignored by the dropna inside the test.
z_cxr, p_cxr, w_cxr, b_cxr = test_disparate_censorship(cxrs, white_hadms=white_cxr_hadms, black_hadms=black_cxr_hadms)


## Disparate censorship in MIMIC: Plot

In [None]:


# One row per test: (x-axis label, White testing rate, Black testing rate, p-value).
# Rates and p-values live side by side so a label can never be paired with another test's
# p-value (the "bnp" entry previously pointed at the BMP p-value, p_bmp).
results_by_test = [
    ("cbc", w_cbc, b_cbc, p_cbc),
    ("bmp", w_bmp, b_bmp, p_bmp),
    ("cbc with\ndiff.", w_cbcdiff, b_cbcdiff, p_cbcdiff),
    ("blood\nculture", w_bc, b_bc, p_bc),
    ("cxr", w_cxr, b_cxr, p_cxr),
    ("abg", w_abg, b_abg, p_abg),
    ("troponin t", w_trop, b_trop, p_trop),
    ("bnp", w_bnp, b_bnp, p_bnp),
    ("d-dimer", w_ddimer, b_ddimer, p_ddimer),
]

# Long-format frame for seaborn: two rows per test (White first, then Black), in the order above.
test_results = pd.DataFrame(
    [
        [label, rate, race]
        for label, white_rate, black_rate, _ in results_by_test
        for rate, race in ((white_rate, "White"), (black_rate, "Black"))
    ],
    columns=["test", "test rate", "race"],
)
# p-value per test label, used for the significance annotation above each pair of bars.
pvalues = {label: pval for label, _, _, pval in results_by_test}


# Convert proportions to percentages. This mutates test_results in place, which is safe only
# because the frame is rebuilt at the top of this same cell on every run.
test_results["test rate"] *= 100
# Grouped bar chart: one pair of bars (White / Black) per test; width scales with the number of tests.
g = sns.catplot(
    data=test_results, kind="bar",
    x="test", y="test rate", hue="race",
    palette="mako", alpha=1, height=2., aspect=len(np.unique(test_results.test)) / 2.6,
    dodge=True,
)
g.set_axis_labels("Test name", "Testing rate")
plt.title("Disparate censorship in MIMIC-IV")
# Light-gray vertical separators halfway between neighbouring tests (y-extent is hard-coded).
plt.vlines(np.arange(len(np.unique(test_results.test))) + 0.5, ymin=-10, ymax=42, color="#dddddd")

# Layout constants for the significance brackets, in y-axis (percentage-point) units
# except BAR_OFFSET (x-axis units) and LW (line width).
LINE_SP = 5        # gap between the bracket and its text label
LINE_H = 8         # bracket height above the taller bar of the pair
BAR_OFFSET = 1/5   # horizontal distance from the tick to each bracket leg
LW = 0.5
TEXT_H = 15        # head-room reserved above the tallest bracket for the label
# sort=False keeps the tests in insertion order so index i lines up with the x-axis position.
for i, (name, group) in enumerate(test_results.groupby("test", sort=False)):
    white_test_rate = group[group.race=="White"]["test rate"].item()
    black_test_rate = group[group.race=="Black"]["test rate"].item()
    line_top = max(white_test_rate, black_test_rate) + LINE_H
    # Bracket: two legs rising from the bars, joined by a horizontal line.
    plt.vlines(i  - BAR_OFFSET, ymin=white_test_rate, ymax=line_top, color='black', linewidth=LW)
    plt.vlines(i + BAR_OFFSET, ymin=black_test_rate, ymax=line_top, color='black', linewidth=LW)
    plt.hlines(line_top, xmin=i-BAR_OFFSET, xmax=i+BAR_OFFSET, color='black', linewidth=LW)
    # "**" when p is below 0.05 divided by the number of tests (Bonferroni-style threshold);
    # otherwise print the p-value itself.
    plt.text(i, line_top + LINE_SP, "**" if pvalues[name] < 0.05 / len(pvalues) else f"p={pvalues[name]:.2f}", horizontalalignment='center')

plt.xlim((-0.5, len(np.unique(test_results.test)) - 0.5))
plt.ylim((0, max(test_results["test rate"]) + LINE_H + TEXT_H))
sns.move_legend(g, "upper right", bbox_to_anchor=(0.91, 1.), title="Race")
plt.savefig("mimic_testing_disparities.pdf", bbox_inches='tight')

