In [1]:
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def get_subjects(path):
    """Return the paths of the subject entries found under ``path``.

    Two directory layouts are handled:

    * ``path`` contains an entry named ``train``: subjects are collected
      from ``path/train`` followed by ``path/test``.
    * otherwise: the entries of ``path`` itself are the subjects.

    In both layouts, entries whose name contains ".csv" are skipped and every
    returned item is joined onto the directory it was listed from, so the
    result can be passed directly to ``os.listdir``.

    Parameters
    ----------
    path : str
        Root directory to scan.

    Returns
    -------
    list of str
        Subject paths, in the order ``os.listdir`` reports them.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``path`` (or ``path/test`` in the
        train/test layout) cannot be listed.
    """
    if "train" in os.listdir(path):
        roots = [os.path.join(path, "train"), os.path.join(path, "test")]
    else:
        # This branch used to return bare entry names (and kept ".csv" files),
        # which only resolved when the working directory happened to be
        # ``path``. It now matches the train/test branch.
        roots = [path]

    subjects = []
    for root in roots:
        subjects.extend(
            os.path.join(root, name)
            for name in os.listdir(root)
            if ".csv" not in name
        )
    return subjects


def create_combined_df(subjects):
    """Read every timeseries CSV of every subject into a single DataFrame.

    For each entry of ``subjects`` the directory is listed, and every file
    whose name contains "timeseries" is read with ``pd.read_csv``. Each loaded
    frame gets a ``subject_id`` column holding the subject entry exactly as
    it was passed in (i.e. the path string, not a parsed identifier).

    Parameters
    ----------
    subjects : iterable of str
        Subject directories, e.g. the result of ``get_subjects``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded frames. The per-file indices
        are kept as-is, so index values repeat across files.

    Raises
    ------
    ValueError
        If no timeseries file was found for any subject.
    """
    frames = []
    for subj in tqdm(subjects):
        for fname in os.listdir(subj):
            if "timeseries" not in fname:
                continue
            frame = pd.read_csv(os.path.join(subj, fname))
            frame["subject_id"] = subj
            frames.append(frame)

    if not frames:
        # pd.concat([]) would raise the same exception type, but with a
        # message that does not point at the actual cause.
        raise ValueError("no timeseries files found for the given subjects")
    return pd.concat(frames)

In [3]:
# Load the timeseries of every subject under the main output directory.
# This is the slow cell of the notebook: the progress bar saved with it shows
# 47046 subjects processed in roughly five and a half minutes.
path = "../data/output/"
subjects = get_subjects(path)
combined_full = create_combined_df(subjects)

100%|██████████| 47046/47046 [05:28<00:00, 143.39it/s]


In [4]:
# Same loading step for the ARDS directory (435 subjects according to the
# progress bar saved with this cell).
ards_path = "../data/ards_icd"
ards_subj = get_subjects(ards_path)
combined_ards = create_combined_df(ards_subj)

100%|██████████| 435/435 [00:01<00:00, 239.04it/s]


In [5]:
# Tag each frame with a cohort marker before stacking them:
# id == 0 -> rows loaded from the full output directory,
# id == 1 -> rows loaded from the ARDS directory.
# Note: the "id" column is written into combined_full / combined_ards
# themselves, not only into the stacked copy.
combined_full["id"] = 0
combined_ards["id"] = 1
combined_all = pd.concat([combined_full, combined_ards])

In [6]:
# Limit creatinine in the stacked frame to the closed range [0, 10].
combined_all["creatinine"] = combined_all["creatinine"].clip(lower=0, upper=10)

In [7]:
# Continuous variables to plot, keyed by column name.
# Each value is a three-element list. The density-plot loop in this notebook
# reads element 0 and element 1 as the lower / upper clip bounds for the
# column; element 2 is not read by any cell visible here (presumably a
# bin or tick step -- TODO confirm before relying on it).
cont_dict = {
    "albumin": [0, 10, 1],
    "bicarbonate": [0, 50, 5],
    "bilirubin": [0, 20, 1],
    "creatinine": [0, 10, 1],
    "diastolic blood pressure": [0, 200, 25],
    "fraction inspired oxygen": [0, 1.1, 0.1],
    "glucose": [0, 600, 50],
    "heart rate": [0, 200, 10],
    "height": [120, 200, 10],
    "hematocrit": [0, 70, 5],
    "hemoglobin": [0, 30, 2],
    "lactate": [0, 15, 1],
    "mean blood pressure": [0, 200, 10],
    "oxygen saturation": [90, 101, 1],
    "partial pressure of carbon dioxide": [0, 100, 10],
    "partial pressure of oxygen": [0, 500, 50],
    "ph": [6, 8, 0.1],
    "platelets": [0, 800, 25],
    "positive end-expiratory pressure": [0, 30, 3],
    "prothrombin time": [0, 4, 0.2],
    "red blood cell count": [0, 8, 1],
    "red blood cell distribution width": [0, 40, 2],
    "respiratory rate": [0, 50, 5],
    "systolic blood pressure": [0, 200, 25],
    "temperature": [34, 40, 1],
    "troponin-t": [0, 5, 0.5],
    "urine output": [0, 700, 100],
    "weight": [0, 300, 20],
    "white blood cell count": [0, 50, 5],
    "Age": [20, 100, 10],
}

# Names of the categorical columns.
# NOTE(review): despite the name this literal is a set, not a dict, and no
# cell visible in this notebook reads it -- the bar-plot cells hard-code the
# column names instead.
discrete_dict = {
    "pulse",
    "Ethnicity",
    "Gender",
    "vent"
}

In [3]:
# Directory the plotting cells save their PNG files into; exist_ok=True keeps
# this cell safe to re-run when the directory is already there.
os.makedirs("./image_files/", exist_ok=True)

In [40]:
# One density plot per continuous variable, comparing the full frame (red)
# with the ARDS frame (blue). Each figure is written to
# ./image_files/<column name>.png.
sns.set(style="darkgrid")  # theme does not depend on the loop variable

for key, value in cont_dict.items():
    lower, upper = value[0], value[1]

    # Values outside [lower, upper] are set to the nearest bound. As before,
    # the clipped column is written back into both source frames.
    combined_full[key] = np.clip(combined_full[key], lower, upper)
    combined_ards[key] = np.clip(combined_ards[key], lower, upper)

    fig, ax = plt.subplots()
    # Labels are attached to the curves themselves so the legend cannot
    # mislabel a curve if the other one ends up not being drawn.
    sns.kdeplot(x=np.array(combined_full[key]), color="r", label="Full", ax=ax)
    sns.kdeplot(x=np.array(combined_ards[key]), color="b", label="ARDS", ax=ax)

    # The label must be a plain string; passing [key] rendered the list repr
    # (e.g. "['albumin']") on the axis.
    ax.set_xlabel(key, fontsize=10)
    ax.legend(title='', loc='upper right')

    fig.savefig("./image_files/" + key + ".png")
    plt.close(fig)  # release the figure instead of leaving an empty one behind

<Figure size 432x288 with 0 Axes>

In [73]:
import matplotlib.patches as mpatches

# Relative frequency of each Gender value within each cohort (id 0 / id 1),
# drawn as a grouped bar chart and saved to ./image_files/Gender.png.
x, y, hue = "Gender", "proportion", "id"

sns.set(style="darkgrid")

# Per-cohort normalised counts, flattened into plain columns for plotting.
proportions = (
    combined_all[x]
    .groupby(combined_all[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=["red", "blue"])

# Legend built from explicit colour patches, one per cohort.
legend_handles = [mpatches.Patch(color=colour) for colour in ("red", "blue")]
plt.legend(
    handles=legend_handles,
    labels=['All', 'ARDS'],
    labelcolor=["red", "blue"],
    title='',
    loc='upper left',
)

plt.savefig("./image_files/Gender.png")
plt.clf()

<Figure size 432x288 with 0 Axes>

In [78]:
import matplotlib.patches as mpatches

# Relative frequency of each Ethnicity value within each cohort (id 0 / id 1),
# drawn as a grouped bar chart and saved to ./image_files/Ethnicity.png.
x, y, hue = "Ethnicity", "proportion", "id"

sns.set(style="darkgrid")

# Per-cohort normalised counts, flattened into plain columns for plotting.
proportions = (
    combined_all[x]
    .groupby(combined_all[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=["red", "blue"])

# Legend built from explicit colour patches, one per cohort.
legend_handles = [mpatches.Patch(color=colour) for colour in ("red", "blue")]
plt.legend(
    handles=legend_handles,
    labels=['All', 'ARDS'],
    labelcolor=["red", "blue"],
    title='',
    loc='upper left',
)

plt.savefig("./image_files/Ethnicity.png")
plt.clf()

<Figure size 432x288 with 0 Axes>

In [79]:
import matplotlib.patches as mpatches

# Relative frequency of each pulse value within each cohort (id 0 / id 1),
# drawn as a grouped bar chart and saved to ./image_files/pulse.png.
x, y, hue = "pulse", "proportion", "id"

sns.set(style="darkgrid")

# Per-cohort normalised counts, flattened into plain columns for plotting.
proportions = (
    combined_all[x]
    .groupby(combined_all[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=["red", "blue"])

# Legend built from explicit colour patches, one per cohort.
legend_handles = [mpatches.Patch(color=colour) for colour in ("red", "blue")]
plt.legend(
    handles=legend_handles,
    labels=['All', 'ARDS'],
    labelcolor=["red", "blue"],
    title='',
    loc='upper left',
)

plt.savefig("./image_files/pulse.png")
plt.clf()

<Figure size 432x288 with 0 Axes>

In [15]:
# Build one mortality table per cohort from its train + test listfiles, tag it
# with the cohort marker, rename the label column, then stack both tables.

# Cohort marker 0: listfiles under in-hospital-mortality_v4.
listfile_train = "../data/in-hospital-mortality_v4/train_listfile.csv"
listfile_test = "../data/in-hospital-mortality_v4/test_listfile.csv"
train_mort = pd.read_csv(listfile_train)
test = pd.read_csv(listfile_test)
full_mort = (
    pd.concat([train_mort, test])
    .assign(id=0)
    .rename(columns={"y_true": "mortality"})
)

# Cohort marker 1: listfiles under ards_ihm (same variable names are reused).
listfile_train = "../data/ards_ihm/train_listfile.csv"
listfile_test = "../data/ards_ihm/test_listfile.csv"
train_mort = pd.read_csv(listfile_train)
test = pd.read_csv(listfile_test)
ards_mort = (
    pd.concat([train_mort, test])
    .assign(id=1)
    .rename(columns={"y_true": "mortality"})
)

all_mort = pd.concat([full_mort, ards_mort])

In [20]:
import matplotlib.patches as mpatches

# Relative frequency of each mortality value within each cohort (id 0 / id 1),
# drawn as a grouped bar chart and saved to ./image_files/mortality.png.
x, y, hue = "mortality", "proportion", "id"

sns.set(style="darkgrid")

# Per-cohort normalised counts, flattened into plain columns for plotting.
proportions = (
    all_mort[x]
    .groupby(all_mort[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=["red", "blue"])

# Legend built from explicit colour patches, one per cohort.
legend_handles = [mpatches.Patch(color=colour) for colour in ("red", "blue")]
plt.legend(
    handles=legend_handles,
    labels=['All', 'ARDS'],
    labelcolor=["red", "blue"],
    title='',
    loc='upper left',
)

plt.savefig("./image_files/mortality.png")
plt.clf()

<Figure size 432x288 with 0 Axes>

In [9]:
import matplotlib.patches as mpatches

# Relative frequency of each vent value within each cohort (id 0 / id 1),
# drawn as a grouped bar chart and saved to ./image_files/vent.png.
x, y, hue = "vent", "proportion", "id"

sns.set(style="darkgrid")

# Per-cohort normalised counts, flattened into plain columns for plotting.
proportions = (
    combined_all[x]
    .groupby(combined_all[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=["red", "blue"])

# Legend built from explicit colour patches, one per cohort.
legend_handles = [mpatches.Patch(color=colour) for colour in ("red", "blue")]
plt.legend(
    handles=legend_handles,
    labels=['All', 'ARDS'],
    labelcolor=["red", "blue"],
    title='',
    loc='upper left',
)

plt.savefig("./image_files/vent.png")
plt.clf()

<Figure size 432x288 with 0 Axes>