In [1]:
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import os
import numpy as np
import matplotlib.pyplot as plt
import statsmodels

In [2]:
def get_subjects(path):
    if "train" in os.listdir(path):
        subjects_1 = os.listdir(os.path.join(path, "train"))
        subjects_1 = [os.path.join(path, "train", x) for x in subjects_1 if ".csv" not in x]

        subjects_2 = os.listdir(os.path.join(path, "test"))
        subjects_2 = [os.path.join(path, "test", x) for x in subjects_2 if ".csv" not in x]

        subjects = subjects_1 + subjects_2
        del subjects_1
        del subjects_2
    else:
        subjects = os.listdir(path)
    return subjects


def create_combined_df(subjects):
    dfs = []
    for subj in tqdm(subjects):
        patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(subj)))
        for ts in patient_ts_files:
            file = pd.read_csv(os.path.join(subj, ts))
            file["subject_id"] = subj
            dfs.append(file)
    combined = pd.concat(dfs)
    return combined

In [3]:
path = "../data/output/"
subjects = get_subjects(path)
combined_full = create_combined_df(subjects)

100%|██████████| 47046/47046 [03:43<00:00, 210.03it/s]


In [4]:
ards_path = "../data/ards_icd"
ards_subj = get_subjects(ards_path)
combined_ards = create_combined_df(ards_subj)

100%|██████████| 435/435 [00:01<00:00, 219.10it/s]


In [5]:
combined_full["id"] = "All"
combined_ards["id"] = "ARDS"
combined_all = pd.concat([combined_full, combined_ards])
combined_all = combined_all.reset_index()

In [6]:
import pickle
with open("combined_full.pkl", "wb") as f:
    pickle.dump(combined_full, f)
with open("combined_all.pkl", "wb") as f:
    pickle.dump(combined_all, f)
with open("combined_ards.pkl", "wb") as f:
    pickle.dump(combined_ards, f)

# with open("combined_full.pkl", "rb") as f:
#     combined_full = pickle.load(f)
# with open("combined_all.pkl", "rb") as f:
#     combined_all = pickle.load(f)
# with open("combined_ards.pkl", "rb") as f:
#     combined_ards = pickle.load(f)

In [14]:
cont_dict = {
    "albumin": [1, 5, 20],
    "bicarbonate": [0, 50, 20],
    "bilirubin": [0, 20, 20],
    "creatinine": [0, 10, 20],
    "diastolic blood pressure": [25, 125, 20],
    "fraction inspired oxygen": [0, 1.1, 21],
    "glucose": [0, 600, 20],
    "heart rate": [25, 150, 25],
    "height": [140, 200, 20],
    "hematocrit": [0, 70, 20],
    "hemoglobin": [0, 30, 20],
    "lactate": [0, 15, 20],
    "mean blood pressure": [50, 150, 20],
    "oxygen saturation": [90, 101, 20],
    "partial pressure of carbon dioxide": [20, 100, 20],
    "partial pressure of oxygen": [0, 500, 20],
    "ph": [7, 7.6, 10],
    "platelets": [0, 800, 20],
    "positive end-expiratory pressure": [5, 25, 20],
    "prothrombin time": [1, 4, 20],
    "red blood cell count": [2, 6, 20],
    "red blood cell distribution width": [10, 30, 20],
    "respiratory rate": [0, 50, 20],
    "systolic blood pressure": [50, 200, 20],
    "temperature": [35, 40, 20],
    "troponin-t": [0, 5, 20],
    "urine output": [0, 700, 20],
    "weight": [0, 250, 20],
    "white blood cell count": [0, 50, 20],
    "Age": [20, 100, 20],
}

discrete_dict = {
    "pulse",
    "Ethnicity",
    "Gender",
    "vent"
}

In [12]:
os.makedirs("./image_files/", exist_ok=True)

In [15]:
# Continuous Plots
for key, value in cont_dict.items():
    x = combined_all[[key, "id"]]
    x[key] = np.clip(x[key], value[0], value[1])
    x = x.dropna()
    fig = sns.histplot(x, x=key, hue="id", bins=value[2], binrange=[value[0], value[1]], stat="proportion", common_norm=False, element="step")
    plt.savefig("./image_files/" + key + ".png")
    plt.clf()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

<Figure size 432x288 with 0 Axes>

In [17]:
df = combined_all[["Gender", "id"]]
df = df.dropna()
df.loc[df["Gender"] == 1.0, "Gender"] = "Female"
df.loc[df["Gender"] == 2.0, "Gender"] = "Male"

x, y = 'Gender', 'id'

(df
.groupby(y)[x]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x, y='percent', hue=y, kind='bar', hue_order=["All", "ARDS"]))
plt.savefig("./image_files/" + "Gender" + ".png")
plt.clf()

<Figure size 423.75x360 with 0 Axes>

In [19]:
df = combined_all[["Ethnicity", "id"]]
df = df.dropna()
df.loc[df["Ethnicity"] == 1.0, "Ethnicity"] = "Asian"
df.loc[df["Ethnicity"] == 2.0, "Ethnicity"] = "Black"
df.loc[df["Ethnicity"] == 3.0, "Ethnicity"] = "Latino"
df.loc[df["Ethnicity"] == 4.0, "Ethnicity"] = "Caucasian"
df.loc[df["Ethnicity"] == 0.0, "Ethnicity"] = "Other"

x, y = 'Ethnicity', 'id'

(df
.groupby(y)[x]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x, y='percent', hue=y, kind='bar', hue_order=["All", "ARDS"]))
plt.savefig("./image_files/" + "Ethnicity" + ".png")
plt.clf()

<Figure size 423.75x360 with 0 Axes>

In [29]:
listfile_train = "../data/in-hospital-mortality_v4/train_listfile.csv"
train_mort = pd.read_csv(listfile_train)
listfile_test = "../data/in-hospital-mortality_v4/test_listfile.csv"
test = pd.read_csv(listfile_test)
full_mort = pd.concat([train_mort, test])
full_mort["id"] = "All"
full_mort = full_mort.rename({"y_true": "mortality"}, axis=1)

listfile_train = "../data/ards_ihm/train_listfile.csv"
train_mort = pd.read_csv(listfile_train)
listfile_test = "../data/ards_ihm/test_listfile.csv"
test = pd.read_csv(listfile_test)
ards_mort = pd.concat([train_mort, test])
ards_mort["id"] = "ARDS"
ards_mort = ards_mort.rename({"y_true": "mortality"}, axis=1)

all_mort = pd.concat([full_mort, ards_mort])
all_mort = all_mort.reset_index()

In [32]:
df = all_mort[["mortality", "id"]]
df = df.dropna()

x, y = 'mortality', 'id'

(df
.groupby(y)[x]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x, y='percent', hue=y, kind='bar', hue_order=["All", "ARDS"]))
plt.savefig("./image_files/" + "mortality" + ".png")
plt.clf()

<Figure size 423.75x360 with 0 Axes>

In [34]:
df = combined_all[["pulse", "id"]]
df = df.dropna()

df.loc[df["pulse"] == 0.0, "pulse"] = "Absent"
df.loc[df["pulse"] == 1.0, "pulse"] = "Difficult"
df.loc[df["pulse"] == 2.0, "pulse"] = "Weak"
df.loc[df["pulse"] == 3.0, "pulse"] = "Doppler"

x, y = 'pulse', 'id'

(df
.groupby(y)[x]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x, y='percent', hue=y, kind='bar', hue_order=["All", "ARDS"]))
plt.savefig("./image_files/" + "pulse" + ".png")
plt.clf()

<Figure size 423.75x360 with 0 Axes>

In [36]:
df = combined_all[["vent", "id"]]
df = df.dropna()

df.loc[df["vent"] == 0.0, "vent"] = "None"
df.loc[df["vent"] == 1.0, "vent"] = "Non-Invasive"
df.loc[df["vent"] == 2.0, "vent"] = "Invasive"

x, y = 'vent', 'id'

(df
.groupby(y)[x]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x, y='percent', hue=y, kind='bar', hue_order=["All", "ARDS"]))
plt.savefig("./image_files/" + "vent" + ".png")
plt.clf()

<Figure size 423.75x360 with 0 Axes>