In [None]:
import glob

import numpy as np
import pandas as pd

# Widen pandas' display limits so large frames are shown without truncation.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", 200)
# `display` lives in IPython.display; importing it from IPython.core.display
# is deprecated and fails on recent IPython releases.
from IPython.display import HTML, display

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


import os

# rpy2 looks up R_HOME when it is imported, so this assignment must stay
# above the rpy2 imports below.
# NOTE(review): the path is specific to one conda environment - adjust it
# when running on another machine.
os.environ["R_HOME"] = "/root/miniconda3/envs/R/lib/R"

import rpy2.robjects as objects
from rpy2.robjects.packages import importr

# Handles to the R packages used from Python.
base = importr("base")
r_pROC = importr("pROC")
# First entry of R's library search path. The value is not displayed because
# this is not the last expression of the cell.
base._libPaths()[0]


def df_stats(df):
    """Print a per-column overview of ``df`` and return its first rows.

    The frame's shape is printed first, followed by a psql-style table
    (rendered with ``tabulate``) that lists, for every column, its name, the
    number of null values, the number of distinct values as counted by
    ``DataFrame.nunique`` and its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``, so the call can end a notebook cell and show a preview.
    """
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    # One record per column: (name, null count, distinct count, dtype).
    per_column = zip(
        df.columns.values.tolist(),
        df.isnull().sum().values.tolist(),
        df.nunique().values.tolist(),
        df.dtypes.tolist(),
    )
    overview = pd.DataFrame(
        list(per_column), columns=["Name", "Null", "Unique", "Dtypes"]
    )

    rendered = tabulate(overview, headers="keys", tablefmt="psql")
    print(rendered)
    return df.head()

In [None]:
def split_title_line(title_text, split_on="(", max_words=25):
    """Break a long title into newline-separated lines of limited length.

    The title is first cut in front of every occurrence of ``split_on`` (the
    delimiter stays attached to the text that follows it). Every piece is then
    wrapped so that no chunk exceeds ``max_words`` words, and finally
    consecutive chunks are packed onto the same line for as long as the line
    stays within ``max_words`` words.

    Whitespace inside the title is normalised to single spaces, and a space is
    inserted in front of a delimiter that is glued to the preceding word.

    Parameters
    ----------
    title_text : str
        Text to wrap.
    split_on : str, optional
        Delimiter in front of which a line break is allowed. An empty
        delimiter disables delimiter splitting.
    max_words : int, optional
        Maximum number of whitespace-separated words per line.

    Returns
    -------
    str
        The wrapped title with lines joined by ``"\\n"``. Always a string,
        including for titles that fit on one line and for the empty title.

    Raises
    ------
    ValueError
        If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    # Cut in front of each delimiter and give the delimiter back to the piece
    # it introduces.
    if split_on and split_on in title_text:
        head, *rest = title_text.split(split_on)
        pieces = [head] + [split_on + part for part in rest]
    else:
        pieces = [title_text]

    # Wrap every piece into chunks of at most max_words words. Empty pieces
    # (e.g. a title that starts with the delimiter) produce no chunk.
    chunks = []
    for piece in pieces:
        words = piece.split()
        for start in range(0, len(words), max_words):
            chunks.append(" ".join(words[start : start + max_words]))

    # Greedily pack consecutive chunks onto one line while the word budget
    # allows; order is preserved and no chunk is dropped or repeated.
    lines = []
    words_on_line = 0
    for chunk in chunks:
        chunk_words = len(chunk.split())
        if lines and words_on_line + chunk_words <= max_words:
            lines[-1] = " ".join([lines[-1], chunk])
            words_on_line += chunk_words
        else:
            lines.append(chunk)
            words_on_line = chunk_words

    return "\n".join(lines)

In [None]:
# Load the ECG diagnosis table. Later cells read its "Diagnosis",
# "ecg_output_path", "patientid", "AcquisitionDate" and "AcquisitionTime"
# columns.
df_output = pd.read_csv(filepath_or_buffer="data/20221002_ECG_mod_diagnosis.csv")

In [None]:
## Keep only the diagnoses that occur more than 10 times in df_output
df_output_m = df_output.groupby("Diagnosis").filter(lambda grp: len(grp) > 10)
display(df_output_m["Diagnosis"].nunique())

## Randomly sample 2 rows of every remaining diagnosis. Each group is sampled
## with the same fixed seed so the selection is reproducible.
## The groups are iterated explicitly instead of going through GroupBy.apply:
## apply on the grouping column is deprecated in pandas, and once the grouping
## column is excluded "Diagnosis" would be lost after reset_index(drop=True).
sampled_groups = [
    grp.sample(2, random_state=1) for _, grp in df_output_m.groupby("Diagnosis")
]
# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no diagnosis passed the frequency filter.
df_output_sampled = (
    pd.concat(sampled_groups) if sampled_groups else df_output_m.iloc[0:0]
).reset_index(drop=True)
display(df_output_sampled["Diagnosis"].value_counts())
# NOTE(review): the index is written as an extra unnamed column; pass
# index=False if downstream readers do not need it.
df_output_sampled.to_csv("data/20221002_ECG_mod_diagnosis_sampled_3600.csv")

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm

output_dir = "/ekg_JPEG/"
path_to_output = os.getcwd() + output_dir

# exist_ok avoids the separate existence check and the race that comes with it.
os.makedirs(path_to_output, exist_ok=True)

# Labels for the 12 subplots; assumed to match the lead order of the last
# axis of the stored arrays - TODO confirm against the export code.
lead_order = [
    "I",
    "II",
    "III",
    "aVR",
    "aVL",
    "aVF",
    "V1",
    "V2",
    "V3",
    "V4",
    "V5",
    "V6",
]
# Figure settings do not change between ECGs, so set them once.
plt.rcParams["figure.figsize"] = [16, 9]
## Do not display plot in notebook
plt.ioff()

# One image per sampled ECG: 12 stacked subplots (one per lead) with the
# diagnosis as the title of the top subplot.
for index, row in tqdm(df_output_sampled.iterrows(), total=len(df_output_sampled)):
    path = row["ecg_output_path"]
    file = np.load(path)
    file = np.reshape(file, (1, 2500, 12))

    fig, axs = plt.subplots(len(lead_order))
    axs[0].set_title(split_title_line(row["Diagnosis"]))
    for i, lead_name in enumerate(lead_order):
        axs[i].plot(file[0][:, i])
        axs[i].set(ylabel=str(lead_name))

    ## Save the figure as PNG; ":" is replaced so the time is filename-safe
    AcquisitionDateTime = (
        row["AcquisitionDate"] + "_" + row["AcquisitionTime"].replace(":", "-")
    )

    filename = f"{row['patientid']}_{AcquisitionDateTime}.png"

    file_output = path_to_output + filename

    fig.savefig(file_output, dpi=300, bbox_inches="tight")
    # Without closing, every figure stays registered with pyplot and memory
    # grows with each ECG.
    plt.close(fig)

### Preprocessing for NN

In [None]:
npyfilespath = os.getcwd() + "/ekg_waveforms_output/"
# NOTE(review): changing the working directory makes this cell fail when it is
# run a second time (the path above is then resolved inside the new cwd).
# Kept because later cells may rely on the changed directory.
os.chdir(npyfilespath)
npfiles = sorted(glob.glob("*.npy"))
all_arrays_train = []
# all_arrays_eval = []

# If trying to test model quickly use smaller total dataset or change dataloader to load npy file batch by batch
# Not rewriting stacked array so below is commented out

# Files that could not be read or reshaped; reported after the loop instead
# of being dropped silently.
skipped_npfiles = []

for npfile in npfiles:
    path = os.path.join(npyfilespath, npfile)
    try:
        file = np.load(path)
        file = np.reshape(file, (1, 2500, 12))
    except Exception as err:
        # Best effort: one unreadable or wrongly shaped file must not abort
        # the whole load.
        skipped_npfiles.append((npfile, repr(err)))
        continue

    all_arrays_train.append(file)
    # Progress report after every 100 successfully loaded files.
    if len(all_arrays_train) % 100 == 0:
        print(f"{len(all_arrays_train)} EKGs have been written to array")

if skipped_npfiles:
    print(f"{len(skipped_npfiles)} file(s) could not be loaded and were skipped")

In [None]:
# Stack the list of per-file arrays (each was reshaped to (1, 2500, 12) when
# loaded) into one ndarray, then drop the singleton axis so that `reshaped`
# has one (2500, 12) entry per EKG.
# NOTE: `all_arrays_train` is rebound from a list to an ndarray here.
all_arrays_train = np.array(all_arrays_train)
reshaped = all_arrays_train.reshape((all_arrays_train.shape[0], 2500, 12))