In [1]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("max_colwidth", 200)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))


import os

os.environ["R_HOME"] = "/root/miniconda3/envs/R/lib/R"

import rpy2.robjects as objects
from rpy2.robjects.packages import importr

base = importr("base")
r_pROC = importr("pROC")
base._libPaths()[0]


def df_stats(df):
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(
        list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"]
    )
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

  from IPython.core.display import HTML, display


In [2]:
# this is muse_xml_to_array.py
# Input a directory of XML files, get a directory of np arrays where each .npy is a 12-lead ecg shape 2500,12,1. So this gives you JUST the waveforms
# Some notes, the unique ECG ID index key used in MUSE backend does not exist in the XML at least for us, so instead we use MRN_AcquisitionDTTM_PharmaUniqueECGID

# In terminal run python3 muse_xml_to_array.py <LOCATION_OF_XML_FILES>

import argparse
import base64
import os
import struct
import sys

import numpy as np
import pandas as pd
import xmltodict


def file_path(path):
    filepath = path
    for dirName, subdirList, fileList in os.walk(filepath):
        for filename in fileList:
            if ".xml" in filename.lower():
                ekg_file_list.append(os.path.join(dirName, filename))


# need to update this function to check the output directory for the output file and then only on newly added EKGs
# add timestamp to start file string
# this is annoying because the XML file name is a random timestamp and the output file is the UniqueECGID


if not os.path.exists(os.getcwd() + "/ekg_waveforms_output/"):
    os.mkdir(os.getcwd() + "/ekg_waveforms_output/")

# parser = argparse.ArgumentParser(description='Input and outputs for XML EKG parsing')
# parser.add_argument('input', type=str)
# parser.set_defaults(output=os.getcwd() + '/ekg_waveforms_output/') #ensure this directory already exists

# args = parser.parse_args()


def decode_ekg_muse(raw_wave):
    """
    Ingest the base64 encoded waveforms and transform to numeric
    """
    # covert the waveform from base64 to byte array
    arr = base64.b64decode(bytes(raw_wave, "utf-8"))

    # unpack every 2 bytes, little endian (16 bit encoding)
    unpack_symbols = "".join([char * (len(arr) // 2) for char in "h"])
    byte_array = struct.unpack(unpack_symbols, arr)
    return byte_array


def decode_ekg_muse_to_array(raw_wave, downsample=1):
    """
    Ingest the base64 encoded waveforms and transform to numeric

    downsample: 0.5 takes every other value in the array. Muse samples at 500/s and the sample model requires 250/s. So take every other.
    """
    try:
        dwnsmpl = int(1 // downsample)
    except ZeroDivisionError:
        print("You must downsample by more than 0")
    # covert the waveform from base64 to byte array
    arr = base64.b64decode(bytes(raw_wave, "utf-8"))

    # unpack every 2 bytes, little endian (16 bit encoding)
    unpack_symbols = "".join([char * int(len(arr) / 2) for char in "h"])
    byte_array = struct.unpack(unpack_symbols, arr)
    return np.array(byte_array)[::dwnsmpl]


def xml_to_np_array_file(path_to_xml, path_to_output=os.getcwd()):

    with open(path_to_xml, "rb") as fd:
        dic = xmltodict.parse(fd.read().decode("utf8"))

    """
    
    Upload the ECG as numpy array with shape=[2500,12,1] ([time, leads, 1]).

    The voltage unit should be in 1 mv/unit and the sampling rate should be 250/second (total 10 second).

    The leads should be ordered as follow I, II, III, aVR, aVL, aVF, V1, V2, V3, V4, V5, V6.

    """
    # print(dic)
    try:
        pt_id = dic["RestingECG"]["PatientDemographics"]["PatientID"]
    except:
        print("no PatientID")
        pt_id = "none"
    try:
        AcquisitionDateTime = (
            dic["RestingECG"]["TestDemographics"]["AcquisitionDate"]
            + "_"
            + dic["RestingECG"]["TestDemographics"]["AcquisitionTime"].replace(":", "-")
        )
    except:
        print("no AcquisitionDateTime")
        AcquisitionDateTime = "none"

    # try:
    #     requisition_number = dic['RestingECG']['Order']['RequisitionNumber']
    # except:
    #     print("no requisition_number")
    #     requisition_number = "none"

    # need to instantiate leads in the proper order for the model
    lead_order = [
        "I",
        "II",
        "III",
        "aVR",
        "aVL",
        "aVF",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6",
    ]

    """
    Each EKG will have this data structure:
    lead_data = {
        'I': np.array
    }
    """

    lead_data = dict.fromkeys(lead_order)
    # lead_data = {leadid: None for k in lead_order}

    #     for all_lead_data in dic['RestingECG']['Waveform']:
    #         for single_lead_data in lead['LeadData']:
    #             leadname =  single_lead_data['LeadID']
    #             if leadname in (lead_order):
    #try:
    for lead in dic["RestingECG"]["Waveform"]:
        for leadid in range(len(lead["LeadData"])):
            sample_length = len(
                decode_ekg_muse_to_array(lead["LeadData"][leadid]["WaveFormData"])
            )
            # sample_length is equivalent to dic['RestingECG']['Waveform']['LeadData']['LeadSampleCountTotal']
            if sample_length == 5000:
                lead_data[
                    lead["LeadData"][leadid]["LeadID"]
                ] = decode_ekg_muse_to_array(
                    lead["LeadData"][leadid]["WaveFormData"], downsample=0.5
                )
            elif sample_length == 2500:
                lead_data[
                    lead["LeadData"][leadid]["LeadID"]
                ] = decode_ekg_muse_to_array(
                    lead["LeadData"][leadid]["WaveFormData"], downsample=1
                )
            else:
                continue
        # ensures all leads have 2500 samples and also passes over the 3 second waveform

    lead_data["III"] = np.array(lead_data["II"]) - np.array(lead_data["I"])
    lead_data["aVR"] = -(np.array(lead_data["I"]) + np.array(lead_data["II"])) / 2
    lead_data["aVF"] = (np.array(lead_data["II"]) + np.array(lead_data["III"])) / 2
    lead_data["aVL"] = (np.array(lead_data["I"]) - np.array(lead_data["III"])) / 2

    lead_data = {k: lead_data[k] for k in lead_order}
    # drops V3R, V4R, and V7 if it was a 15-lead ECG

    # now construct and reshape the array
    # converting the dictionary to an np.array
    temp = []
    for key, value in lead_data.items():
        temp.append(value)

    # transpose to be [time, leads, ]
    ekg_array = np.array(temp).T

    # expand dims to [time, leads, 1]
    ekg_array = np.expand_dims(ekg_array, axis=-1)

    # Here is a check to make sure all the model inputs are the right shape
    #     assert ekg_array.shape == (2500, 12, 1), "ekg_array is shape {} not (2500, 12, 1)".format(ekg_array.shape )

    # filename = '/ekg_waveform_{}_{}.npy'.format(pt_id, requisition_number)
    filename = f"{pt_id}_{AcquisitionDateTime}.npy"

    path_to_output += filename
    # print(path_to_output)
    with open(path_to_output, "wb") as f:
        np.save(f, ekg_array)
    return path_to_output

    #except:
    #    print("error", dic)
    #    return None


def ekg_batch_run(ekg_list):
    i = 0
    x = 0
    for file in ekg_list:
        try:
            xml_to_np_array_file(file, output_dir)
            i += 1
        except Exception as e:
            # print("file failed: ", file)
            print(file, e)
            x += 1
        if i % 10000 == 0:
            print(f"Succesfully converted {i} EKGs, failed converting {x} EKGs")

In [3]:
def generate_ecg_dataframe_and_npy(df, output_dir="/ekg_waveforms_output/"):
    output_dir = os.getcwd() + output_dir

    from ECGXMLReader import ECGXMLReader
    from tqdm import tqdm

    patientid_list = []
    patientage_list = []
    patient_date_of_birth_list = []
    patient_gender_list = []
    patient_VentricularRate_list = []
    patient_AtrialRate_list = []
    patient_PRInterval_list = []
    patient_QRSDuration_list = []
    patient_QTInterval_list = []
    patient_QTCorrected_list = []
    patient_Paxis_list = []
    patient_Raxis_list = []
    patient_TAxis_list = []
    patient_QRSCount_list = []
    patient_QOnset_list = []
    patient_QOffset_list = []
    patient_POnset_list = []
    patient_POffset_list = []
    patient_TOffset_list = []
    patient_ECGSampleBase_list = []
    patient_ECGSampleExponent_list = []
    patient_QTcFrederica_list = []
    patient_Location_list = []
    patient_LocatioName_list = []
    patient_RoomID_list = []
    patient_acquisitiondate_list = []
    patient_acquisitiontime_list = []
    patient_status_list = []
    patient_acquisitiondevice_list = []
    patient_referringMDLastName_list = []
    patient_AnalysisSoftware_list = []
    patient_acquisitionSoftwareVersion_list = []
    diagnosis_list = []
    original_diagnosis_list = []
    ecg_output_path_list = []
    xml_path_list = []

    for index, row in tqdm(df.iterrows()):
        ecg = ECGXMLReader(row["path"], augmentLeads=True)
        xml_path_list.append(row["path"])
        patientid_list.append(ecg.PatientDemographics["PatientID"])

        try:
            patientage_list.append(ecg.PatientDemographics["PatientAge"])
        except:
            patientage_list.append(np.nan)

        try:
            patient_date_of_birth_list.append(ecg.PatientDemographics["DateofBirth"])
        except:
            patient_date_of_birth_list.append(np.nan)
        try:
            patient_gender_list.append(ecg.PatientDemographics["Gender"])
        except:
            patient_gender_list.append(np.nan)

        try:
            patient_VentricularRate_list.append(
                ecg.RestingECGMeasurements["VentricularRate"]
            )
        except:
            patient_VentricularRate_list.append(np.nan)

        try:
            patient_AtrialRate_list.append(ecg.RestingECGMeasurements["AtrialRate"])
        except:
            patient_AtrialRate_list.append(np.nan)
        try:
            patient_PRInterval_list.append(ecg.RestingECGMeasurements["PRInterval"])
        except:
            patient_PRInterval_list.append(np.nan)
        try:
            patient_QRSDuration_list.append(ecg.RestingECGMeasurements["QRSDuration"])
        except:
            patient_QRSDuration_list.append(np.nan)
        try:
            patient_QTInterval_list.append(ecg.RestingECGMeasurements["QTInterval"])
        except:
            patient_QTInterval_list.append(np.nan)

        try:
            patient_QTCorrected_list.append(ecg.RestingECGMeasurements["QTCorrected"])
        except:
            patient_QTCorrected_list.append(np.nan)
        try:
            patient_Paxis_list.append(ecg.RestingECGMeasurements["PAxis"])
        except:
            patient_Paxis_list.append(np.nan)

        try:
            patient_Raxis_list.append(ecg.RestingECGMeasurements["RAxis"])
        except:
            patient_Raxis_list.append(np.nan)
        try:
            patient_TAxis_list.append(ecg.RestingECGMeasurements["TAxis"])
        except:
            patient_TAxis_list.append(np.nan)

        try:
            patient_QRSCount_list.append(ecg.RestingECGMeasurements["QRSCount"])
        except:
            patient_QRSCount_list.append(np.nan)
        try:
            patient_QOnset_list.append(ecg.RestingECGMeasurements["QOnset"])
        except:
            patient_QOnset_list.append(np.nan)

        try:
            patient_QOffset_list.append(ecg.RestingECGMeasurements["QOffset"])
        except:
            patient_QOffset_list.append(np.nan)
        try:
            patient_POnset_list.append(ecg.RestingECGMeasurements["POnset"])
        except:
            patient_POnset_list.append(np.nan)
        try:
            patient_POffset_list.append(ecg.RestingECGMeasurements["POffset"])
        except:
            patient_POffset_list.append(np.nan)
        try:
            patient_TOffset_list.append(ecg.RestingECGMeasurements["TOffset"])
        except:
            patient_TOffset_list.append(np.nan)

        patient_ECGSampleBase_list.append(ecg.RestingECGMeasurements["ECGSampleBase"])
        patient_ECGSampleExponent_list.append(
            ecg.RestingECGMeasurements["ECGSampleExponent"]
        )
        try:
            patient_QTcFrederica_list.append(ecg.RestingECGMeasurements["QTcFrederica"])
        except:
            patient_QTcFrederica_list.append(np.nan)

        patient_Location_list.append(ecg.TestDemographics["Location"])

        try:
            patient_LocatioName_list.append(ecg.TestDemographics["LocationName"])
        except:
            patient_LocatioName_list.append(np.nan)

        try:
            patient_RoomID_list.append(ecg.TestDemographics["RoomID"])
        except:
            patient_RoomID_list.append("None")
        try:
            patient_acquisitiondate_list.append(ecg.TestDemographics["AcquisitionDate"])
        except:
            patient_acquisitiondate_list.append(np.nan)

        try:
            patient_acquisitiontime_list.append(ecg.TestDemographics["AcquisitionTime"])
        except:
            patient_acquisitiontime_list.append(np.nan)

        patient_status_list.append(ecg.TestDemographics["Status"])

        try:
            patient_acquisitiondevice_list.append(
                ecg.TestDemographics["AcquisitionDevice"]
            )
        except:
            patient_acquisitiondevice_list.append(np.nan)

        try:
            patient_referringMDLastName_list.append(
                ecg.TestDemographics["ReferringMDLastName"]
            )
        except:
            patient_referringMDLastName_list.append("None")
        try:
            patient_AnalysisSoftware_list.append(
                ecg.TestDemographics["AnalysisSoftwareVersion"]
            )
        except:
            patient_AnalysisSoftware_list.append(np.nan)
        try:
            patient_acquisitionSoftwareVersion_list.append(
                ecg.TestDemographics["AcquisitionSoftwareVersion"]
            )
        except:
            patient_acquisitionSoftwareVersion_list.append(np.nan)

        diagnosis = []
        print(ecg)
        try:
            for key in ecg.Diagnosis["DiagnosisStatement"]:
                # print(key['StmtText'])
                try:
                    diagnosis.append(key["StmtText"])
                except:
                    diagnosis.append(key["ENDSLINE"])

            ##merge items in diagnosis list into a single string
            diagnosis = " ".join(diagnosis)

            diagnosis_list.append(diagnosis)
        except:
            print(ecg.TestDemographics)
            print(ecg.PatientDemographics)
            print(ecg.RestingECGMeasurements)
            print(ecg.PatientDemographics["PatientID"])
            diagnosis_list.append(-1)

        diagnosis = []
        try:
            for key in ecg.OriginalDiagnosis["DiagnosisStatement"]:
                # print(key['StmtText'])
                try:
                    diagnosis.append(key["StmtText"])
                except:
                    diagnosis.append(key["ENDSLINE"])

            ##merge items in diagnosis list into a single string
            diagnosis = " ".join(diagnosis)

            original_diagnosis_list.append(diagnosis)
        except:
            print(ecg.TestDemographics)
            print(ecg.PatientDemographics)
            print(ecg.RestingECGMeasurements)
            print(ecg.PatientDemographics["PatientID"])
            original_diagnosis_list.append(-1)

            # display(ecg.Diagnosis['DiagnosisStatement'])
            # break
        # print(ecg.TestDemographics)
        # print(ecg.PatientDemographics)
        # print(ecg.RestingECGMeasurements)
        # print(ecg.Diagnosis)
        # print(ecg.OriginalDiagnosis)
        ecg_output_path = xml_to_np_array_file(row["path"], output_dir)
        ecg_output_path_list.append(ecg_output_path)
    ##Create dataaframe with the previous lists
    df_output = pd.DataFrame(
        {
            "patientid": patientid_list,
            "age": patientage_list,
            "dob": patient_date_of_birth_list,
            "gender": patient_gender_list,
            "VentricularRate": patient_VentricularRate_list,
            "AtrialRate": patient_AtrialRate_list,
            "PRInterval": patient_PRInterval_list,
            "QRSDuration": patient_QRSDuration_list,
            "QTInterval": patient_QTInterval_list,
            "QTCorrected": patient_QTCorrected_list,
            "PAxis": patient_Paxis_list,
            "RAXis": patient_Raxis_list,
            "TAxis": patient_TAxis_list,
            "QRSCount": patient_QRSCount_list,
            "QOnset": patient_QOnset_list,
            "QOffset": patient_QOffset_list,
            "POnset": patient_POnset_list,
            "POffset": patient_POffset_list,
            "TOffset": patient_TOffset_list,
            "ECGSampleBase": patient_ECGSampleBase_list,
            "ECGSampleExponent": patient_ECGSampleExponent_list,
            "QTcFrederica": patient_QTcFrederica_list,
            "Location": patient_Location_list,
            "LocationName": patient_LocatioName_list,
            "RoomID": patient_RoomID_list,
            "AcquisitionDate": patient_acquisitiondate_list,
            "AcquisitionTime": patient_acquisitiontime_list,
            "Status": patient_status_list,
            "AcquisitionDevice": patient_acquisitiondevice_list,
            "ReferringMDLastName": patient_referringMDLastName_list,
            "AnalysisSoftware": patient_AnalysisSoftware_list,
            "AcquisitionSoftwareVersion": patient_acquisitionSoftwareVersion_list,
            "Diagnosis": diagnosis_list,
            "Original_Diagnosis": original_diagnosis_list,
            "xml_path": xml_path_list,
            "ecg_output_path": ecg_output_path_list,
        }
    )
    return df_output

### Get all files

In [4]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("max_colwidth", 200)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))


import os

os.environ["R_HOME"] = "/root/miniconda3/envs/R/lib/R"

import rpy2.robjects as objects
from rpy2.robjects.packages import importr

base = importr("base")
r_pROC = importr("pROC")
base._libPaths()[0]


def df_stats(df):
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(
        list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"]
    )
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

  from IPython.core.display import HTML, display


In [5]:
### List all files in '/media/data1/muse_ge/ecg_retrospective' ending in XML and add them to list
#import glob
#import os

## Get all fins in directory 'media/data1/muse_ge/ecg_retrospective' ending in .XML
#path = "/media/data1/muse_ge/ecg_retrospective"
#all_files = glob.glob(os.path.join(path, "*.xml"))
### Create dataframe with all ECG files
#df = pd.DataFrame(all_files, columns=["path"])
#display(df)

# 1711846 files as of 2022

In [None]:
df = pd.read_csv("data/20230313_ECG_path.csv")

In [None]:
display(df_describe.())

In [6]:
# df_output['ecg_abnormal'] = np.where(df_output['Diagnosis'].str.contains('ECG anormal'), 1, np.where(df_output['Diagnosis'].str.contains('ECG normal'), 0, -1))
# Remove ECG anormal and ECG normal from diagnosis
# df_output['Diagnosis'] = df_output['Diagnosis'].str.replace('ECG anormal', '')
# df_output['Original_Diagnosis'] = df_output['Original_Diagnosis'].str.replace('ECG normal', '')
# df_output['Original_Diagnosis'] = df_output['Diagnosis'].str.replace('ECG anormal', '')
# df_output['Diagnosis'] = df_output['Original_Diagnosis'].str.replace('ECG normal', '')
# df_output.to_csv('data/20221002_ECG_mod_diagnosis.csv')

In [None]:
df_m = (
    df_output.groupby(["patientid", "AcquisitionDate", "AcquisitionTime"])
    .first()
    .reset_index()
)
display(df_stats(df_output))
display(df_stats(df_m))
## The ECGs can be grouped by patient id, date and time to have a 1 unique row per ECG - this means the filename to save the ECG also needs to have the date and time in it.

In [None]:
# output_dir = os.getcwd() + '/ekg_waveforms_output/'
# ekg_batch_run(df['path'][0:1])

In [None]:
## Display top 1000 most frequent df_output['Diagnosis]
display(df_output["Diagnosis"].value_counts()[0:1000])

### Alexis Script

In [1]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("max_colwidth", 200)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))


import os

os.environ["R_HOME"] = "/root/miniconda3/envs/R/lib/R"

import rpy2.robjects as objects
from rpy2.robjects.packages import importr

base = importr("base")
r_pROC = importr("pROC")
base._libPaths()[0]


def df_stats(df):
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(
        list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"]
    )
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

  from IPython.core.display import HTML, display


In [14]:
df = pd.read_csv("data/20230313_ECG_path.csv")

In [None]:
import CLI_xml2df as xml2df

df_output = xml2df.tinyxml2df(
    df["path"], out_path="data/ekg_waveforms_output/"
).read2flatten()

Transforming xml files into dict:   0%|                                                                                                                                                                      | 46/1633856 [00:06<31:43:21, 14.31it/s]

error {'RestingECG': {'MuseInfo': {'MuseVersion': '9.0.10.18530'}, 'PatientDemographics': {'PatientID': '0799999', 'PatientAge': '81', 'AgeUnits': 'YEARS', 'DateofBirth': '06-21-1940', 'Gender': 'MALE', 'PatientLastName': 'MEDISOL', 'PatientFirstName': 'TEST'}, 'TestDemographics': {'DataType': 'RESTING', 'Site': '1', 'SiteName': 'INSTITUT DE CARDIOLOGIE DE MTL', 'Status': 'CONFIRMED', 'EditListStatus': 'Confirmed', 'Priority': 'NORMAL', 'Location': '1', 'LocationName': '1_CARDIOLOGIE GENERALE', 'AcquisitionTime': '14:30:57', 'AcquisitionDate': '11-01-2021', 'EditTime': '10:57:44', 'EditDate': '11-02-2021', 'OverreaderID': '20001', 'EditorID': '4846', 'OrderingMDLastName': 'Md I.C.M.', 'OrderingMDFirstName': 'A Confirmer', 'ReferringMDLastName': 'Md I.C.M.', 'ReferringMDFirstName': 'A Confirmer', 'OverreaderLastName': 'MuseAdmin', 'OverreaderFirstName': 'MuseAdmin', 'EditorLastName': 'PELLERIN', 'EditorFirstName': 'SANDRA', 'HISStatus': '2'}, 'Order': {'RequisitionNumber': '21109084', '

Transforming xml files into dict:   0%|                                                                                                                                                                     | 1086/1633856 [00:36<8:55:36, 50.81it/s]

In [None]:
df_output.to_csv('data/20230328_ECG_path_diagnosis_fixed_with_NPY.csv')

In [5]:
display(df_output.columns)

Index(['RestingECG_MuseInfo_MuseVersion', 'RestingECG_PatientDemographics_PatientID', 'RestingECG_PatientDemographics_PatientAge', 'RestingECG_PatientDemographics_AgeUnits', 'RestingECG_PatientDemographics_DateofBirth', 'RestingECG_PatientDemographics_Gender', 'RestingECG_PatientDemographics_PatientLastName', 'RestingECG_PatientDemographics_PatientFirstName', 'RestingECG_TestDemographics_DataType', 'RestingECG_TestDemographics_Site',
       ...

In [None]:
npy_array = np.load(df_output["npy_path"][0])
# Transpose the array to shape=[12, 2500]
ecg_transposed = np.transpose(npy_array, (1, 0, 2))
ecg_transposed = ecg_transposed.reshape(12, 2500)

import ecg_plot

ecg_plot.plot(ecg_transposed, sample_rate=250, title="ECG 12")
ecg_plot.show()

In [None]:
## Plot line of ecg_transposed[0] and ecg_transposed[1]
import matplotlib.pyplot as plt

plt.plot(ecg_transposed[0])
plt.plot(ecg_transposed[1])
plt.show()

### Propagate diagnoses labels

In [1]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("max_colwidth", 200)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import HTML, display


In [50]:
import re

import pandas as pd
from unidecode import unidecode


def normalize_string(s):
    s = unidecode(s)
    s = s.strip()
    s = re.sub(r"\W+", " ", s)
    return s


def contains_eligible_diagnosis(normalized_diag, eligible_diagnoses):
    for diagnosis in eligible_diagnoses:
        if diagnosis in normalized_diag:
            return diagnosis
            break
    return False


def filter_ecg_by_diagnosis(df_ecg, df_ecg_sampled):
    df_ecg["Normalized_Diag"] = df_ecg["Original_Diag"].apply(normalize_string)
    df_ecg_sampled["Normalized_Diagnoses"] = df_ecg_sampled["Original_Diagnosis"].apply(
        normalize_string
    )

    eligible_diagnoses = set(df_ecg_sampled["Normalized_Diagnoses"])

    df_ecg["Contains_Eligible_Diagnosis"] = df_ecg["Normalized_Diag"].apply(
        lambda x: contains_eligible_diagnosis(x, eligible_diagnoses)
    )

    return df_ecg

In [6]:
df_ecg = pd.read_parquet(
    "/media/data1/ravram/DeepECG/ekg_waveforms_output/df_xml_2023_03_14_n_1633856.parquet"
)

In [12]:
display(df_ecg.RestingECG_QRSTimesTypes_QRS_27_Time.value_counts())

2416    15431
2431    13628
2432    13219
2273    12769
9114    11515
        ...  
1642        1
2002        1
2098        1
1749        1
1831        1
Name: RestingECG_QRSTimesTypes_QRS_27_Time, Length: 785, dtype: int64

In [9]:
columns_in_df1 = set(df_output.columns)
columns_in_df2 = set(df_ecg.columns)
# Find the columns present in df1 but not in df2
columns_not_in_df2 = columns_in_df2.difference(columns_in_df1)

print("Columns in df1 that are not in df2:", columns_not_in_df2)

Columns in df1 that are not in df2: {'Diag', 'RestingECG_Diagnosis_DiagnosisStatement_18_StmtFlag_0', 'RestingECG_OriginalDiagnosis_DiagnosisStatement_5_StmtFlag_0', 'RestingECG_QRSTimesTypes_QRS_21_Time', 'RestingECG_QRSTimesTypes_QRS_27_Number', 'RestingECG_Diagnosis_DiagnosisStatement_12_StmtText', 'RestingECG_Diagnosis_DiagnosisStatement_7_StmtText', 'RestingECG_Diagnosis_DiagnosisStatement_17_StmtFlag_0', 'RestingECG_Diagnosis_DiagnosisStatement_21_StmtFlag_1', 'RestingECG_Diagnosis_DiagnosisStatement_13_StmtFlag', 'RestingECG_QRSTimesTypes_QRS_22_Time', 'RestingECG_Order_AdmittingMDLastName', 'RestingECG_QRSTimesTypes_QRS_27_Time', 'RestingECG_Diagnosis_DiagnosisStatement_15_StmtFlag_0', 'RestingECG_QRSTimesTypes_QRS_22_Number', 'RestingECG_OriginalDiagnosis_DiagnosisStatement_0_StmtFlag', 'RestingECG_PatientDemographics_Race', 'RestingECG_QRSTimesTypes_QRS_19_Time', 'RestingECG_Diagnosis_DiagnosisStatement_19_StmtText', 'RestingECG_Diagnosis_DiagnosisStatement_10_StmtFlag', 'Res

In [3]:
df_ecg_sampled = pd.read_csv("data/20221002_ECG_mod_diagnosis_sampled_3600.csv")

In [12]:
display(df_ecg.loc[df_ecg['xml_path']=='/media/data1/muse_ge/ecg_retrospective/MUSE_20220816_134218_73000.xml'])

Unnamed: 0,RestingECG_MuseInfo_MuseVersion,RestingECG_PatientDemographics_PatientID,RestingECG_PatientDemographics_PatientAge,RestingECG_PatientDemographics_AgeUnits,RestingECG_PatientDemographics_DateofBirth,RestingECG_PatientDemographics_Gender,RestingECG_PatientDemographics_PatientLastName,RestingECG_PatientDemographics_PatientFirstName,RestingECG_TestDemographics_DataType,RestingECG_TestDemographics_Site,...,RestingECG_Diagnosis_DiagnosisStatement_23_StmtFlag_1,RestingECG_Diagnosis_DiagnosisStatement_23_StmtText,RestingECG_Diagnosis_DiagnosisStatement_24_StmtFlag,RestingECG_Diagnosis_DiagnosisStatement_24_StmtText,Original_Diag,Diag,warnings,xml_path,npy_path,extracted
6512,9.0.10.18530,59116,52,YEARS,01-05-1934,FEMALE,BENYOUNES,FAIZA,RESTING,1,...,,,,,Rythme sinusal normal ECG normal Infarctus ...,Rythme sinusal normal ECG normal Infarctus ...,-1,/media/data1/muse_ge/ecg_retrospective/MUSE_20...,data/ekg_waveforms_output/ecg_npy/0059116_01-0...,True


In [13]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

def save_ecg_plots(df_output_sampled, output_dir, diagonsis_column='Original_Diag', figsize=(16, 9), dpi=300, lead_order=None):
    if lead_order is None:
        lead_order = [
            "I",
            "II",
            "III",
            "aVR",
            "aVL",
            "aVF",
            "V1",
            "V2",
            "V3",
            "V4",
            "V5",
            "V6",
        ]

    path_to_output = os.path.join(os.getcwd(), output_dir)
    
    if not os.path.exists(path_to_output):
        os.mkdir(path_to_output)

    for index, row in tqdm(df_output_sampled.iterrows()):
        path = os.path.join(row["npy_path"])
        file = np.load(path)
        file = np.reshape(file, (1, 2500, 12))

        plt.rcParams["figure.figsize"] = figsize
        plt.ioff()
        fig, axs = plt.subplots(len(lead_order))
        
        for i in range(12):
            if i == 0:
                axs[i].set_title(row[diagonsis_column])
            axs[i].plot(file[0][:, i])
            axs[i].set(ylabel=str(lead_order[i]))

        AcquisitionDateTime = row["RestingECG_TestDemographics_AcquisitionDate"] + "_" + row["RestingECG_TestDemographics_AcquisitionTime"].replace(":", "-")
        filename = f"{row['RestingECG_PatientDemographics_PatientID']}_{AcquisitionDateTime}.png"
        
        file_output = os.path.join(path_to_output, filename)
        print(file_output)
        
        plt.savefig(file_output, dpi=dpi, bbox_inches="tight")
        plt.close(fig)

In [47]:
save_ecg_plots(df_f[6511:6513], output_dir="sanity_check/")

0it [00:00, ?it/s]

/volume/DeepECG/sanity_check/0059116_01-06-2022_08-52-02.png


1it [00:04,  4.20s/it]

/volume/DeepECG/sanity_check/0059116_01-06-2022_08-51-29.png


2it [00:08,  4.11s/it]


In [30]:
df_f = filter_ecg_by_diagnosis(df_ecg[0:10000], df_ecg_sampled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ecg["Normalized_Diag"] = df_ecg["Original_Diag"].apply(normalize_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ecg["Contains_Eligible_Diagnosis"] = df_ecg["Normalized_Diag"].apply(


In [56]:
display(df_f[['Original_Diag', 'Normalized_Diag', 'Contains_Eligible_Diagnosis','xml_path']].iloc[6512])

Original_Diag                    Rythme sinusal normal  ECG normal Infarctus antérieur d'âge indéterminé ECG anormal  Produit Cornell )  ECG anormal )  Sus-décalage du ST; possibilité de lésion inférieure ou infarctus aigu  ** ...
Normalized_Diag                Rythme sinusal normal ECG normal Infarctus anterieur d age indetermine ECG anormal Produit Cornell ECG anormal Sus decalage du ST possibilite de lesion inferieure ou infarctus aigu IM AIGU STEMI C...
Contains_Eligible_Diagnosis                                                                                                                                                                           Rythme sinusal normal ECG normal
xml_path                                                                                                                                                         /media/data1/muse_ge/ecg_retrospective/MUSE_20220816_134218_73000.xml
Name: 6512, dtype: object

In [None]:
df_ecg_sampled["Normalized_Diagnoses"] = df_ecg_sampled["Original_Diagnosis"].apply(
    normalize_string
)

In [None]:
df_ecg["Normalized_Diag"] = df_ecg["Original_Diag"].apply(normalize_string)

In [None]:
# Print some information to help debug the issue
eligible_diagnoses = set(df_ecg_sampled["Normalized_Diagnoses"])
print(f"Total number of eligible diagnoses: {len(eligible_diagnoses)}")
print(f"First 10 eligible diagnoses: {list(eligible_diagnoses)[:10]}")
print(
    f"First 10 normalized diagnoses in df_ecg: {list(df_ecg['Normalized_Diag'].head(10))}"
)

In [None]:
df_m = pd.merge(df_ecg_sampled, df_ecg, on="xml_path", how="inner")
df_m["Contained"] = df_m.apply(
    lambda row: row["Normalized_Diagnoses"] in row["Normalized_Diag"], axis=1
)

In [None]:
df_ecg["Contains_Eligible_Diagnosis"] = df_ecg["Normalized_Diag"].apply(
    lambda x: contains_eligible_diagnosis(x, eligible_diagnoses)
)

In [None]:
display(df_ecg_sampled["Normalized_Diagnoses"].nunique())

In [None]:
display(
    df_ecg.loc[df_ecg["Contains_Eligible_Diagnosis"] == False]["Normalized_Diag"].head(
        n=55
    )
)

In [None]:
### 1) Voir quelles données labelbox sont discordantes et me les envoyer pour annotation (et enlever ces ECG de la base de données)
### 2) Faire propagation des données labelbox concordantes sur df_ecg (db. parquet)
### 3) Dictionnaire pour df_ecg['Contains_Eligible_Diagnosis']==False
### 4) Standardiser le dataloader