In [1]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("max_colwidth", 200)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))


import os

os.environ["R_HOME"] = "/root/miniconda3/envs/R/lib/R"

import rpy2.robjects as objects
from rpy2.robjects.packages import importr

base = importr("base")
r_pROC = importr("pROC")
base._libPaths()[0]


def df_stats(df):
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(
        list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"]
    )
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

In [2]:
# this is muse_xml_to_array.py
# Input a directory of XML files, get a directory of np arrays where each .npy is a 12-lead ecg shape 2500,12,1. So this gives you JUST the waveforms
# Some notes, the unique ECG ID index key used in MUSE backend does not exist in the XML at least for us, so instead we use MRN_AcquisitionDTTM_PharmaUniqueECGID

# In terminal run python3 muse_xml_to_array.py <LOCATION_OF_XML_FILES>

import argparse
import base64
import os
import struct
import sys

import numpy as np
import pandas as pd
import xmltodict


def file_path(path):
    filepath = path
    for dirName, subdirList, fileList in os.walk(filepath):
        for filename in fileList:
            if ".xml" in filename.lower():
                ekg_file_list.append(os.path.join(dirName, filename))


# need to update this function to check the output directory for the output file and then only on newly added EKGs
# add timestamp to start file string
# this is annoying because the XML file name is a random timestamp and the output file is the UniqueECGID


if not os.path.exists(os.getcwd() + "/ekg_waveforms_output/"):
    os.mkdir(os.getcwd() + "/ekg_waveforms_output/")

# parser = argparse.ArgumentParser(description='Input and outputs for XML EKG parsing')
# parser.add_argument('input', type=str)
# parser.set_defaults(output=os.getcwd() + '/ekg_waveforms_output/') #ensure this directory already exists

# args = parser.parse_args()


def decode_ekg_muse(raw_wave):
    """
    Ingest the base64 encoded waveforms and transform to numeric
    """
    # covert the waveform from base64 to byte array
    arr = base64.b64decode(bytes(raw_wave, "utf-8"))

    # unpack every 2 bytes, little endian (16 bit encoding)
    unpack_symbols = "".join([char * (len(arr) // 2) for char in "h"])
    byte_array = struct.unpack(unpack_symbols, arr)
    return byte_array


def decode_ekg_muse_to_array(raw_wave, downsample=1):
    """
    Ingest the base64 encoded waveforms and transform to numeric

    downsample: 0.5 takes every other value in the array. Muse samples at 500/s and the sample model requires 250/s. So take every other.
    """
    try:
        dwnsmpl = int(1 // downsample)
    except ZeroDivisionError:
        print("You must downsample by more than 0")
    # covert the waveform from base64 to byte array
    arr = base64.b64decode(bytes(raw_wave, "utf-8"))

    # unpack every 2 bytes, little endian (16 bit encoding)
    unpack_symbols = "".join([char * int(len(arr) / 2) for char in "h"])
    byte_array = struct.unpack(unpack_symbols, arr)
    return np.array(byte_array)[::dwnsmpl]


def xml_to_np_array_file(path_to_xml, path_to_output=os.getcwd()):

    with open(path_to_xml, "rb") as fd:
        dic = xmltodict.parse(fd.read().decode("utf8"))

    """
    
    Upload the ECG as numpy array with shape=[2500,12,1] ([time, leads, 1]).

    The voltage unit should be in 1 mv/unit and the sampling rate should be 250/second (total 10 second).

    The leads should be ordered as follow I, II, III, aVR, aVL, aVF, V1, V2, V3, V4, V5, V6.

    """
    # print(dic)
    try:
        pt_id = dic["RestingECG"]["PatientDemographics"]["PatientID"]
    except:
        print("no PatientID")
        pt_id = "none"
    try:
        AcquisitionDateTime = (
            dic["RestingECG"]["TestDemographics"]["AcquisitionDate"]
            + "_"
            + dic["RestingECG"]["TestDemographics"]["AcquisitionTime"].replace(":", "-")
        )
    except:
        print("no AcquisitionDateTime")
        AcquisitionDateTime = "none"

    # try:
    #     requisition_number = dic['RestingECG']['Order']['RequisitionNumber']
    # except:
    #     print("no requisition_number")
    #     requisition_number = "none"

    # need to instantiate leads in the proper order for the model
    lead_order = [
        "I",
        "II",
        "III",
        "aVR",
        "aVL",
        "aVF",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6",
    ]

    """
    Each EKG will have this data structure:
    lead_data = {
        'I': np.array
    }
    """

    lead_data = dict.fromkeys(lead_order)
    # lead_data = {leadid: None for k in lead_order}

    #     for all_lead_data in dic['RestingECG']['Waveform']:
    #         for single_lead_data in lead['LeadData']:
    #             leadname =  single_lead_data['LeadID']
    #             if leadname in (lead_order):
    try:
        for lead in dic["RestingECG"]["Waveform"]:
            for leadid in range(len(lead["LeadData"])):
                sample_length = len(
                    decode_ekg_muse_to_array(lead["LeadData"][leadid]["WaveFormData"])
                )
                # sample_length is equivalent to dic['RestingECG']['Waveform']['LeadData']['LeadSampleCountTotal']
                if sample_length == 5000:
                    lead_data[
                        lead["LeadData"][leadid]["LeadID"]
                    ] = decode_ekg_muse_to_array(
                        lead["LeadData"][leadid]["WaveFormData"], downsample=0.5
                    )
                elif sample_length == 2500:
                    lead_data[
                        lead["LeadData"][leadid]["LeadID"]
                    ] = decode_ekg_muse_to_array(
                        lead["LeadData"][leadid]["WaveFormData"], downsample=1
                    )
                else:
                    continue
            # ensures all leads have 2500 samples and also passes over the 3 second waveform

        lead_data["III"] = np.array(lead_data["II"]) - np.array(lead_data["I"])
        lead_data["aVR"] = -(np.array(lead_data["I"]) + np.array(lead_data["II"])) / 2
        lead_data["aVF"] = (np.array(lead_data["II"]) + np.array(lead_data["III"])) / 2
        lead_data["aVL"] = (np.array(lead_data["I"]) - np.array(lead_data["III"])) / 2

        lead_data = {k: lead_data[k] for k in lead_order}
        # drops V3R, V4R, and V7 if it was a 15-lead ECG

        # now construct and reshape the array
        # converting the dictionary to an np.array
        temp = []
        for key, value in lead_data.items():
            temp.append(value)

        # transpose to be [time, leads, ]
        ekg_array = np.array(temp).T

        # expand dims to [time, leads, 1]
        ekg_array = np.expand_dims(ekg_array, axis=-1)

        # Here is a check to make sure all the model inputs are the right shape
        #     assert ekg_array.shape == (2500, 12, 1), "ekg_array is shape {} not (2500, 12, 1)".format(ekg_array.shape )

        # filename = '/ekg_waveform_{}_{}.npy'.format(pt_id, requisition_number)
        filename = f"{pt_id}_{AcquisitionDateTime}.npy"

        path_to_output += filename
        # print(path_to_output)
        with open(path_to_output, "wb") as f:
            np.save(f, ekg_array)
        return path_to_output

    except:
        print("error", dic)
        return None


def ekg_batch_run(ekg_list):
    i = 0
    x = 0
    for file in ekg_list:
        try:
            xml_to_np_array_file(file, output_dir)
            i += 1
        except Exception as e:
            # print("file failed: ", file)
            print(file, e)
            x += 1
        if i % 10000 == 0:
            print(f"Succesfully converted {i} EKGs, failed converting {x} EKGs")

In [3]:
def generate_ecg_dataframe_and_npy(df, output_dir="/ekg_waveforms_output/"):
    output_dir = os.getcwd() + output_dir

    from tqdm import tqdm

    from ECGXMLReader import ECGXMLReader

    patientid_list = []
    patientage_list = []
    patient_date_of_birth_list = []
    patient_gender_list = []
    patient_VentricularRate_list = []
    patient_AtrialRate_list = []
    patient_PRInterval_list = []
    patient_QRSDuration_list = []
    patient_QTInterval_list = []
    patient_QTCorrected_list = []
    patient_Paxis_list = []
    patient_Raxis_list = []
    patient_TAxis_list = []
    patient_QRSCount_list = []
    patient_QOnset_list = []
    patient_QOffset_list = []
    patient_POnset_list = []
    patient_POffset_list = []
    patient_TOffset_list = []
    patient_ECGSampleBase_list = []
    patient_ECGSampleExponent_list = []
    patient_QTcFrederica_list = []
    patient_Location_list = []
    patient_LocatioName_list = []
    patient_RoomID_list = []
    patient_acquisitiondate_list = []
    patient_acquisitiontime_list = []
    patient_status_list = []
    patient_acquisitiondevice_list = []
    patient_referringMDLastName_list = []
    patient_AnalysisSoftware_list = []
    patient_acquisitionSoftwareVersion_list = []
    diagnosis_list = []
    original_diagnosis_list = []
    ecg_output_path_list = []
    xml_path_list = []

    for index, row in tqdm(df.iterrows()):

        ecg = ECGXMLReader(row["path"], augmentLeads=True)
        xml_path_list.append(row["path"])
        ### Concatenate dictionary keys self.ECG['RestingECG']['Diagnosis'] into a list
        patientid_list.append(ecg.PatientDemographics["PatientID"])

        try:
            patientage_list.append(ecg.PatientDemographics["PatientAge"])
        except:
            patientage_list.append(np.nan)

        try:
            patient_date_of_birth_list.append(ecg.PatientDemographics["DateofBirth"])
        except:
            patient_date_of_birth_list.append(np.nan)
        try:
            patient_gender_list.append(ecg.PatientDemographics["Gender"])
        except:
            patient_gender_list.append(np.nan)

        try:
            patient_VentricularRate_list.append(
                ecg.RestingECGMeasurements["VentricularRate"]
            )
        except:
            patient_VentricularRate_list.append(np.nan)

        try:
            patient_AtrialRate_list.append(ecg.RestingECGMeasurements["AtrialRate"])
        except:
            patient_AtrialRate_list.append(np.nan)
        try:
            patient_PRInterval_list.append(ecg.RestingECGMeasurements["PRInterval"])
        except:
            patient_PRInterval_list.append(np.nan)
        try:
            patient_QRSDuration_list.append(ecg.RestingECGMeasurements["QRSDuration"])
        except:
            patient_QRSDuration_list.append(np.nan)
        try:
            patient_QTInterval_list.append(ecg.RestingECGMeasurements["QTInterval"])
        except:
            patient_QTInterval_list.append(np.nan)

        try:
            patient_QTCorrected_list.append(ecg.RestingECGMeasurements["QTCorrected"])
        except:
            patient_QTCorrected_list.append(np.nan)
        try:
            patient_Paxis_list.append(ecg.RestingECGMeasurements["PAxis"])
        except:
            patient_Paxis_list.append(np.nan)

        try:
            patient_Raxis_list.append(ecg.RestingECGMeasurements["RAxis"])
        except:
            patient_Raxis_list.append(np.nan)
        try:
            patient_TAxis_list.append(ecg.RestingECGMeasurements["TAxis"])
        except:
            patient_TAxis_list.append(np.nan)

        try:
            patient_QRSCount_list.append(ecg.RestingECGMeasurements["QRSCount"])
        except:
            patient_QRSCount_list.append(np.nan)
        try:
            patient_QOnset_list.append(ecg.RestingECGMeasurements["QOnset"])
        except:
            patient_QOnset_list.append(np.nan)

        try:
            patient_QOffset_list.append(ecg.RestingECGMeasurements["QOffset"])
        except:
            patient_QOffset_list.append(np.nan)
        try:
            patient_POnset_list.append(ecg.RestingECGMeasurements["POnset"])
        except:
            patient_POnset_list.append(np.nan)
        try:
            patient_POffset_list.append(ecg.RestingECGMeasurements["POffset"])
        except:
            patient_POffset_list.append(np.nan)
        try:
            patient_TOffset_list.append(ecg.RestingECGMeasurements["TOffset"])
        except:
            patient_TOffset_list.append(np.nan)

        patient_ECGSampleBase_list.append(ecg.RestingECGMeasurements["ECGSampleBase"])
        patient_ECGSampleExponent_list.append(
            ecg.RestingECGMeasurements["ECGSampleExponent"]
        )
        try:
            patient_QTcFrederica_list.append(ecg.RestingECGMeasurements["QTcFrederica"])
        except:
            patient_QTcFrederica_list.append(np.nan)

        patient_Location_list.append(ecg.TestDemographics["Location"])

        try:
            patient_LocatioName_list.append(ecg.TestDemographics["LocationName"])
        except:
            patient_LocatioName_list.append(np.nan)

        try:
            patient_RoomID_list.append(ecg.TestDemographics["RoomID"])
        except:
            patient_RoomID_list.append("None")
        try:
            patient_acquisitiondate_list.append(ecg.TestDemographics["AcquisitionDate"])
        except:
            patient_acquisitiondate_list.append(np.nan)

        try:
            patient_acquisitiontime_list.append(ecg.TestDemographics["AcquisitionTime"])
        except:
            patient_acquisitiontime_list.append(np.nan)

        patient_status_list.append(ecg.TestDemographics["Status"])

        try:
            patient_acquisitiondevice_list.append(
                ecg.TestDemographics["AcquisitionDevice"]
            )
        except:
            patient_acquisitiondevice_list.append(np.nan)

        try:
            patient_referringMDLastName_list.append(
                ecg.TestDemographics["ReferringMDLastName"]
            )
        except:
            patient_referringMDLastName_list.append("None")
        try:
            patient_AnalysisSoftware_list.append(
                ecg.TestDemographics["AnalysisSoftwareVersion"]
            )
        except:
            patient_AnalysisSoftware_list.append(np.nan)
        try:
            patient_acquisitionSoftwareVersion_list.append(
                ecg.TestDemographics["AcquisitionSoftwareVersion"]
            )
        except:
            patient_acquisitionSoftwareVersion_list.append(np.nan)

        diagnosis = []
        try:
            for key in ecg.Diagnosis["DiagnosisStatement"]:
                # print(key['StmtText'])
                try:
                    diagnosis.append(key["StmtText"])
                except:
                    diagnosis.append(key["ENDSLINE"])

            ##merge items in diagnosis list into a single string
            diagnosis = " ".join(diagnosis)

            diagnosis_list.append(diagnosis)
        except:
            print(ecg.TestDemographics)
            print(ecg.PatientDemographics)
            print(ecg.RestingECGMeasurements)
            print(ecg.PatientDemographics["PatientID"])
            diagnosis_list.append(-1)

        diagnosis = []
        try:
            for key in ecg.OriginalDiagnosis["DiagnosisStatement"]:
                # print(key['StmtText'])
                try:
                    diagnosis.append(key["StmtText"])
                except:
                    diagnosis.append(key["ENDSLINE"])

            ##merge items in diagnosis list into a single string
            diagnosis = " ".join(diagnosis)

            original_diagnosis_list.append(diagnosis)
        except:
            print(ecg.TestDemographics)
            print(ecg.PatientDemographics)
            print(ecg.RestingECGMeasurements)
            print(ecg.PatientDemographics["PatientID"])
            original_diagnosis_list.append(-1)

            # display(ecg.Diagnosis['DiagnosisStatement'])
            # break
        # print(ecg.TestDemographics)
        # print(ecg.PatientDemographics)
        # print(ecg.RestingECGMeasurements)
        # print(ecg.Diagnosis)
        # print(ecg.OriginalDiagnosis)
        ecg_output_path = xml_to_np_array_file(row["path"], output_dir)
        ecg_output_path_list.append(ecg_output_path)
    ##Create dataaframe with the previous lists
    df_output = pd.DataFrame(
        {
            "patientid": patientid_list,
            "age": patientage_list,
            "dob": patient_date_of_birth_list,
            "gender": patient_gender_list,
            "VentricularRate": patient_VentricularRate_list,
            "AtrialRate": patient_AtrialRate_list,
            "PRInterval": patient_PRInterval_list,
            "QRSDuration": patient_QRSDuration_list,
            "QTInterval": patient_QTInterval_list,
            "QTCorrected": patient_QTCorrected_list,
            "PAxis": patient_Paxis_list,
            "RAXis": patient_Raxis_list,
            "TAxis": patient_TAxis_list,
            "QRSCount": patient_QRSCount_list,
            "QOnset": patient_QOnset_list,
            "QOffset": patient_QOffset_list,
            "POnset": patient_POnset_list,
            "POffset": patient_POffset_list,
            "TOffset": patient_TOffset_list,
            "ECGSampleBase": patient_ECGSampleBase_list,
            "ECGSampleExponent": patient_ECGSampleExponent_list,
            "QTcFrederica": patient_QTcFrederica_list,
            "Location": patient_Location_list,
            "LocationName": patient_LocatioName_list,
            "RoomID": patient_RoomID_list,
            "AcquisitionDate": patient_acquisitiondate_list,
            "AcquisitionTime": patient_acquisitiontime_list,
            "Status": patient_status_list,
            "AcquisitionDevice": patient_acquisitiondevice_list,
            "ReferringMDLastName": patient_referringMDLastName_list,
            "AnalysisSoftware": patient_AnalysisSoftware_list,
            "AcquisitionSoftwareVersion": patient_acquisitionSoftwareVersion_list,
            "Diagnosis": diagnosis_list,
            "Original_Diagnosis": original_diagnosis_list,
            "xml_path": xml_path_list,
            "ecg_output_path": ecg_output_path_list,
        }
    )
    return df_output

In [4]:
### List all files in '/media/data1/muse_ge/ecg_retrospective' ending in XML and add them to list
import glob
import os

## Get all fins in directory 'media/data1/muse_ge/ecg_retrospective' ending in .XML
path = "/media/data1/muse_ge/ecg_retrospective"
all_files = glob.glob(os.path.join(path, "*.xml"))
### Create dataframe with all ECG files
df = pd.DataFrame(all_files, columns=["path"])
display(df)

Unnamed: 0,path
0,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143132_57000.xml
1,/media/data1/muse_ge/ecg_retrospective/MUSE_20220622_120344_07000.xml
2,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143134_90000.xml
3,/media/data1/muse_ge/ecg_retrospective/MUSE_20220622_120345_21000.xml
4,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143133_74000.xml
...,...
235612,/media/data1/muse_ge/ecg_retrospective/MUSE_20221003_081514_73000.xml
235613,/media/data1/muse_ge/ecg_retrospective/MUSE_20221003_082444_26000.xml
235614,/media/data1/muse_ge/ecg_retrospective/MUSE_20221003_082445_46000.xml
235615,/media/data1/muse_ge/ecg_retrospective/MUSE_20221003_082500_66000.xml


In [6]:
# df_output = generate_ecg_dataframe_and_npy(df)
# df_output.to_csv("data/20221002_ECG.csv")
df_output = pd.read_csv("data/20221002_ECG_mod_diagnosis.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.1,Unnamed: 0,patientid,age,dob,gender,VentricularRate,AtrialRate,PRInterval,QRSDuration,QTInterval,QTCorrected,PAxis,RAXis,TAxis,QRSCount,QOnset,QOffset,POnset,POffset,TOffset,ECGSampleBase,ECGSampleExponent,QTcFrederica,Location,LocationName,RoomID,AcquisitionDate,AcquisitionTime,Status,AcquisitionDevice,ReferringMDLastName,AnalysisSoftware,AcquisitionSoftwareVersion,Diagnosis,Original_Diagnosis,xml_path,ecg_output_path
0,0,604777,74.0,07-18-1947,MALE,76.0,76.0,168.0,100.0,400.0,450.0,79.0,31.0,20.0,13.0,220.0,270.0,136.0,189.0,420.0,500,0,432.0,33,3 NORD,61.0,06-22-2022,21:34:54,UNCONFIRMED,MV360,DR.BOUCHARD,243,1.02 SP02,Rythme sinusal avec de rare(s) complexes AV à rythme entrainé sequentiel Criteres de voltage limites d'HVG ou variante de la normale ( Sokolow-Lyon ) Anomalie non-spécifique du segment ST ECG anormal,Rythme sinusal avec de rare(s) complexes AV à rythme entrainé sequentiel Criteres de voltage limites d'HVG ou variante de la normale ( Sokolow-Lyon ) Anomalie non-spécifique du segment ST ECG anormal,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143132_57000.xml,/volume/DeepECG/ekg_waveforms_output/0604777_06-22-2022_21-34-54.npy
1,1,799999,81.0,06-21-1940,MALE,60.0,60.0,164.0,82.0,382.0,382.0,44.0,46.0,47.0,9.0,220.0,261.0,138.0,181.0,411.0,500,0,382.0,1,1_CARDIOLOGIE GENERALE,,08-13-2021,11:01:03,CONFIRMED,MV360,Md I.C.M.,243,1.02 SP01,"Rythme sinusal normal Rapport R/S augmenté en V1, considérer rotation horaire ou infarctus postérieur ECG anormal","Rythme sinusal normal Rapport R/S augmenté en V1, considérer rotation horaire ou infarctus postérieur ECG anormal",/media/data1/muse_ge/ecg_retrospective/MUSE_20220622_120344_07000.xml,/volume/DeepECG/ekg_waveforms_output/0799999_08-13-2021_11-01-03.npy


In [9]:
# df_output['ecg_abnormal'] = np.where(df_output['Diagnosis'].str.contains('ECG anormal'), 1, np.where(df_output['Diagnosis'].str.contains('ECG normal'), 0, -1))
# Remove ECG anormal and ECG normal from diagnosis
# df_output['Diagnosis'] = df_output['Diagnosis'].str.replace('ECG anormal', '')
# df_output['Original_Diagnosis'] = df_output['Original_Diagnosis'].str.replace('ECG normal', '')
# df_output['Original_Diagnosis'] = df_output['Diagnosis'].str.replace('ECG anormal', '')
# df_output['Diagnosis'] = df_output['Original_Diagnosis'].str.replace('ECG normal', '')
# df_output.to_csv('data/20221002_ECG_mod_diagnosis.csv')

In [10]:
display(df_output.ecg_abnormal.value_counts())

-1    177085
 0     58448
Name: ecg_abnormal, dtype: int64

In [11]:
df_m = (
    df_output.groupby(["patientid", "AcquisitionDate", "AcquisitionTime"])
    .first()
    .reset_index()
)
display(df_stats(df_output))
display(df_stats(df_m))
## The ECGs can be grouped by patient id, date and time to have a 1 unique row per ECG - this means the filename to save the ECG also needs to have the date and time in it.


***** Shape:  (235533, 38)  *****

+----+----------------------------+--------+----------+----------+
|    | Name                       |   Null |   Unique | Dtypes   |
|----+----------------------------+--------+----------+----------|
|  0 | Unnamed: 0                 |      0 |   235533 | int64    |
|  1 | patientid                  |      0 |    68198 | object   |
|  2 | age                        |    109 |       99 | float64  |
|  3 | dob                        |   1919 |    22444 | object   |
|  4 | gender                     |     49 |        2 | object   |
|  5 | VentricularRate            |     16 |      215 | float64  |
|  6 | AtrialRate                 |   8077 |      336 | float64  |
|  7 | PRInterval                 |  45470 |      241 | float64  |
|  8 | QRSDuration                |     12 |      146 | float64  |
|  9 | QTInterval                 |     13 |      332 | float64  |
| 10 | QTCorrected                |     16 |      615 | float64  |
| 11 | PAxis              

Unnamed: 0.1,Unnamed: 0,patientid,age,dob,gender,VentricularRate,AtrialRate,PRInterval,QRSDuration,QTInterval,QTCorrected,PAxis,RAXis,TAxis,QRSCount,QOnset,QOffset,POnset,POffset,TOffset,ECGSampleBase,ECGSampleExponent,QTcFrederica,Location,LocationName,RoomID,AcquisitionDate,AcquisitionTime,Status,AcquisitionDevice,ReferringMDLastName,AnalysisSoftware,AcquisitionSoftwareVersion,Diagnosis,Original_Diagnosis,xml_path,ecg_output_path,ecg_abnormal
0,0,604777,74.0,07-18-1947,MALE,76.0,76.0,168.0,100.0,400.0,450.0,79.0,31.0,20.0,13.0,220.0,270.0,136.0,189.0,420.0,500,0,432.0,33,3 NORD,61,06-22-2022,21:34:54,UNCONFIRMED,MV360,DR.BOUCHARD,243,1.02 SP02,Rythme sinusal avec de rare(s) complexes AV à rythme entrainé sequentiel Criteres de voltage limites d'HVG ou variante de la normale ( Sokolow-Lyon ) Anomalie non-spécifique du segment ST,Rythme sinusal avec de rare(s) complexes AV à rythme entrainé sequentiel Criteres de voltage limites d'HVG ou variante de la normale ( Sokolow-Lyon ) Anomalie non-spécifique du segment ST,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143132_57000.xml,/volume/DeepECG/ekg_waveforms_output/0604777_06-22-2022_21-34-54.npy,-1
1,1,799999,81.0,06-21-1940,MALE,60.0,60.0,164.0,82.0,382.0,382.0,44.0,46.0,47.0,9.0,220.0,261.0,138.0,181.0,411.0,500,0,382.0,1,1_CARDIOLOGIE GENERALE,,08-13-2021,11:01:03,CONFIRMED,MV360,Md I.C.M.,243,1.02 SP01,"Rythme sinusal normal Rapport R/S augmenté en V1, considérer rotation horaire ou infarctus postérieur","Rythme sinusal normal Rapport R/S augmenté en V1, considérer rotation horaire ou infarctus postérieur",/media/data1/muse_ge/ecg_retrospective/MUSE_20220622_120344_07000.xml,/volume/DeepECG/ekg_waveforms_output/0799999_08-13-2021_11-01-03.npy,-1
2,2,609146,38.0,09-30-1983,MALE,185.0,,,78.0,266.0,466.0,,145.0,20.0,31.0,216.0,255.0,,,349.0,500,0,387.0,43,4 NORD,456,06-22-2022,21:20:00,UNCONFIRMED,MV360,,243,1.02 SP02,*** ATTENTION! mauvaise qualité de l'ECG*** Tachycardie supraventriculaire Hypertrophie ventriculaire droite,*** ATTENTION! mauvaise qualité de l'ECG*** Tachycardie supraventriculaire Hypertrophie ventriculaire droite,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143134_90000.xml,/volume/DeepECG/ekg_waveforms_output/0609146_06-22-2022_21-20-00.npy,-1
3,3,799999,81.0,06-21-1940,MALE,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,500,0,0.0,1,1_CARDIOLOGIE GENERALE,,08-19-2021,14:15:00,CONFIRMED,MV360,Md I.C.M.,243,1.02 SP01,-1,-1,/media/data1/muse_ge/ecg_retrospective/MUSE_20220622_120345_21000.xml,/volume/DeepECG/ekg_waveforms_output/0799999_08-19-2021_14-15-00.npy,-1
4,4,609083,66.0,07-05-1955,MALE,53.0,53.0,172.0,148.0,452.0,424.0,54.0,-48.0,61.0,8.0,211.0,285.0,125.0,171.0,437.0,500,0,433.0,51,5 COURT-SEJOUR,502-5,06-22-2022,22:51:54,UNCONFIRMED,MV360,,243,1.02 SP02,Bradycardie sinusale Bloc de branche droit complet Hémibloc antérieur gauche *** Bloc bifasiculaire ***,Bradycardie sinusale Bloc de branche droit complet Hémibloc antérieur gauche *** Bloc bifasiculaire ***,/media/data1/muse_ge/ecg_retrospective/MUSE_20220712_143133_74000.xml,/volume/DeepECG/ekg_waveforms_output/0609083_06-22-2022_22-51-54.npy,-1



***** Shape:  (208644, 38)  *****

+----+----------------------------+--------+----------+----------+
|    | Name                       |   Null |   Unique | Dtypes   |
|----+----------------------------+--------+----------+----------|
|  0 | patientid                  |      0 |    68198 | object   |
|  1 | AcquisitionDate            |      0 |      955 | object   |
|  2 | AcquisitionTime            |      0 |    59510 | object   |
|  3 | Unnamed: 0                 |      0 |   208644 | int64    |
|  4 | age                        |     69 |       99 | float64  |
|  5 | dob                        |   1542 |    22444 | object   |
|  6 | gender                     |     36 |        2 | object   |
|  7 | VentricularRate            |     16 |      215 | float64  |
|  8 | AtrialRate                 |   4575 |      336 | float64  |
|  9 | PRInterval                 |  38185 |      241 | float64  |
| 10 | QRSDuration                |     12 |      146 | float64  |
| 11 | QTInterval         

Unnamed: 0.1,patientid,AcquisitionDate,AcquisitionTime,Unnamed: 0,age,dob,gender,VentricularRate,AtrialRate,PRInterval,QRSDuration,QTInterval,QTCorrected,PAxis,RAXis,TAxis,QRSCount,QOnset,QOffset,POnset,POffset,TOffset,ECGSampleBase,ECGSampleExponent,QTcFrederica,Location,LocationName,RoomID,Status,AcquisitionDevice,ReferringMDLastName,AnalysisSoftware,AcquisitionSoftwareVersion,Diagnosis,Original_Diagnosis,xml_path,ecg_output_path,ecg_abnormal
0,74,11-08-2021,06:44:43,87194,74.0,,MALE,37.0,37.0,174.0,90.0,518.0,406.0,57.0,9.0,-4.0,7.0,210.0,255.0,123.0,179.0,469.0,500,0,441.0,22,22_URGENCE MAJEURE,A11,CONFIRMED,MAC55,,239,009C,Bradycardie sinusale marquée,Bradycardie sinusale marquée,/media/data1/muse_ge/ecg_retrospective/MUSE_20220907_233630_79000.xml,/volume/DeepECG/ekg_waveforms_output/0000074_11-08-2021_06-44-43.npy,-1
1,138,10-15-2021,12:13:39,93574,72.0,01-26-1949,FEMALE,62.0,62.0,150.0,86.0,428.0,434.0,84.0,22.0,37.0,11.0,219.0,262.0,144.0,177.0,433.0,500,0,432.0,1,1_CARDIOLOGIE GENERALE,,CONFIRMED,MAC55,Md I.C.M.,239,009C,Rythme sinusal normal,Rythme sinusal normal ECG normal,/media/data1/muse_ge/ecg_retrospective/MUSE_20220910_220003_77000.xml,/volume/DeepECG/ekg_waveforms_output/0000138_10-15-2021_12-13-39.npy,0
2,907,11-17-2021,10:22:22,84445,75.0,03-07-1946,FEMALE,97.0,97.0,,132.0,406.0,515.0,,112.0,-27.0,15.0,232.0,298.0,,,435.0,500,0,476.0,1,1_CARDIOLOGIE GENERALE,,CONFIRMED,MAC55,Md Référant M-V,239,009C,Cardio-stimulateur à demande; interprétation basée sur le rythme spontané Fibrillation auriculaire avec extrasystoles ventriculaires ou extrasystoles avec conduction aberrante Bloc de branche droi...,Cardio-stimulateur à demande; interprétation basée sur le rythme spontané Fibrillation auriculaire avec extrasystoles ventriculaires ou extrasystoles avec conduction aberrante Bloc de branche droi...,/media/data1/muse_ge/ecg_retrospective/MUSE_20220907_220841_82000.xml,/volume/DeepECG/ekg_waveforms_output/0000907_11-17-2021_10-22-22.npy,-1
3,1205,10-15-2021,08:52:31,93704,56.0,08-17-1965,FEMALE,77.0,77.0,148.0,164.0,456.0,516.0,43.0,84.0,19.0,13.0,215.0,297.0,141.0,189.0,443.0,500,0,495.0,2,2_CLINIQUE DES CONGENITAUX,,CONFIRMED,MAC55,Md Référant M-V,239,009C,Rythme sinusal normal Bloc de branche droit complet Anomalie de l'onde T; ischémie inférolatérale possible,Rythme sinusal normal Bloc de branche droit complet Anomalie de l'onde T; ischémie inférolatérale possible,/media/data1/muse_ge/ecg_retrospective/MUSE_20220910_220256_82000.xml,/volume/DeepECG/ekg_waveforms_output/0001205_10-15-2021_08-52-31.npy,-1
4,1207,10-08-2021,12:42:17,95297,75.0,06-16-1946,MALE,57.0,57.0,248.0,110.0,438.0,426.0,61.0,32.0,66.0,9.0,215.0,270.0,91.0,159.0,434.0,500,0,430.0,1,1_CARDIOLOGIE GENERALE,,CONFIRMED,MAC55,Md Référant M-V,239,009C,Bradycardie sinusale avec bloc A-V du premier degré par ailleurs,Bradycardie sinusale avec bloc A-V du premier degré ECG normal par ailleurs,/media/data1/muse_ge/ecg_retrospective/MUSE_20220910_223734_51000.xml,/volume/DeepECG/ekg_waveforms_output/0001207_10-08-2021_12-42-17.npy,0


In [12]:
# output_dir = os.getcwd() + '/ekg_waveforms_output/'
# ekg_batch_run(df['path'][0:1])

In [17]:
## Display top 1000 most frequent df_output['Diagnosis]
display(df_output["Diagnosis"].value_counts()[0:1000])

Rythme sinusal normal                                                                                                                                             34361
Bradycardie sinusale  par ailleurs                                                                                                                                 9350
-1                                                                                                                                                                 7532
Rythme sinusal normal Anomalie non-spécifique de l'onde T                                                                                                          2828
Rythme sinusal normal avec arythmie sinusale                                                                                                                       2535
                                                                                                                                                                