In [None]:
import numpy as np
import pandas as pd

# `display` and `HTML` are part of the public IPython.display API. Importing
# them from IPython.core.display is deprecated and breaks on newer IPython.
from IPython.display import HTML, display

# Raise pandas' rendering limits so wide / long frames are shown untruncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
# Fully-qualified key; the bare "max_colwidth" only works through pattern matching.
pd.set_option("display.max_colwidth", 1000)

# Inject CSS that forces `.container` elements to use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column overview of ``df`` and return its first rows.

    The shape of ``df`` is printed first, followed by a ``psql``-formatted
    table (rendered with ``tabulate``) holding one row per column: the column
    name, its number of null values, its number of distinct values and its
    dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``.
    """
    # Imported here so that tabulate is only required when the function is called.
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    # One entry per column of ``df``; every list is in column order.
    summary = pd.DataFrame(
        {
            "Name": df.columns.values.tolist(),
            "Null": df.isnull().sum().values.tolist(),
            "Unique": df.nunique().values.tolist(),
            "Dtypes": df.dtypes.tolist(),
        }
    )
    print(tabulate(summary, headers="keys", tablefmt="psql"))
    return df.head()

In [None]:
# Load the diagnosis CSV and apply all column renames in one pass.
# Keys that are absent from the file are silently ignored by rename().
df_output = pd.read_csv("data/20221002_ECG_mod_diagnosis.csv").rename(
    columns={
        "ecg_output_path": "npy_path",
        "Original_Diagnosis": "original_diagnosis",
        "patientid": "RestingECG_PatientDemographics_PatientID",
        "AcquisitionDate": "RestingECG_TestDemographics_AcquisitionDate",
        "AcquisitionTime": "RestingECG_TestDemographics_AcquisitionTime",
    }
)
display(df_output.head(5))

In [None]:
# Load the label table from a hard-coded absolute path; this only works on a
# machine where that data volume is mounted.
df_parquet = pd.read_parquet('/media/data1/muse_ge/ECG_ad202207_1453937_cat_labels_MUSE_vs_CARDIOLOGIST_v1.2.parquet')

In [None]:
# Keep the rows whose xml_path contains this exact file name.
#   regex=False -> match the text literally (as a regex, "." would match any character)
#   na=False    -> rows with a missing xml_path count as "no match" instead of
#                  producing an NA mask that boolean indexing rejects
filtered_df = df_parquet[
    df_parquet['xml_path'].str.contains(
        'MUSE_20221212_155611_05000.xml', regex=False, na=False
    )
]
display(filtered_df.head(n=5))


In [None]:
from utils import plot_from_parquet
# Plot one record through the project helper without saving the figure.
# NOTE(review): `df_parquet_filtered` is not assigned in any visible cell (the
# filter cell creates `filtered_df`), so on a fresh kernel this call raises
# NameError unless the name is defined elsewhere — confirm which frame is meant.
plot_from_parquet.plot_from_parquet(
    df_parquet_filtered,
    index=100,
    diagnosis_column="diagnosis",
    save=False,
    out_dir=".",
)

In [None]:
## Keep only the diagnoses that occur in more than 10 rows of df_output
df_output_m = df_output.groupby("Diagnosis").filter(
    lambda group: len(group) > 10
)
# Number of distinct diagnoses that survive the filter
display(len(df_output_m["Diagnosis"].value_counts()))

## Draw 2 rows per remaining diagnosis (fixed seed) and flatten the index
df_output_sampled = (
    df_output_m.groupby("Diagnosis")
    .apply(lambda group: group.sample(n=2, random_state=1))
    .reset_index(drop=True)
)
display(df_output_sampled["Diagnosis"].value_counts())
#df_output_sampled.to_csv("data/20221002_ECG_mod_diagnosis_sampled_3600.csv")

In [None]:
import pandas as pd
from utils import plot_ecg_from_xml
from multiprocessing import Pool


# Re-read the label table so this cell can run on its own; it is the same file
# that an earlier cell already loads into `df_parquet`.
df_parquet = pd.read_parquet('/media/data1/muse_ge/ECG_ad202207_1453937_cat_labels_MUSE_vs_CARDIOLOGIST_v1.2.parquet')
def process_xml_path(xml_path):
    """Render one ECG XML file to a PNG and return the helper's result.

    Thin wrapper around ``plot_ecg_from_xml.plot_ecg_from_xml`` with fixed
    options (output under ``ecg_png_parquet/``, empty title, saving enabled,
    no anonymisation, width 1250) so that it can be used with ``Pool.map``.

    Parameters
    ----------
    xml_path : str
        Path of the XML file to render.

    Returns
    -------
    object
        Whatever ``plot_ecg_from_xml`` returns for this file.
        NOTE(review): presumably a record with the PNG and XML paths — confirm
        against the helper.
    """
    result = plot_ecg_from_xml.plot_ecg_from_xml(
        xml_path,
        out_dir="ecg_png_parquet/",
        title="",
        save=True,
        anonymize=False,
        width=1250,
    )
    # Must be returned: without it every mapped call yields None.
    return result

# Spread the per-file rendering over a pool of 8 worker processes
with Pool(8) as pool:
    results = pool.map(process_xml_path, df_parquet["xml_path"])

# Wrap the collected per-file results in a DataFrame for inspection
df_result = pd.DataFrame(data=results)

display(df_result)


In [None]:
#df_result.to_csv('data/df_ecg_parquet.csv')

In [None]:
# Load the previously exported result table. The cell that would write this
# file has its to_csv call commented out, so the CSV must already exist on disk.
df_ecg = pd.read_csv('data/df_ecg_parquet.csv')

def replace_path(row, root_path='/media/data1/ravram/DeepECG/ecg_png_parquet/'):
    """Build the PNG path that corresponds to a row's XML file.

    The file name is taken from ``row['xml_path']`` (text after the last
    ``/``). If it ends with ``.xml``, ``.png`` is appended; otherwise the name
    is used unchanged. Only the trailing extension is considered, so a
    ``.xml`` that appears earlier in the name is left alone.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'xml_path'`` that yields a string, e.g. a
        DataFrame row passed in by ``df.apply(..., axis=1)``.
    root_path : str, optional
        Directory prefix, concatenated as-is (include the trailing slash).

    Returns
    -------
    str
        ``root_path`` followed by the PNG file name.
    """
    xml_filename = row['xml_path'].split('/')[-1]
    if xml_filename.endswith('.xml'):
        xml_filename += '.png'
    return root_path + xml_filename

# Row-wise apply: each row is passed to replace_path and the returned string is
# stored in the 'path' column (created, or overwritten if it already exists).
df_ecg['path'] = df_ecg.apply(replace_path, axis=1)


In [None]:
# Persist the table including the 'path' column. The index is written as well
# (pandas default), so reading this file back yields an extra unnamed column
# unless index_col is specified.
df_ecg.to_csv('data/df_ecg_parquet_with_path_2004-2022.csv')


In [None]:
import os

# For the first 100 rows, report whether the file named in 'path' exists
for png_path in df_ecg['path'].head(100):
    print(f"Path: {png_path} Exists: {os.path.exists(png_path)}")
