In [None]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
# Raise pandas' display limits so wide/long frames are not truncated in the output.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("max_colwidth", 200)
# `HTML` and `display` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


import os

# R_HOME is set before rpy2 is imported — presumably so rpy2 binds to this R
# installation; keep this ordering.
os.environ["R_HOME"] = "/root/miniconda3/envs/R/lib/R"

import rpy2.robjects as objects
from rpy2.robjects.packages import importr

base = importr("base")
r_pROC = importr("pROC")
# NOTE(review): looks like a sanity check of the first R library path; the value
# is not displayed because this is not the last expression of the cell.
base._libPaths()[0]


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    The summary lists, for every column, its name, the number of null
    values, the number of unique values and its dtype. It is printed as a
    ``psql``-style table (via ``tabulate``) after a banner showing the
    frame's shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``.
    """
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    # One entry per column of `df`, all four lists in the same column order.
    summary = pd.DataFrame(
        {
            "Name": df.columns.values.tolist(),
            "Null": df.isnull().sum().values.tolist(),
            "Unique": df.nunique().values.tolist(),
            "Dtypes": df.dtypes.tolist(),
        }
    )
    print(tabulate(summary, headers="keys", tablefmt="psql"))
    return df.head()

### Convert ECG to PARQUET

In [None]:
# Read the ECG metadata CSV and discard the "Unnamed: 0" column
# (presumably an index that was saved along with the data).
ecg_csv_path = (
    "/media/data1/ravram/DeepECG/ekg_waveforms_output/df_xml_2023_03_14_n_1633856.csv"
)
df_ecg = pd.read_csv(ecg_csv_path).drop(columns=["Unnamed: 0"])

In [None]:
# Cast these columns to str (presumably so each holds a single type for the
# parquet conversion — TODO confirm).
# Every column is cast from its OWN values: the software-version column must not
# be filled from the PatientID column.
for col in (
    "RestingECG_PatientDemographics_PatientID",
    "RestingECG_TestDemographics_AnalysisSoftwareVersion",
    "RestingECG_Order_ExtraADTData1",
    "RestingECG_ExtraQuestions_ExtraQuestion_0_Answer",
):
    df_ecg[col] = df_ecg[col].astype(str)

In [None]:
### Convert every column whose name contains "RestingECG_Order" to string,
### printing each affected column name.
order_columns = [name for name in df_ecg.columns if "RestingECG_Order" in name]
for col in order_columns:
    print(col)
    df_ecg[col] = df_ecg[col].astype(str)

In [None]:
### Convert every column whose name contains "RestingECG_ExtraQuestions_" to string,
### printing each affected column name.
extra_question_columns = [
    name for name in df_ecg.columns if "RestingECG_ExtraQuestions_" in name
]
for col in extra_question_columns:
    print(col)
    df_ecg[col] = df_ecg[col].astype(str)

In [None]:
# df_ecg.to_parquet(
#    "/media/data1/ravram/DeepECG/ekg_waveforms_output/df_xml_2023_03_14_n_1633856.parquet"
# )

## CHECK ECGs from df_LQTS

In [None]:
# The file is whitespace-delimited despite its .csv extension. sep=r"\s+" is the
# documented equivalent of the deprecated `delim_whitespace=True`.
df_LQTS = pd.read_csv("data/LQTS/DeepECG-LQT-20230210.csv", sep=r"\s+")
display(df_LQTS.head())

In [None]:
# Reload the ECG metadata from its parquet copy.
ecg_parquet_path = (
    "/media/data1/ravram/DeepECG/ekg_waveforms_output/df_xml_2023_03_14_n_1633856.parquet"
)
df_ecg = pd.read_parquet(ecg_parquet_path)

In [None]:
# Keep only rows whose PatientID consists solely of digits, then store the ID as int.
patient_id_col = "RestingECG_PatientDemographics_PatientID"

# .str.isdigit() yields a missing value for null IDs; .eq(True) maps those to
# False so the mask is purely boolean and can be used for indexing.
has_digit_id = df_ecg[patient_id_col].str.isdigit().eq(True)

# .copy() detaches the filtered frame so the column assignment below does not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
df_ecg = df_ecg[has_digit_id].copy()

# errors="coerce" + fillna(0): a digit-like string that still cannot be parsed
# ends up as ID 0 instead of raising.
df_ecg[patient_id_col] = (
    pd.to_numeric(df_ecg[patient_id_col], errors="coerce").fillna(0).astype(int)
)

##df_ecg = df_ecg['RestingECG_PatientDemographics_PatientID'].astype(int, errors='coerce')

In [None]:
## Cast both ID columns to int so that df_LQTS['Dossier'] can be checked against
## df_ecg['RestingECG_PatientDemographics_PatientID'] in the following cells.
df_LQTS["Dossier"] = df_LQTS["Dossier"].astype(int)
ecg_patient_id = df_ecg["RestingECG_PatientDemographics_PatientID"]
df_ecg["RestingECG_PatientDemographics_PatientID"] = ecg_patient_id.astype(int)

In [None]:
# Flag each df_LQTS row whose Dossier occurs among the ECG patient IDs,
# then show how many rows are flagged True / False.
ecg_patient_ids = df_ecg["RestingECG_PatientDemographics_PatientID"]
df_LQTS["Dossier_exists_in_ecg"] = df_LQTS["Dossier"].isin(ecg_patient_ids)
display(df_LQTS["Dossier_exists_in_ecg"].value_counts())

In [None]:
# Flag each ECG row whose patient ID occurs among the df_LQTS dossiers.
lqts_dossiers = df_LQTS["Dossier"]
df_ecg["Dossier_exists_in_ecg"] = df_ecg[
    "RestingECG_PatientDemographics_PatientID"
].isin(lqts_dossiers)

In [None]:
## Check if file at df_ecg['xml_path'] exists if df_ecg['Dossier_exists_in_ecg'] == True
# df_ecg = df_ecg.loc[df_ecg['Dossier_exists_in_ecg']==True]
# df_ecg['xml_path_exists'] = df_ecg['xml_path'].apply(lambda x: os.path.exists(x))
# df_ecg['xml_path_exists'] = df_ecg['xml_path_exists'].astype(int)

In [None]:
#
# df_ecg.loc[df_ecg['Dossier_exists_in_ecg']==True].to_csv('../lqts/data/ecg_lqts_rafik.csv')
# display(df_ecg['xml_path_exists'].value_counts())

# NOTE: this replaces the in-memory df_ecg with the table stored in the merged CSV.
lqts_merged_csv = "../lqts/data/df_lqts_merged.csv"
df_ecg = pd.read_csv(lqts_merged_csv)
display(df_ecg.head(2))

In [None]:
## Write a function to copy ECGs from df_ecg['xml_path'] to local folder and make 'folder' if it doesn't exist


def copy_ecg_to_local_folder(df_ecg, output_folder="lqts_ecg/"):
    """Copy the XML files of flagged rows into ``output_folder``.

    Only rows whose ``Dossier_exists_in_ecg`` value equals ``True`` are
    processed. For each such row the ``xml_path`` and ``lqts_type`` values are
    printed and the file at ``xml_path`` is copied into ``output_folder``.

    Parameters
    ----------
    df_ecg : pandas.DataFrame
        Must contain the columns ``Dossier_exists_in_ecg``, ``xml_path`` and
        ``lqts_type``.
    output_folder : str
        Destination directory; created (including parents) if missing.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    OSError
        Propagated from ``shutil.copy`` / ``os.makedirs`` (e.g. a source file
        that does not exist).
    """
    import shutil

    from tqdm import tqdm

    # exist_ok=True creates the folder only when needed, without a separate
    # exists() check that could race with another process.
    os.makedirs(output_folder, exist_ok=True)

    # Select the flagged rows once, up front; this also gives tqdm a known
    # total so the progress bar can show completion.
    flagged = df_ecg[df_ecg["Dossier_exists_in_ecg"].eq(True)]

    for _, row in tqdm(flagged.iterrows(), total=len(flagged)):
        print(row["xml_path"])
        print(row["lqts_type"])
        shutil.copy(row["xml_path"], output_folder)

In [None]:
# Trial run on the first ten rows only.
first_ten_rows = df_ecg.iloc[:10]
copy_ecg_to_local_folder(first_ten_rows)

In [None]:
import os
import shutil

import matplotlib.pyplot as plt
## Load the first df_ecg['npy_path'] and determine the shape
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the array stored at the first row's npy_path.
first_npy_path = df_ecg["npy_path"].iloc[0]
ecg_array = np.load(first_npy_path)

In [None]:
# Show the shape of the array loaded from the first npy_path.
display(ecg_array.shape)