## Preprocess data to get training and testing data

ECG leads (12 in total): I, II, III, aVL, aVR, aVF, V1–V6

In [1]:
# Import package
import time
import numpy as np
import wfdb
import ast
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pylab import mpl
from scipy.fftpack import fft, ifft 
from scipy import signal
# from biosppy.signals import ecg
import neurokit2 as nk
from sklearn import *
from collections import OrderedDict

In [2]:
#Set the read file path
# Hard-coded, machine-specific directory; every CSV/NPY file below is read relative to it.
path = '/global/D1/homes/jayao/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2/ptbxl/'

# Pre-extracted signal array. The file name suggests 100 Hz recordings restricted
# to sex == 0 -- TODO confirm how this file was generated.
# NOTE(review): allow_pickle=True can execute arbitrary code from a crafted file;
# only load files you produced yourself.
X = np.load(path + 'raw100_sex0.npy', allow_pickle=True)
# Presumably the sampling rate in Hz; not referenced by the visible cells.
sampling_rate = 100

In [3]:
# NOTE(review): `df` is not referenced again in the visible cells; the same CSV is
# re-read into `Y` (indexed by ecg_id) in the next cell, so this read looks redundant.
df = pd.read_csv(path+'ptbxl_database.csv')

In [4]:
# Load the metadata table indexed by ecg_id, then parse every scp_codes cell
# from its string representation into the Python literal it encodes.
Y = pd.read_csv(path + 'ptbxl_database.csv', index_col='ecg_id')
Y['scp_codes'] = Y['scp_codes'].apply(ast.literal_eval)

In [5]:
X.shape

(11356, 1000, 12)

In [6]:
# Get diagnostic information in scp_statements.csv
# The first CSV column becomes the index; aggregate_diagnostic looks SCP codes up in that index.
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)

In [7]:
# Keep only the statement rows flagged diagnostic == 1.
agg_df = agg_df.loc[agg_df['diagnostic'] == 1]

In [8]:
def aggregate_diagnostic(y_dic, statements=None):
    """Map one record's SCP codes to its distinct diagnostic classes.

    Parameters
    ----------
    y_dic : dict
        SCP codes of a single record; only the keys are used.
    statements : pandas.DataFrame, optional
        Table indexed by SCP code with a ``diagnostic_class`` column.
        Defaults to the notebook-level ``agg_df``.

    Returns
    -------
    list
        The distinct ``diagnostic_class`` values of those keys that appear in
        ``statements.index``. Keys missing from the table are ignored. The
        list is sorted so the result is identical from run to run (a bare
        ``list(set(...))`` has no guaranteed order).
    """
    if statements is None:
        statements = agg_df
    known_codes = statements.index
    classes = {
        statements.loc[code].diagnostic_class
        for code in y_dic
        if code in known_codes
    }
    # key=str keeps the sort well-defined even if a class value is not a string.
    return sorted(classes, key=str)

In [9]:
# Attach to every record the list of diagnostic classes that
# aggregate_diagnostic derives from its scp_codes dictionary.
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [10]:
# Filter dataframe Y to include only rows where sex=0
# NOTE(review): later cells index X with masks built from Y_sex0, which assumes
# X (raw100_sex0.npy) holds exactly these rows in the same order -- confirm.
Y_sex0 = Y[Y['sex'] == 0]

In [11]:
Y_sex0.shape

(11356, 44)

In [12]:
# Partition the sex0 records by stratification fold:
# strat_fold <= 8 -> train, strat_fold > 8 -> test.
# (test_fold is assigned but the masks below do not use it.)
test_fold = 10
is_train = Y_sex0.strat_fold <= 8
is_test = Y_sex0.strat_fold > 8

# Train
X_train = X[is_train]
y_train = Y_sex0.loc[is_train, 'diagnostic_superclass']
# Test
X_test = X[is_test]
y_test = Y_sex0.loc[is_test, 'diagnostic_superclass']

print(X_train.shape, y_train.shape)
print(X_test.shape,  y_test.shape)

(9091, 1000, 12) (9091,)
(2265, 1000, 12) (2265,)


In [13]:
# Destination directory for the sex0 train/test arrays.
save_path = '/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/data/sex0/'

# (file name, array) pairs, written in this order; the label Series are
# converted with np.array before being saved.
outputs = [
    ('x_train.npy', X_train),
    ('y_train.npy', np.array(y_train)),
    ('x_test.npy', X_test),
    ('y_test.npy', np.array(y_test)),
]
for file_name, array in outputs:
    np.save(save_path + file_name, array)

## ECG SHaP starts

In [1]:
from tensorflow.keras import layers, optimizers, losses, metrics, activations, regularizers, callbacks
from keras.models import Model
import numpy as np
import pandas as pd
from tensorflow.keras.layers import LSTM

In [2]:
# path = "/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/data/sex0/"
# Hard-coded, machine-specific Windows directory containing the four .npy files loaded below.
path = 'D:\\Test Jupyter\\ECG-Classfier-main\\code\\Test ECG\\sex0\\'


# Signals are loaded as plain arrays; the label files are loaded with
# allow_pickle=True (presumably because they hold Python lists of class names
# stored as an object array -- confirm against the cell that saved them).
# NOTE(review): allow_pickle=True can execute arbitrary code from a crafted file.
x_train = np.load(path + 'x_train.npy')
y_train = np.load(path + 'y_train.npy', allow_pickle=True)
x_test  = np.load(path + 'x_test.npy')
y_test  = np.load(path + 'y_test.npy', allow_pickle=True)
print(x_train.shape)

(9091, 1000, 12)


In [3]:
# Swap the last two axes so the 12-wide axis comes before the 1000-wide one:
# (N, 1000, 12) -> (N, 12, 1000).
x_train = np.transpose(x_train, axes=(0, 2, 1))
x_test = np.transpose(x_test, axes=(0, 2, 1))
print(x_train.shape, x_test.shape, sep='\n')

(9091, 12, 1000)
(2265, 12, 1000)


In [4]:
# Add a trailing channel axis of size 1: (N, 12, 1000) -> (N, 12, 1000, 1).
# The leading dimensions are read from each array instead of being hard-coded
# (previously 9091 / 2265), so the cell keeps working when the number of
# samples changes. Using shape[:3] also makes the cell safe to re-run: an
# array that already has the channel axis is reshaped to its own shape.
x_train = x_train.reshape(x_train.shape[:3] + (1,))
x_test  = x_test.reshape(x_test.shape[:3] + (1,))

In [5]:
print("x_train :", x_train.shape)
print("y_train :", y_train.shape)
print("x_test  :", x_test.shape)
print("y_test  :", y_test.shape)
print('Data loaded')

# Old OUTPUTS:
# (19601, 1000, 12)
# (19601, 12, 1000)
# x_train : (19601, 12, 1000, 1)
# y_train : (19601,)
# x_test  : (2198, 12, 1000, 1)
# y_test  : (2198,)
# Data loaded

x_train : (9091, 12, 1000, 1)
y_train : (9091,)
x_test  : (2265, 12, 1000, 1)
y_test  : (2265,)
Data loaded


In [6]:
x_test.shape

(2265, 12, 1000, 1)

In [7]:

from sklearn.preprocessing import MultiLabelBinarizer
# Convert the multi-label targets to a binary indicator matrix (one column per class).
# The binarizer is fitted on the training labels and then reused, unchanged, for the test labels.
# NOTE: y_train / y_test are overwritten in place, so re-running this cell without
# reloading the labels would binarize the already-binarized matrices.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)
print("Classes:", mlb.classes_)

Classes: ['CD' 'HYP' 'MI' 'NORM' 'STTC']


In [8]:
value_at_index = y_train[6666]
print(value_at_index)

[1 0 1 0 0]


In [9]:
y_train.shape

(9091, 5)

In [10]:
y_test.shape

(2265, 5)

In [11]:
# # ST-CNN
# # Main Version
# input = layers.Input(shape=(12, 1000, 1))

# X = layers.Conv2D(filters=32, kernel_size=(1, 5))(input)
# X = layers.BatchNormalization()(X)
# X = layers.ReLU()(X)
# X = layers.MaxPooling2D(pool_size=(1, 2), strides=1)(X)

# convC1 = layers.Conv2D(filters=64, kernel_size=(1, 7))(X)

# X = layers.Conv2D(filters=32, kernel_size=(1, 5))(X)
# X = layers.BatchNormalization()(X)
# X = layers.ReLU()(X)
# X = layers.MaxPooling2D(pool_size=(1, 4), strides=1)(X)

# convC2 = layers.Conv2D(filters=64, kernel_size=(1, 6))(convC1)

# X = layers.Conv2D(filters=64, kernel_size=(1, 5))(X)
# X = layers.BatchNormalization()(X)
# X = layers.Add()([convC2, X])           # skip Connection
# X = layers.ReLU()(X)
# X = layers.MaxPooling2D(pool_size=(1, 2), strides=1)(X)

# convE1 = layers.Conv2D(filters=32, kernel_size=(1, 4))(X)

# X = layers.Conv2D(filters=64, kernel_size=(1, 3))(X)
# X = layers.BatchNormalization()(X)
# X = layers.ReLU()(X)
# X = layers.MaxPooling2D(pool_size=(1, 4), strides=1)(X)

# convE2 = layers.Conv2D(filters=64, kernel_size=(1, 5))(convE1)

# X = layers.Conv2D(filters=64, kernel_size=(1, 3))(X)
# X = layers.BatchNormalization()(X)
# X = layers.Add()([convE2, X])         # skip Connection
# X = layers.ReLU()(X)
# X = layers.MaxPooling2D(pool_size=(1, 2), strides=1)(X)
# print('Added 5 layers for temporal analysis')

# X = layers.Conv2D(filters=64, kernel_size=(12, 1))(X)
# X = layers.BatchNormalization()(X)
# X = layers.ReLU()(X)
# X = layers.GlobalAveragePooling2D()(X)
# print('Added 1 layer for spatial Analysis')

# X = layers.Flatten()(X)
# print(X.shape)

# X = layers.Dense(units=128, kernel_regularizer=regularizers.L2(0.005))(X)
# X = layers.BatchNormalization()(X)
# X = layers.ReLU()(X)
# X = layers.Dropout(rate=0.1)(X)

# X = layers.Dense(units=64, kernel_regularizer=regularizers.L2(0.009))(X)
# X = layers.BatchNormalization()(X)
# X = layers.ReLU()(X)
# X = layers.Dropout(rate=0.15)(X)
# print('Added 2 fully connected layers')

# output = layers.Dense(5, activation='sigmoid')(X)
# model = Model(inputs=input, outputs=output)
# print(model.summary())

Added 5 layers for temporal analysis
Added 1 layer for spatial Analysis
(None, 64)
Added 2 fully connected layers


None


Hyperparameter tuning (training callbacks):
EarlyStopping monitors a specified metric, here "val_loss".
If val_loss does not improve for the number of epochs given by `patience` (6 in this case), training is stopped early.
With restore_best_weights=True, the model's weights are then restored to those of the epoch with the best val_loss.

Learning rate reduction:
ReduceLROnPlateau also monitors the validation loss ("val_loss").
If the validation loss does not improve for the number of epochs given by `patience` (3 in this case), the learning rate is multiplied by `factor`; `factor` is not set in this notebook, so the Keras default is used.

In [12]:
#Enhanced ST-CNN
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Model


def _conv_bn_relu(tensor, filters, kernel_size):
    """Apply Conv2D (padding='same') -> BatchNormalization -> ReLU to ``tensor``."""
    tensor = layers.Conv2D(filters=filters, kernel_size=kernel_size, padding='same')(tensor)
    tensor = layers.BatchNormalization()(tensor)
    return layers.ReLU()(tensor)


def _temporal_max_pool(tensor, width):
    """Max-pool with window (1, width), stride 1 and padding='same'."""
    return layers.MaxPooling2D(pool_size=(1, width), strides=1, padding='same')(tensor)


def _dense_bn_relu_dropout(tensor, units, l2, rate):
    """Apply Dense (L2 kernel regularizer) -> BatchNormalization -> ReLU -> Dropout."""
    tensor = layers.Dense(units=units, kernel_regularizer=regularizers.L2(l2))(tensor)
    tensor = layers.BatchNormalization()(tensor)
    tensor = layers.ReLU()(tensor)
    return layers.Dropout(rate=rate)(tensor)


# Main Version
# Layers are created in exactly the same order as before, so auto-generated
# layer names are unchanged. Every convolution and pooling layer below uses
# padding='same' (pooling with stride 1), so the (12, 1000) grid is preserved
# up to the global pooling step -- this is what lets the Add() layers combine
# the side branches with the main path.
# NOTE: `input` shadows the Python builtin; the name is kept because other
# cells may refer to it.
input = layers.Input(shape=(12, 1000, 1))

X = _conv_bn_relu(input, 32, (1, 5))
X = _temporal_max_pool(X, 2)

# Side branch C: two plain convolutions, merged back in at residual_1.
convC1 = layers.Conv2D(filters=64, kernel_size=(1, 7), padding='same')(X)

X = _conv_bn_relu(X, 32, (1, 5))
X = _temporal_max_pool(X, 4)

convC2 = layers.Conv2D(filters=64, kernel_size=(1, 6), padding='same')(convC1)

X = _conv_bn_relu(X, 64, (1, 5))
residual_1 = layers.Add()([convC2, X])           # skip Connection
X = layers.ReLU()(residual_1)
X = _temporal_max_pool(X, 2)

# Side branch E: two plain convolutions, merged back in at residual_2.
convE1 = layers.Conv2D(filters=32, kernel_size=(1, 4), padding='same')(X)

X = _conv_bn_relu(X, 64, (1, 3))
X = _temporal_max_pool(X, 4)

convE2 = layers.Conv2D(filters=64, kernel_size=(1, 5), padding='same')(convE1)

X = _conv_bn_relu(X, 64, (1, 3))
residual_2 = layers.Add()([convE2, X])         # skip Connection
X = layers.ReLU()(residual_2)
X = _temporal_max_pool(X, 2)
print('Added 5 layers for temporal analysis')

# Spatial Analysis: a (12, 1) kernel spans the 12-wide axis.
X = _conv_bn_relu(X, 64, (12, 1))
X = layers.GlobalAveragePooling2D()(X)
print('Added 1 layer for spatial Analysis')

# Fully Connected Layers
# Flatten is kept so the layer list stays the same as in the saved model.
X = layers.Flatten()(X)
X = _dense_bn_relu_dropout(X, 128, 0.005, 0.3)
X = _dense_bn_relu_dropout(X, 64, 0.009, 0.3)
print('Added 2 fully connected layers')

# Output Layer: 5 independent sigmoid units (multi-label output).
output = layers.Dense(5, activation='sigmoid')(X)

# Define the model
model = Model(inputs=input, outputs=output)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the cell output.
model.summary()


Added 5 layers for temporal analysis
Added 1 layer for spatial Analysis
Added 2 fully connected layers


None


In [13]:
# Callback references:
#   https://keras.io/api/callbacks/
#   https://towardsdatascience.com/checkpointing-deep-learning-models-in-keras-a652570b8de6

# Both callbacks watch val_loss: training stops after 6 epochs without
# improvement (restoring the best weights), and the learning rate is reduced
# after 3 epochs without improvement.
early = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=6,
    restore_best_weights=True,
)
reducelr = callbacks.ReduceLROnPlateau(monitor="val_loss", patience=3)
callback = [early, reducelr]

# Enhanced ST-CNN settings (the commented-out ST-CNN variant used learning_rate=0.005).
adam = optimizers.Adam(learning_rate=0.001)
bce = losses.BinaryCrossentropy()
tracked_metrics = [metrics.BinaryAccuracy(), metrics.AUC(curve='ROC', multi_label=True)]
model.compile(optimizer=adam, loss=bce, metrics=tracked_metrics)

fit_options = dict(validation_split=0.12, epochs=20, batch_size=64, callbacks=callback)
history = model.fit(x_train, y_train, **fit_options)

Epoch 1/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1344s[0m 11s/step - auc: 0.6746 - binary_accuracy: 0.6515 - loss: 1.5701 - val_auc: 0.5802 - val_binary_accuracy: 0.7358 - val_loss: 0.9936 - learning_rate: 0.0010
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1203s[0m 10s/step - auc: 0.8321 - binary_accuracy: 0.8211 - loss: 0.7289 - val_auc: 0.7113 - val_binary_accuracy: 0.7357 - val_loss: 0.7168 - learning_rate: 0.0010
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1328s[0m 11s/step - auc: 0.8646 - binary_accuracy: 0.8443 - loss: 0.4988 - val_auc: 0.7925 - val_binary_accuracy: 0.7690 - val_loss: 0.5845 - learning_rate: 0.0010
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1490s[0m 12s/step - auc: 0.8884 - binary_accuracy: 0.8597 - loss: 0.4040 - val_auc: 0.8690 - val_binary_accuracy: 0.8183 - val_loss: 0.4722 - learning_rate: 0.0010
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [14]:
# save_path = "/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/model/"
# Output directory for the trained model (hard-coded, machine-specific Windows path).
# This reassigns the same `path` variable that the data-loading cell uses.
path = 'D:\\Test Jupyter\\ECG-Classfier-main\\code\\Test ECG\\sex0\\'
# model.save(save_path + "ST-CNN-5_final1.h5")
# Write the trained model to an .h5 file inside that directory.
model.save(path + "ST-CNN-5_final1_new_sex0.h5")



In [41]:
# from tensorflow.keras.models import load_model
# model = load_model(r'/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/model/ST-CNN-5_final1_sex0.h5')

In [14]:
# #ST-CNN
# y_pred_train = model.predict(x_train)
# y_pred_test  = model.predict(x_test)

# from sklearn.metrics import classification_report, precision_recall_curve, f1_score, roc_auc_score, accuracy_score, auc
# import numpy as np

# def sklearn_metrics(y_true, y_pred, mlb):
#     y_bin = np.copy(y_pred)
#     y_bin[y_bin >= 0.5] = 1
#     y_bin[y_bin < 0.5]  = 0

# #     print("y_train shape:", y_true.shape)
# # p   print("y_test shape :", y_pred.shape)


#     # Compute area under precision-Recall curve
#     auc_sum = 0
#     for i in range(y_true.shape[1]):
#         precision, recall, thresholds = precision_recall_curve(y_true[:, i], y_pred[:, i])
#         auc_sum += auc(recall, precision)

#     print("Accuracy        : {:.2f}".format(accuracy_score(y_true.flatten(), y_bin.flatten()) * 100))
#     print("Macro AUC score : {:.2f}".format(roc_auc_score(y_true, y_pred, average='macro') * 100))
#     print('AUROC           : {:.2f}'.format((auc_sum / y_true.shape[1]) * 100))
#     print("Micro F1 score  : {:.2f}".format(f1_score(y_true, y_bin, average='micro') * 100))

#     # Convert binary predictions back to class labels using MultiLabelBinarizer
#     predicted_classes = mlb.inverse_transform(y_bin)

#     # Use a set to accumulate all distinct classes
#     distinct_classes = set()

#     # Iterate over predicted classes and add them to the set
#     for classes in predicted_classes:
#         distinct_classes.update(classes)

#     # Convert the set of distinct classes to a sorted list
#     class_names = sorted(list(distinct_classes))

#     # Print classification report for each class
#     print("\nClassification Report:")
#     print(classification_report(y_true, y_bin, target_names=class_names))

# # Assuming mlb is the MultiLabelBinarizer used for transforming the labels
# sklearn_metrics(y_test, y_pred_test, mlb)


[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 637ms/step
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 640ms/step
Accuracy        : 89.13
Macro AUC score : 92.85
AUROC           : 81.21
Micro F1 score  : 77.95

Classification Report:
              precision    recall  f1-score   support

          CD       0.81      0.71      0.76       532
         HYP       0.70      0.36      0.47       294
          MI       0.83      0.75      0.79       608
        NORM       0.83      0.92      0.87      1001
        STTC       0.78      0.67      0.72       473

   micro avg       0.81      0.75      0.78      2908
   macro avg       0.79      0.68      0.72      2908
weighted avg       0.81      0.75      0.77      2908
 samples avg       0.79      0.76      0.76      2908



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
#Enhanced ST-CNN
# Model outputs for both splits (one score per class and sample).
# NOTE(review): y_pred_train is not used by the visible cells -- only
# y_pred_test is evaluated below -- yet predicting on the training set is the
# slower of the two calls according to the captured output.
y_pred_train = model.predict(x_train)
y_pred_test  = model.predict(x_test)

from sklearn.metrics import classification_report, precision_recall_curve, f1_score, roc_auc_score, accuracy_score, auc
import numpy as np

def sklearn_metrics(y_true, y_pred, mlb):
    """Print multi-label evaluation metrics and a per-class classification report.

    Parameters
    ----------
    y_true : numpy.ndarray of shape (n_samples, n_classes)
        Binary indicator matrix of the true labels.
    y_pred : numpy.ndarray of shape (n_samples, n_classes)
        Predicted scores; a score >= 0.5 counts as a positive prediction.
    mlb : fitted MultiLabelBinarizer
        Its ``classes_`` attribute supplies the row names of the report, in
        the same order as the columns of ``y_true`` / ``y_pred``.

    Returns
    -------
    None
        All results are printed.
    """
    # Binarize the scores at the 0.5 threshold.
    y_bin = (np.asarray(y_pred) >= 0.5).astype(int)
    n_classes = y_true.shape[1]

    # Mean over classes of the area under each precision-recall curve.
    pr_auc_sum = 0
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true[:, i], y_pred[:, i])
        pr_auc_sum += auc(recall, precision)

    # "Accuracy" is computed over the flattened matrices, i.e. per individual
    # label decision rather than per sample.
    print("Accuracy        : {:.2f}".format(accuracy_score(y_true.flatten(), y_bin.flatten()) * 100))
    print("Macro AUC score : {:.2f}".format(roc_auc_score(y_true, y_pred, average='macro') * 100))
    # This value is the precision-recall AUC; it used to be printed as "AUROC".
    print('AUPRC           : {:.2f}'.format((pr_auc_sum / n_classes) * 100))
    print("Micro F1 score  : {:.2f}".format(f1_score(y_true, y_bin, average='micro') * 100))

    # Take the report's class names from the binarizer itself. Deriving them
    # from the predicted labels yields too few names whenever some class is
    # never predicted, and classification_report then raises ValueError.
    class_names = [str(name) for name in mlb.classes_]

    # Print classification report for each class
    print("\nClassification Report:")
    print(classification_report(y_true, y_bin, target_names=class_names))

# Assuming mlb is the MultiLabelBinarizer used for transforming the labels
sklearn_metrics(y_test, y_pred_test, mlb)


[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 867ms/step
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 862ms/step
Accuracy        : 89.13
Macro AUC score : 93.22
AUROC           : 82.38
Micro F1 score  : 78.30

Classification Report:
              precision    recall  f1-score   support

          CD       0.78      0.76      0.77       532
         HYP       0.73      0.42      0.54       294
          MI       0.83      0.74      0.78       608
        NORM       0.83      0.90      0.87      1001
        STTC       0.75      0.73      0.74       473

   micro avg       0.80      0.76      0.78      2908
   macro avg       0.79      0.71      0.74      2908
weighted avg       0.80      0.76      0.78      2908
 samples avg       0.79      0.77      0.76      2908



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
