## Preprocess data to get training and testing data

In [1]:
# Import package
import time
import numpy as np
import wfdb
import ast
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pylab import mpl
from scipy.fftpack import fft, ifft 
from scipy import signal
# from biosppy.signals import ecg
import neurokit2 as nk
from sklearn import *
from collections import OrderedDict

In [2]:
#Set the read file path
path = '/global/D1/homes/jayao/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2/ptbxl/'

X = np.load(path + 'raw100.npy', allow_pickle=True)
sampling_rate = 100

In [3]:
X.shape

(21801, 1000, 12)

In [4]:
import numpy as np

# Extracting values from the third column (Lead 3)
X2 = X[:, :, 2]


In [5]:
# Read the file and convert tags
Y = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

In [6]:
X2.shape

(21801, 1000)

In [7]:
# Get diagnostic information in scp_statements.csv
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)

In [8]:
agg_df = agg_df[agg_df.diagnostic == 1]

In [9]:
def diagnostic_class(scp):
    res = set()
    for k in scp.keys():
        if k in agg_df.index:
            res.add(agg_df.loc[k].diagnostic_class)
    return list(res)

In [10]:
def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

In [11]:
Y['scp_classes'] = Y.scp_codes.apply(diagnostic_class)

In [12]:
Z = pd.DataFrame(0, index=Y.index, columns=['NORM', 'MI', 'STTC', 'CD', 'HYP'], dtype='int')
for i in Z.index:
    for k in Y.loc[i].scp_classes:
        Z.loc[i, k] = 1

Z

Unnamed: 0_level_0,NORM,MI,STTC,CD,HYP
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
5,1,0,0,0,0
...,...,...,...,...,...
21833,0,0,1,0,0
21834,1,0,0,0,0
21835,0,0,1,0,0
21836,1,0,0,0,0


In [13]:
#Add diagnostic information
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [14]:
Y.diagnostic_superclass.value_counts()

diagnostic_superclass
[NORM]                 9072
[MI]                   2532
[STTC]                 2401
[CD]                   1708
[CD, MI]               1300
[HYP, STTC]             781
[STTC, MI]              600
[HYP]                   535
[CD, STTC]              471
[NORM, CD]              407
[]                      405
[HYP, STTC, MI]         361
[HYP, CD]               300
[CD, STTC, MI]          223
[HYP, MI]               183
[HYP, CD, MI]           117
[HYP, STTC, CD]         109
[CD, HYP, STTC, MI]      99
[HYP, CD, STTC]          98
[HYP, CD, STTC, MI]      54
[NORM, STTC]             28
[NORM, CD, STTC]          5
[STTC, HYP, CD]           4
[STTC, HYP, CD, MI]       3
[NORM, HYP, CD]           2
[NORM, HYP]               2
[NORM, HYP, CD, MI]       1
Name: count, dtype: int64

In [15]:
unique_values = Y['diagnostic_superclass'].apply(tuple).unique()
print(unique_values)


[('NORM',) ('MI',) () ('STTC',) ('HYP',) ('CD',) ('STTC', 'MI')
 ('HYP', 'CD') ('CD', 'MI') ('CD', 'STTC') ('HYP', 'MI')
 ('HYP', 'STTC', 'MI') ('HYP', 'CD', 'STTC', 'MI') ('HYP', 'STTC')
 ('NORM', 'CD') ('CD', 'STTC', 'MI') ('HYP', 'STTC', 'CD')
 ('CD', 'HYP', 'STTC', 'MI') ('HYP', 'CD', 'STTC')
 ('STTC', 'HYP', 'CD', 'MI') ('NORM', 'STTC') ('HYP', 'CD', 'MI')
 ('NORM', 'CD', 'STTC') ('NORM', 'HYP', 'CD') ('NORM', 'HYP')
 ('STTC', 'HYP', 'CD') ('NORM', 'HYP', 'CD', 'MI')]


In [16]:
# Split data into train and test
test_fold = 10
# # Train
X_train = X2[(Y.strat_fold <= 8)]
# y_train = Z[Y.strat_fold <= 8]
y_train = Y[(Y.strat_fold <= 8)].diagnostic_superclass
# # Test
X_test = X2[(Y.strat_fold >8)]
# y_test = Z[Y.strat_fold > 8]
y_test = Y[(Y.strat_fold > 8)].diagnostic_superclass


print(X_train.shape, y_train.shape)
print(X_test.shape,  y_test.shape)

(17420, 1000) (17420,)
(4381, 1000) (4381,)


In [17]:
save_path = '/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/data/lead3/'

np.save(save_path+'x_train.npy', X_train)
np.save(save_path+'y_train.npy', np.array(y_train))
np.save(save_path+'x_test.npy', X_test)
np.save(save_path+'y_test.npy', np.array(y_test))

## ECG SHaP starts

In [2]:
from tensorflow.keras import layers, optimizers, losses, metrics, activations, regularizers, callbacks
from keras.models import Model
import numpy as np
import pandas as pd
from tensorflow.keras.layers import LSTM

2024-02-21 15:52:22.788005: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-21 15:52:22.788055: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-21 15:52:22.789021: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-21 15:52:22.869892: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
path = "/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/data/lead3/"
x_train = np.load(path + 'x_train.npy')
y_train = np.load(path + 'y_train.npy', allow_pickle=True)
x_test  = np.load(path + 'x_test.npy')
y_test  = np.load(path + 'y_test.npy', allow_pickle=True)
print(x_train.shape)
print(x_test.shape)

(17420, 1000)
(4381, 1000)


In [4]:
x_train = x_train.reshape((17420, 1000, 1))
x_test = x_test.reshape((4381, 1000, 1))
print(x_train.shape)
print(x_test.shape)

(17420, 1000, 1)
(4381, 1000, 1)


In [5]:
x_train = x_train.transpose(0, 2, 1)            # transpose working correctly
x_test  = x_test.transpose(0, 2, 1)
print(x_train.shape)
print(x_test.shape)

(17420, 1, 1000)
(4381, 1, 1000)


In [6]:
x_train = x_train.reshape(17420, 1, 1000, 1)   # Add another channel
x_test  = x_test.reshape(4381, 1, 1000, 1)

In [7]:
print("x_train :", x_train.shape)
print("y_train :", y_train.shape)
print("x_test  :", x_test.shape)
print("y_test  :", y_test.shape)
print('Data loaded')

# Old OUTPUTS:
# (19601, 1000, 12)
# (19601, 12, 1000)
# x_train : (19601, 12, 1000, 1)
# y_train : (19601,)
# x_test  : (2198, 12, 1000, 1)
# y_test  : (2198,)
# Data loaded

x_train : (17420, 1, 1000, 1)
y_train : (17420,)
x_test  : (4381, 1, 1000, 1)
y_test  : (4381,)
Data loaded


In [8]:
# x_train = x_train[:2000]
# x_test = x_test[:500]
# y_train = y_train[:2000]
# y_test = y_test[:500]

In [9]:
x_test.shape

(4381, 1, 1000, 1)

In [10]:

from sklearn.preprocessing import MultiLabelBinarizer
# Convert multi-label target labels to one-hot encoded matrix
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)
print("Classes:", mlb.classes_)

Classes: ['CD' 'HYP' 'MI' 'NORM' 'STTC']


In [11]:
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Model

# Main Version
input = layers.Input(shape=(1, 1000, 1))

X = layers.Conv2D(filters=32, kernel_size=(1, 5), padding='same')(input)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
X = layers.MaxPooling2D(pool_size=(1, 2), strides=1, padding='same')(X)

convC1 = layers.Conv2D(filters=64, kernel_size=(1, 7), padding='same')(X)

X = layers.Conv2D(filters=32, kernel_size=(1, 5), padding='same')(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
X = layers.MaxPooling2D(pool_size=(1, 4), strides=1, padding='same')(X)

convC2 = layers.Conv2D(filters=64, kernel_size=(1, 6), padding='same')(convC1)

X = layers.Conv2D(filters=64, kernel_size=(1, 5), padding='same')(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
residual_1 = layers.Add()([convC2, X])           # skip Connection
X = layers.ReLU()(residual_1)
X = layers.MaxPooling2D(pool_size=(1, 2), strides=1, padding='same')(X)

convE1 = layers.Conv2D(filters=32, kernel_size=(1, 4), padding='same')(X)

X = layers.Conv2D(filters=64, kernel_size=(1, 3), padding='same')(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
X = layers.MaxPooling2D(pool_size=(1, 4), strides=1, padding='same')(X)

convE2 = layers.Conv2D(filters=64, kernel_size=(1, 5), padding='same')(convE1)

X = layers.Conv2D(filters=64, kernel_size=(1, 3), padding='same')(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
residual_2 = layers.Add()([convE2, X])         # skip Connection
X = layers.ReLU()(residual_2)
X = layers.MaxPooling2D(pool_size=(1, 2), strides=1, padding='same')(X)
print('Added 5 layers for temporal analysis')

# Spatial Analysis
X = layers.Conv2D(filters=64, kernel_size=(12, 1), padding='same')(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
X = layers.GlobalAveragePooling2D()(X)
print('Added 1 layer for spatial Analysis')

# Fully Connected Layers
X = layers.Flatten()(X)
X = layers.Dense(units=128, kernel_regularizer=regularizers.L2(0.005))(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
X = layers.Dropout(rate=0.3)(X)

X = layers.Dense(units=64, kernel_regularizer=regularizers.L2(0.009))(X)
X = layers.BatchNormalization()(X)
X = layers.ReLU()(X)
X = layers.Dropout(rate=0.3)(X)
print('Added 2 fully connected layers')

# Output Layer
output = layers.Dense(5, activation='sigmoid')(X)

# Define the model
model = Model(inputs=input, outputs=output)
print(model.summary())


2024-02-21 15:52:31.804288: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1457 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:5e:00.0, compute capability: 7.5


Added 5 layers for temporal analysis
Added 1 layer for spatial Analysis
Added 2 fully connected layers
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1, 1000, 1)]         0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 1, 1000, 32)          192       ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, 1, 1000, 32)          128       ['conv2d[0][0]']              
 Normalization)                                                                                   
                                                                                          

In [12]:

early    = callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)
reducelr = callbacks.ReduceLROnPlateau(monitor="val_loss", patience=3)

callback = [early, reducelr]

model.compile(optimizer = optimizers.Adam(learning_rate=0.001),
              loss = losses.BinaryCrossentropy(),
              metrics = [metrics.BinaryAccuracy(), metrics.AUC(curve='ROC', multi_label=True)])

history = model.fit(x_train, y_train, validation_split=0.12, epochs=20, batch_size=64, callbacks=callback)
# history = model.fit(x_train, y_train, validation_split=0.10, epochs=25, batch_size=64, callbacks=callback)

Epoch 1/20


2024-02-21 15:52:36.762976: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8600
2024-02-21 15:52:36.966389: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-02-21 15:52:37.147297: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-02-21 15:52:37.458201: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.05GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-02-21 15:52:39.970386: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1551340b8870 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-21 15:52:39.970430: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
save_path = "/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/model/"
model.save(save_path + "ST-CNN-5_lead3new.h5")

  saving_api.save_model(


In [14]:
# Assuming you have a trained model 'model'
# Make predictions on the test set
y_pred = model.predict(x_test)

# Assuming y_pred is in the probability format (values between 0 and 1)
# Convert the probabilities to binary predictions using a threshold (e.g., 0.5)
y_pred_binary = (y_pred >= 0.5).astype(int)

# Convert the binary predictions back to class labels using MultiLabelBinarizer
predicted_classes = mlb.inverse_transform(y_pred_binary)

# Display the predicted classes for the first few samples
for i in range(5):
    print(f"Sample {i + 1}: Predicted Classes - {predicted_classes[i]}")


Sample 1: Predicted Classes - ()
Sample 2: Predicted Classes - ('NORM',)
Sample 3: Predicted Classes - ('NORM',)
Sample 4: Predicted Classes - ('CD',)
Sample 5: Predicted Classes - ()


In [15]:
# Assuming you have a trained model 'model'
# Make predictions on the test set
y_pred = model.predict(x_test)

# Assuming y_pred is in the probability format (values between 0 and 1)
# Convert the probabilities to binary predictions using a threshold (e.g., 0.5)
y_pred_binary = (y_pred >= 0.5).astype(int)

# Convert the binary predictions back to class labels using MultiLabelBinarizer
predicted_classes = mlb.inverse_transform(y_pred_binary)

# Use a set to accumulate all distinct classes
distinct_classes = set()

# Iterate over predicted classes and add them to the set
for classes in predicted_classes:
    distinct_classes.update(classes)

# Display all distinct classes
print("All Distinct Classes:", distinct_classes)


All Distinct Classes: {'HYP', 'CD', 'NORM', 'STTC', 'MI'}


In [16]:
from tensorflow.keras.models import load_model
model = load_model(r'/global/D1/homes/jayao/XAI-Based-ECG-Diagnostics-main/model/ST-CNN-5_lead3new.h5')
y_pred_train = model.predict(x_train)
y_pred_test  = model.predict(x_test)



In [17]:
from sklearn.metrics import classification_report, precision_recall_curve, f1_score, roc_auc_score, accuracy_score, auc
import numpy as np

def sklearn_metrics(y_true, y_pred, mlb):
    y_bin = np.copy(y_pred)
    y_bin[y_bin >= 0.5] = 1
    y_bin[y_bin < 0.5]  = 0

    # Compute area under precision-Recall curve
    auc_sum = 0
    for i in range(y_true.shape[1]):
        precision, recall, thresholds = precision_recall_curve(y_true[:, i], y_pred[:, i])
        auc_sum += auc(recall, precision)

    print("Accuracy        : {:.2f}".format(accuracy_score(y_true.flatten(), y_bin.flatten()) * 100))
    print("Macro AUC score : {:.2f}".format(roc_auc_score(y_true, y_pred, average='macro') * 100))
    print('AUROC           : {:.2f}'.format((auc_sum / y_true.shape[1]) * 100))
    print("Micro F1 score  : {:.2f}".format(f1_score(y_true, y_bin, average='micro') * 100))

    # Convert binary predictions back to class labels using MultiLabelBinarizer
    predicted_classes = mlb.inverse_transform(y_bin)

    # Use a set to accumulate all distinct classes
    distinct_classes = set()

    # Iterate over predicted classes and add them to the set
    for classes in predicted_classes:
        distinct_classes.update(classes)

    # Convert the set of distinct classes to a sorted list
    class_names = sorted(list(distinct_classes))

    # Print classification report for each class
    print("\nClassification Report:")
    print(classification_report(y_true, y_bin, target_names=class_names))

# Assuming mlb is the MultiLabelBinarizer used for transforming the labels
sklearn_metrics(y_test, y_pred_test, mlb)


Accuracy        : 81.10
Macro AUC score : 79.11
AUROC           : 57.02
Micro F1 score  : 56.02

Classification Report:
              precision    recall  f1-score   support

          CD       0.79      0.44      0.57       992
         HYP       0.64      0.03      0.05       530
          MI       0.70      0.40      0.51      1092
        NORM       0.68      0.83      0.75      1919
        STTC       0.56      0.15      0.23      1049

   micro avg       0.69      0.47      0.56      5582
   macro avg       0.67      0.37      0.42      5582
weighted avg       0.68      0.47      0.51      5582
 samples avg       0.56      0.51      0.52      5582



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Accuracy        : 80.38
# Macro AUC score : 76.58
# AUROC           : 53.55
# Micro F1 score  : 51.97

# Classification Report:
#               precision    recall  f1-score   support

#           CD       0.79      0.36      0.49       992
#          HYP       0.50      0.00      0.01       530
#           MI       0.73      0.34      0.46      1092
#         NORM       0.67      0.80      0.73      1919
#         STTC       0.54      0.06      0.11      1049

#    micro avg       0.69      0.42      0.52      5582
#    macro avg       0.65      0.31      0.36      5582
# weighted avg       0.66      0.42      0.45      5582
#  samples avg       0.51      0.46      0.47      5582