In [1]:
import wfdb, wfdb.processing, heartpy
from biosppy.signals import ecg
from numpy import ndarray
from pandas import read_csv as rcsv

import json, re
from time import time as t
from datetime import datetime as dt

from wf_preproc import show_ann_label, get_db, extract_ann, extract_rdheader

EDA on VFDB: https://www.kaggle.com/code/kooaslansefat/eda-on-mit-bih-malignant-ventricular-fibrill-db

In [2]:
# wfdb.io.show_ann_labels()
# wfdb.io.show_ann_classes()

In [3]:
# sample_dataset = rcsv("sample_p10_sensor_outputs.csv")
# (6000 / sample_dataset["HR (bpm)"]).tolist() # 6000 (in miliseconds)

* "vfdb" = **MIT-BIH Malignant Ventricular Ectopy Database**
* "nsrdb" = **MIT-BIH Normal Sinus Rhythm Database**

In [10]:
dataset = get_db(key="vfdb")
db, records = dataset["db"], dataset["records"]

DB_NAME: MIT-BIH Malignant Ventricular Ectopy Database
TOTAL_RECORDS: 22


In [5]:
header = rcsv("data/vfdb-rdheader.csv")
header

Unnamed: 0,r_name,sig_len,sampling_freq,created_at
0,418,525000,250,1990-01-04 00:00:00
1,419,525000,250,1990-01-04 00:00:00
2,420,525000,250,1990-01-04 00:00:00
3,421,525000,250,1990-01-04 00:00:00
4,422,525000,250,1990-01-04 00:00:00
5,423,525000,250,1990-01-04 00:00:00
6,424,525000,250,1990-01-04 00:00:00
7,425,525000,250,1990-01-04 00:00:00
8,426,525000,250,1990-01-04 00:00:00
9,427,525000,250,1990-01-04 00:00:00


In [6]:
annotations = rcsv("data/vfdb-rdann.csv")

# preprocess the annotation label
annotations.annot_aux = annotations.annot_aux.apply(
    lambda x: x.replace('(', '').replace('NSR', 'N').replace('VFIB', 'VF')
)

# set annotations index "from" and "to"
annotations.rename(columns={"annot_idx" : "annot_idx_from"}, inplace=True)
annotations.insert(
    column = "annot_idx_to",
    loc = len(annotations.columns)-1,
    value = annotations.groupby('r_name').annot_idx_from.shift(periods = -1, fill_value = -1).astype(int)
)

# remove last row from each r_name
annotations = annotations[annotations.annot_idx_to != -1]

In [11]:
r = wfdb.rdrecord(records[0], channels=[0, 1], pn_dir=db)
a = wfdb.rdann(record_name=records[0], pn_dir=db, extension='atr')

In [12]:
a.__dict__

{'record_name': '418',
 'extension': 'atr',
 'sample': array([    18,  99624, 101499, 133092, 134038, 135775, 136628, 153057,
        154115, 154942, 156291, 159442, 160516, 169192, 169807, 173054,
        173673, 174788, 175403, 176259, 177868, 190080, 191249, 191807,
        192695, 195631, 196794, 200211, 200634, 216788, 219038, 219961,
        224019, 225355, 226057, 227211, 229269, 231310, 232724, 234499,
        235538, 254230, 255365, 256019, 256884, 257249, 257980, 259557,
        261903, 262749, 263519, 269307, 270999, 271326, 271596, 272057,
        272384, 273673, 275846, 279576, 281384, 289384, 291211, 300480,
        301076, 301525, 301711, 302230, 302538, 302871, 303346, 303641,
        304634, 311442, 311807, 312096, 313480, 313826, 314173, 314493,
        316807, 317237, 317480, 318682, 319769, 327480, 329038, 329365,
        329749, 333826, 334211, 339769, 340019, 346634, 347038, 347403,
        347730, 357634, 358403, 362711, 363115, 367749, 368134, 368583,
        36

In [76]:
wfdb.plot_wfdb(record=r, annotation=a, time_units="samples")

Exception: The length of the ylabel must be the same as the signal: 3 values

* remember: f = N/t, where f = sampling frequency, N = signal length, t = time
* time_per_second = signal_len / sampling_freq = 525000/250 = 2100

In [9]:
# record_num = 418
# for _, i in annotations[annotations.r_name == record_num].iterrows():
#     if _ > 50:
#         FS = header[header.r_name == record_num].sampling_freq.squeeze()
#         from_idx = i.annot_idx_from
#         to_idx = i.annot_idx_to

#         ann = wfdb.rdann(record_name=str(record_num), sampfrom=from_idx, sampto=to_idx+FS, extension="atr", pn_dir=db)
#         print(ann.sample)
#         rr_interval = wfdb.processing.calc_rr(ann.sample, fs=FS)
#         rr_interval = np.insert(rr_interval, 0, ann.sample[0]) if from_idx == 1 else rr_interval
#             # np.insert() is used to add the first annotation sample (i.e., ann.sample[0]), only when annot_idx = 1
#         rr_interval = rr_interval # / ann.fs # normalize
#         print("f={}, t={} {}".format(from_idx, to_idx, rr_interval))

In [10]:
def get_record_start_partitions(
    record : str,
    db : str,
    min_duration : int,
    freq_rate : int,
    resampling : int = 0,
    *args, **kwargs
) -> dict:
    
    def get_label(annotation):
        p = re.compile("([A-Za-z]+)")
        return p.search(annotation)[1]
    
    start_sample = (min_duration * 60) * freq_rate # freq = N / time(s), therefore N = freq x time
    annotation = wfdb.rdann(record, 'atr', pn_dir=db, sampfrom=start_sample)
    signals, _ = wfdb.rdsamp(record, pn_dir=db, sampfrom=start_sample)
    
    if resampling != 0:
        start_sample = (min_duration * 60) * resampling
        signals, annotation = wfdb.processing.resample_multichan(
            signals, annotation, freq_rate, resampling)
    
    annotations = [get_label(a) for a in annotation.aux_note]
    annotations = list(map(lambda x: x.replace('NSR', 'N').replace('VFIB', 'VF'), annotations))
    
    positive_labels = ["VT", "VF", "VFL"]
    negative_labels = ["N"] # normal rhythm
    
    results_pos = []
    results_neg = []
    
    for ann, annot_sample in zip(annotations, annotation.sample):
        if ann in positive_labels:
            results_pos.append(annot_sample-start_sample) # n sample before events
        elif ann in negative_labels:
            results_neg.append(annot_sample-start_sample)
    
    results_pos = [r for r in results_pos if r > 0] # remove non-negative values
    results_neg = [r for r in results_neg if r > 0] # remove non-negative values
    
    return {"positive" : results_pos, "negative" : results_neg}

### Preprocessing (R-R Interval)

In [11]:
class dotdict(dict):
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [12]:
class SignalPreprocessing():
    def __init__(self, record_idx : str, db : str, *args, **kwargs) -> None:
        super(SignalPreprocessing, self).__init__()
        self.record_idx = record_idx
        self.db = db
        
        # record
        self.r = wfdb.rdrecord(record_name=self.record_idx, pn_dir=self.db)
        
        # annotation
        self.a = wfdb.rdann(record_name=self.record_idx, pn_dir=self.db, extension='atr')
        
        self.signals = self.r.p_signal # can be multi-dimensional channels
        self.channels = self.r.sig_name
        self.freq_rate = self.r.fs
        
    def extract_rpeaks(self, signal, *args, **kwargs) -> ndarray:
        # segment
        rpeaks, = ecg.engzee_segmenter(
            signal=signal,
            sampling_rate=self.freq_rate
        )

        # correct R-peak locations
        rpeaks, = ecg.correct_rpeaks(
            signal=signal, rpeaks=rpeaks,
            sampling_rate=self.freq_rate, tol=.05
        )

        # extract templates
        _, rpeaks = ecg.extract_heartbeats(
            signal=signal, rpeaks=rpeaks,
            sampling_rate=self.freq_rate, before=.2, after=.4
        )
        
        return rpeaks
        
    def filter_signal(self, signal, low_freq, high_freq, *args, **kwargs) -> ndarray:
        return heartpy.filter_signal(
            signal, filtertype='bandpass',
            cutoff=[low_freq, high_freq], sample_rate=self.freq_rate
        )
    
    def get_peaklist(self,
        start_partitions : list,
        duration : int,
        label : int,
        resampling : int = 0,
        *args, **kwargs
    ) -> dict:
        '''
        Example return:
        return_example = {
            "peaks" : {
                1000 : [
                    {"channel" : "ECG1", "value" : [20, 30, 40]},
                    {"channel" : "ECG2", "value" : [50, 60, 70]},
                ],
                2500 : [
                    {"channel" : "ECG1", "value" : [55, 66, 77]},
                    {"channel" : "ECG2", "value" : [75, 85, 96]},
                ]
            }, 
            "created_at" : "2024-02-15", 
            "exc_time" : 50.184
        }
        '''
        
        # to calculate exc. time (in seconds)
        start_time = t()
        
        # reassign variables since they can be overrided if resampling != 0
        fr = self.freq_rate
        signals = self.signals
        
        # resampling signal (if any)
        if resampling != 0:
            resampled_signals, _ = wfdb.processing.resample_multichan(
                self.signals, self.a, fr, resampling
            )
            fr = resampling
            signals = resampled_signals
            
            # if you did resampling, partitions index should be adjusted
            # based on the newest max. signal length.
            start_partitions = [i for i in start_partitions if i <= len(signals)]
        
        FINAL_RESULTS, PARTITION = {}, {}
        for start_partition in start_partitions:
            delta = (duration * 60) * fr # remember, freq = N / time(s), therefore N = freq x time
            curr_signals = signals[start_partition:start_partition+delta, :]
            
            PEAKS_CHANNEL = []
            for i, channel in enumerate(self.channels):
                signal = curr_signals[:, i]
                signal = self.filter_signal(signal, 7, 30) # filtered
                peaks = self.extract_rpeaks(signal)
                peaks = [int(p) for p in peaks] # convert from np.int32 to INT
                PEAKS_CHANNEL.append(dotdict({"channel" : channel, "value" : peaks}))
            PARTITION[int(start_partition)] = PEAKS_CHANNEL
        
        FINAL_RESULTS["peaks"] = PARTITION
        FINAL_RESULTS["label"] = label
        FINAL_RESULTS["exc_time"] = round(t()-start_time, 3)
        FINAL_RESULTS["created_at"] = dt.now().strftime("%Y-%m-%d %X")
        
        return dotdict(FINAL_RESULTS)

In [83]:
def create_dataset(records, header):
    for r in records:
        FS = header[header.r_name == int(r)].sampling_freq.squeeze()
        st = get_record_start_partitions(record=r, db=db, min_duration=5, freq_rate=FS)
        proc = SignalPreprocessing(record_idx=r, db=db)

        # Positive label
        partition_pos = st["positive"]
        result_pos = proc.get_peaklist(start_partitions=partition_pos, duration=5, label=1)
        with open("data/positive/{}.json".format(r), "w") as outfile: 
            json.dump(result_pos, outfile)
        print("Record {} (positive) was completed.".format(r))

        # Negative label
        partition_neg = st["negative"]
        result_neg = proc.get_peaklist(start_partitions=partition_neg, duration=5, label=0)
        with open("data/negative/{}.json".format(r), "w") as outfile: 
            json.dump(result_neg, outfile)  
        print("Record {} (negative) was completed.".format(r))

### Read files

In [14]:
from heartpy import analysis
import hrvanalysis as hrva
import os

In [15]:
dataset_pos_filepath = ["data/positive/{}".format(p) for p in os.listdir("data/positive") if p.endswith(".json")]
dataset_neg_filepath = ["data/negative/{}".format(p) for p in os.listdir("data/negative") if p.endswith(".json")]

In [16]:
def rr_preproc(rr_interval : list) -> list:
    nn_interval = hrva.remove_outliers(rr_intervals=rr_interval, verbose=False)

    # @param method: "malik", "kamath", "karlsson", "acar"
    nn_interval = hrva.remove_ectopic_beats(rr_intervals=nn_interval, method="malik", verbose=False)

    # @param interpolation_method: 'linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear',
    # 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'from_derivatives',
    # 'piecewise_polynomial', 'pchip', 'akima', 'cubicspline'
    nn_interval = hrva.interpolate_nan_values(rr_intervals=nn_interval, interpolation_method="cubic")

    # remove NaN values which weren't filtered during interpolation; e.g., in the last index
    nn_interval = [i for i in nn_interval if str(i) != "nan"]
    
    return nn_interval

In [59]:
DF_RECORDS = []
for filepath in (dataset_pos_filepath+dataset_neg_filepath): # LOOP per file
    with open(filepath) as file:
        data = json.load(fp=file)

    for idx in data["peaks"].keys(): # LOOP per start_partition
        for ch_num, ch in enumerate(data["peaks"][idx]): # LOOP per channel
            peaklist = ch["value"]
            rr = analysis.calc_rr(peaklist=peaklist, sample_rate=250)
            nn_interval = rr_preproc(rr_interval=rr["RR_list"])

            FEATURES = {
                "record_id" : int(filepath.split("/")[-1].split(".json")[0]),
                "start_partition_idx" : idx,
                "channel" : ch["channel"] + "_{}".format(str(ch_num))
            }

            # Reference:
            # 1. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5624990/
            # 2. https://aura-healthcare.github.io/hrv-analysis/hrvanalysis.html
            
            # TIME DOMAIN
            ftr_time_domain = hrva.get_time_domain_features(nn_interval)
            FEATURES.update(ftr_time_domain)

            ftr_geometric_time_domain = hrva.get_geometrical_features(nn_interval)
            FEATURES.update(ftr_geometric_time_domain)

            # Frequency Domain
            ftr_freq_domain = hrva.get_frequency_domain_features(nn_interval)
            FEATURES.update(ftr_freq_domain)

            # Non-linear Domain
            ftr_entropy = hrva.get_sampen(nn_interval) # sample entropy
            FEATURES.update({"entropy" : ftr_entropy["sampen"]})

            ftr_poincare = hrva.get_poincare_plot_features(nn_interval)
            FEATURES.update(ftr_poincare)

            # CVI (Cardiac Sympathetic Index), CSI (Cardiac Vagal Index)
            ftr_csi_cvi = hrva.get_csi_cvi_features(nn_interval)
            FEATURES.update(ftr_csi_cvi)
            
            FEATURES.update({"label" : int(data["label"])}) # set label
            DF_RECORDS.append(FEATURES)

In [13]:
from pandas import DataFrame as df, read_csv as rcsv

In [14]:
# vfdb_dataset = df(DF_RECORDS)
# vfdb_dataset.to_csv("data/vfdb_features.csv", index=False)

In [15]:
vfdb_dataset = rcsv("data/vfdb_features.csv")

In [16]:
vfdb_dataset.columns

Index(['record_id', 'start_partition_idx', 'channel', 'mean_nni', 'sdnn',
       'sdsd', 'nni_50', 'pnni_50', 'nni_20', 'pnni_20', 'rmssd', 'median_nni',
       'range_nni', 'cvsd', 'cvnni', 'mean_hr', 'max_hr', 'min_hr', 'std_hr',
       'triangular_index', 'tinn', 'lf', 'hf', 'lf_hf_ratio', 'lfnu', 'hfnu',
       'total_power', 'vlf', 'entropy', 'sd1', 'sd2', 'ratio_sd2_sd1', 'csi',
       'cvi', 'Modified_csi', 'label'],
      dtype='object')

In [17]:
from sklearn.feature_selection import RFE

In [50]:
X = vfdb_dataset[['mean_nni', 'rmssd']]

In [27]:
# X = vfdb_dataset.drop([
#     "record_id", "start_partition_idx", "channel", "tinn", "sdsd", "nni_50",
#     "pnni_50", "nni_20", "pnni_20", "median_nni", "range_nni", "cvsd", "cvnni",
#     "max_hr", "min_hr", "lf", "hf", "lfnu", "hfnu", "sd1", "sd2", "Modified_csi", "label"
# ], axis=1)

In [51]:
y = vfdb_dataset.label

In [41]:
# X.groupby("label").sample(n=10)

In [52]:
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import MinMaxScaler as mms
from sklearn.metrics import accuracy_score, auc, roc_curve, confusion_matrix as cfmat

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [53]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=.3, random_state=42)

In [54]:
scaler = mms()

In [55]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [65]:
model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy_score(y_pred, y_test)

0.6292682926829268

In [64]:
model = MLPClassifier(
    hidden_layer_sizes=(512, 256, 64, 32),
    batch_size=128,
    learning_rate_init=1e-5, learning_rate="adaptive",
    early_stopping=True, random_state=42, max_iter=1000
)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy_score(y_pred, y_test)

0.5804878048780487

In [66]:
tn, fp, fn, tp = cfmat(y_pred, y_test).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = (2 * precision * recall) / (precision + recall)

print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1-score: {:.3f}".format(f1_score))

Precision: 0.664
Recall: 0.687
F1-score: 0.675


In [67]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc(fpr, tpr)

0.6226304475278482

In [55]:
# feature selection using RFE

# import numpy as np

# model = RandomForestClassifier()
# a = RFE(estimator=model, n_features_to_select=5)
# selector = a.fit(X_train_scaled, y_train)
# selector.support_

# np.array(X.columns)[np.array(selector.support_)]

### Poincare and PSD plots

In [86]:
# from hrvanalysis import plot_psd, plot_poincare

# plot_poincare(nn_interval, plot_sd_features=True)
# plot_psd(nn_interval)