In [10]:
from __future__ import division, print_function
import os
from tqdm import tqdm
import numpy as np
import random
from config import get_config
from wfdb import rdrecord, rdann
from scipy.signal import find_peaks
import h5py
from scipy.signal import find_peaks
from sklearn.preprocessing import scale
from collections import defaultdict


In [11]:
def ensure_directory_exists(file_path):
    """Create the parent directory of ``file_path`` if it does not exist.

    Only the directory component of ``file_path`` is created (including any
    missing intermediate directories); the file itself is never touched.

    Args:
        file_path: Path of a file that is about to be written.

    Returns:
        None.
    """
    directory = os.path.dirname(file_path)

    # A bare file name such as "data.hdf5" has an empty directory component,
    # meaning "current working directory". os.makedirs('') would raise
    # FileNotFoundError, so there is nothing to create in that case.
    if not directory:
        return

    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory in between.
    os.makedirs(directory, exist_ok=True)

def preprocess(split):
    """Build the HDF5 beat datasets by delegating to ``dataSaver``.

    Args:
        split: When truthy, the recordings are divided into a fixed test set
            and a training set (everything else) and each is written to its
            own pair of HDF5 files. When falsy, a single recording is written
            to ``dataset/targetdata.hdf5`` / ``dataset/labeldata.hdf5``.
    """
    # Names of the ECG recordings to process (read from the ``dataset/``
    # directory by dataSaver).
    nums = ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109',
            '111', '112', '113', '114', '115', '116', '117', '118', '119',
            '121', '122', '123', '124', '200', '201', '202', '203', '205',
            '207', '208', '209', '210', '212', '213', '214', '215', '217', '219',
            '220', '221', '222', '223', '228', '230', '231', '232', '233', '234']

    # ECG lead names; one HDF5 dataset is created per name.
    features = ['MLII', 'V1', 'V2', 'V3', 'V4', 'V5']

    if split:
        # Recordings held out for testing; all remaining ones are used for
        # training.
        testset = ['101', '105', '114', '118', '124', '201', '210', '217']
        trainset = [x for x in nums if x not in testset]

        dataSaver(trainset, 'dataset/train.hdf5', 'dataset/trainlabel.hdf5', features)
        dataSaver(testset, 'dataset/test.hdf5', 'dataset/testlabel.hdf5', features)
    else:
        # Single-recording mode. 'sample_num' is a placeholder for the name
        # of the recording to process.
        num = 'sample_num'
        # dataSaver iterates over its first argument, so the name must be
        # wrapped in a list; a bare string would be walked character by
        # character and each character treated as a record name.
        dataSaver([num], 'dataset/targetdata.hdf5', 'dataset/labeldata.hdf5', features)

def dataSaver(dataSet, datasetname, labelsname, features):
    """Cut beat-centred windows out of ECG records and save them to HDF5.

    For every record, the signals are standardised per channel, peaks are
    detected on the first channel, and a 256-sample window is taken around
    each peak. A window is kept only when exactly one annotation falls inside
    it and that annotation is one of the supported beat classes. Kept windows
    are stored, per lead, together with a one-hot label.

    Args:
        dataSet: Record name, or iterable of record names, found under the
            ``dataset/`` directory.
        datasetname: Path of the HDF5 file that receives the signal windows.
        labelsname: Path of the HDF5 file that receives the one-hot labels.
        features: Lead names. One dataset per name is created in each output
            file; leads never seen in the records yield empty datasets.
    """
    # Beat annotation symbols that are classified; the position in this list
    # is the index of the "hot" entry in the label vector.
    classes = ['N', 'V', '/', 'A', 'F', '~']
    Nclass = len(classes)

    input_size = 256            # samples per window
    half_window = input_size // 2
    max_normal_beats = 20000    # cap on 'N' beats to limit class imbalance

    # A single record name would otherwise be iterated character by
    # character in the loop below.
    if isinstance(dataSet, str):
        dataSet = [dataSet]

    datadict = {feature: [] for feature in features}
    datalabel = {feature: [] for feature in features}
    n_class_counter = 0  # number of 'N' beats accepted so far (all records)

    for num in dataSet:
        record_path = 'dataset/' + num
        record = rdrecord(record_path, smooth_frames=True)
        signals = scale(np.nan_to_num(record.p_signal), axis=0)
        peaks, _ = find_peaks(signals[:, 0], distance=150)

        # Pair every channel with the lead it actually recorded. Indexing
        # ``features`` by channel position would file, e.g., a V5 channel
        # under 'V1' whenever a record's leads differ from the list order.
        channels = [
            (channel, lead)
            for channel, lead in enumerate(record.sig_name)
            if lead in datadict
        ]

        # The first and last detected peaks are skipped.
        for peak in peaks[1:-1]:
            start, end = peak - half_window, peak + half_window
            if start < 0 or end > len(signals):
                continue  # window would run past the recording

            # NOTE(review): the annotation file is re-read for every
            # candidate beat; reading it once per record would be faster.
            ann = rdann(record_path, extension='atr', sampfrom=start, sampto=end, return_label_elements=['symbol'])
            annSymbol = ann.symbol

            # Keep only windows with exactly one, supported, annotation.
            if len(annSymbol) != 1 or annSymbol[0] not in classes:
                continue
            symbol = annSymbol[0]

            if symbol == 'N':
                if n_class_counter >= max_normal_beats:
                    continue
                n_class_counter += 1

            # One-hot label for this beat.
            y = [0] * Nclass
            y[classes.index(symbol)] = 1

            for channel, lead in channels:
                datadict[lead].append(signals[start:end, channel])
                datalabel[lead].append(y)

    # Convert the accumulated lists to arrays for storage.
    for feature in features:
        datadict[feature] = np.array(datadict[feature], dtype=np.float32)
        datalabel[feature] = np.array(datalabel[feature], dtype=np.int32)

    # Make sure the output directories exist before opening the files.
    ensure_directory_exists(datasetname)
    ensure_directory_exists(labelsname)

    with h5py.File(datasetname, 'w') as hdf_file, h5py.File(labelsname, 'w') as label_file:
        for feature in features:
            hdf_file.create_dataset(feature, data=datadict[feature])
            label_file.create_dataset(feature, data=datalabel[feature])

    print(f"Data saved to {datasetname} and {labelsname}.")
def main(config):
    """Run the preprocessing step described by ``config``.

    Args:
        config: Settings object; only its ``split`` attribute is read here
            and forwarded to ``preprocess``.
    """
    split_flag = config.split
    preprocess(split_flag)

if __name__ == "__main__":
    # Script entry point: build the run configuration with get_config
    # (imported from the project's config module) ...
    config = get_config()
    # ... and hand it to main, which starts the preprocessing.
    main(config)

Data saved to dataset/train.hdf5 and dataset/trainlabel.hdf5.
Data saved to dataset/test.hdf5 and dataset/testlabel.hdf5.
