In [None]:
# =============================================================================
# ECG Signal Processing and Feature Extraction for Apnea Detection
# =============================================================================
# - This script processes ECG signals from the Apnea ECG dataset.
# - It splits the signal into 120-second intervals (similar procedures can be
#   done for 60s and 90s intervals).
# - It cleans the data by removing segments with poor signal quality,
#   extracts HRV features using a slightly modified version of the NeuroKit2
#   package, and saves the processed data.
# - Original ECG data can be downloaded from:
#   https://physionet.org/content/apnea-ecg/1.0.0/
#
# Usage:
# - Ensure the required packages are installed (see requirements.txt).
# - Update the file paths (e.g., "C:\Users\piotr\Desktop\PSG data\Apnea ECG\")
#   to match your system.
# =============================================================================

In [None]:
# Uncomment the line below if you need to install dependencies.
# %pip install -r requirements.txt 
# ----------------------------
# Import Required Libraries
# ----------------------------
# We import all necessary libraries
import wfdb
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import neurokit2 as nk
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")


In [65]:
def data_creation(record_path):
    """
    Process an ECG record for apnea detection.

    The record is cut into 120-second windows centred on each apnea annotation
    (60 s before and 60 s after the annotation time, truncated to whole
    seconds). Each window is cleaned and quality-checked; windows that pass are
    turned into one row of HRV features, the others are dropped together with
    their label.

    Parameters:
        record_path (str): Path to the ECG record (without file extension).

    Returns:
        features (np.array): Extracted HRV features, one row per retained segment.
            An empty (0, 0) array is returned when no segment could be used.
        labels (np.array): Corresponding labels (0 for no apnea, 1 for apnea).
    """
    half_window_seconds = 60   # seconds taken on each side of the annotation
    min_quality = 0.5          # mean nk.ecg_quality score below which a window is rejected

    # Read annotations (apnea markers) from the record.
    annotation = wfdb.rdann(record_path, extension='apn')

    # Read the signal once; the metadata returned alongside it already holds the
    # sampling frequency, so the record does not have to be loaded a second time.
    signal, fields = wfdb.rdsamp(record_path)
    sampling_rate = fields['fs']

    # rdsamp returns a (samples, channels) array. Use the first channel only:
    # flattening a multi-channel record would interleave the channels.
    # NOTE(review): assumes the ECG is channel 0 when several channels exist - confirm.
    signal = np.asarray(signal)
    if signal.ndim > 1:
        signal = signal[:, 0]

    # Get annotation times in seconds
    annotation_times = np.array(annotation.sample) / sampling_rate

    # Map annotation symbols: "N" indicates no apnea (0), others indicate apnea (1)
    annotation_labels = np.where(np.array(annotation.symbol) == "N", 0, 1).astype(np.int32)

    # Discard the first and last annotation (their windows would be incomplete).
    annotation_times = annotation_times[1:-1]
    annotation_labels = annotation_labels[1:-1]

    # Slice bounds must be integers even if the sampling frequency is a float.
    half_window = int(round(half_window_seconds * sampling_rate))

    # Collect one feature frame per good segment and concatenate once at the end
    # (growing a DataFrame inside the loop is quadratic).
    feature_frames = []
    # Keep track of indices of poor-quality segments.
    bad_times = []

    # Process each annotation segment.
    for i, annotation_time in enumerate(annotation_times):
        try:
            # Define the window: 60 seconds before and after the annotation time.
            center = int(round(int(annotation_time) * sampling_rate))
            start, stop = center - half_window, center + half_window

            # A negative start would silently wrap around (negative indexing) and a
            # stop past the end would silently shorten the window, so reject both.
            if start < 0 or stop > len(signal):
                bad_times.append(i)
                print(f"Warning: window at index {i} extends beyond the signal. Marking as bad data")
                continue

            clean = nk.ecg_clean(signal[start:stop], sampling_rate=sampling_rate)

            # Evaluate ECG quality (mean quality score).
            quality = np.mean(nk.ecg_quality(clean, sampling_rate=sampling_rate))

            if quality < min_quality:
                bad_times.append(i)  # Mark as bad quality
                print(f"Warning: ECG quality check failed at index {i}. Marking as bad data")
            else:
                # Extract features from cleaned ECG
                peaks = nk.ecg_peaks(clean, sampling_rate=sampling_rate)
                feature_frames.append(nk.hrv(peaks[0], sampling_rate=sampling_rate))
        except Exception as e:
            # If any error occurs, consider this segment as bad
            bad_times.append(i)
            print(f"Warning: ECG quality check failed at index {i}. Marking as bad data. Error: {e}")

    # Convert features to a NumPy array (pd.concat refuses an empty list, so fall
    # back to an empty frame when every segment was rejected).
    if feature_frames:
        features = pd.concat(feature_frames).to_numpy()
    else:
        features = pd.DataFrame().to_numpy()

    # Remove labels corresponding to bad segments.
    labels = np.delete(annotation_labels, bad_times)
    return features, labels

def clean_features(features, labels):
    """
    Remove columns containing only NaNs. Then, remove rows that contain a NaN.

    Infinite values are treated as missing (converted to NaN) before filtering.
    The input array is left untouched: all work is done on a copy, so the
    function can be called repeatedly on the same data.

    Parameters:
        features (np.array): 2-D array with one row per segment and one column per feature.
        labels (np.array): One label per row of `features`.

    Returns:
        features_cleaned (np.array): Features without all-NaN columns and without
            rows that still contain a NaN.
        labels_cleaned (np.array): Labels of the rows that were kept.
    """
    # np.array copies by default, so the caller's array is never modified.
    data = np.array(features)
    # NaN can only be stored in floating-point arrays.
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(float)

    data[np.isinf(data)] = np.nan

    # Drop features that are missing for every segment.
    all_nan_cols = np.all(np.isnan(data), axis=0)
    kept_columns = data[:, ~all_nan_cols]

    # Drop segments that still have at least one missing feature, and their labels.
    nan_rows = np.isnan(kept_columns).any(axis=1)
    features_cleaned = kept_columns[~nan_rows]
    labels_cleaned = np.delete(labels, np.where(nan_rows)[0])
    return features_cleaned, labels_cleaned

In [None]:
# For each patient we extract the whole signal, split it into 120s intervals and extract features. This is for the training data.
list_of_features_train=[]
list_of_labels_train=[]
# Read the record names inside a context manager so the file handle is closed,
# and skip blank lines (e.g. a trailing newline) that would give an invalid record path.
with open(r"Extracted Features and supplemental files\list_train") as list_file:
    list_of_file_names=[name.strip() for name in list_file.read().split("\n") if name.strip()]
for record_name in list_of_file_names:
    # You need to change the record_path to match where your Apnea ECG data set is.
    record_path=fr"C:\Users\piotr\Desktop\PSG data\Apnea ECG\{record_name}"
    features,labels = data_creation(record_path)
    # A record without any usable segment yields an empty (0, 0) array whose shape
    # cannot be concatenated with the other records, so leave it out.
    if features.size == 0:
        print(f"Warning: no usable segments in record {record_name}. Skipping.")
        continue
    list_of_features_train.append(features)
    list_of_labels_train.append(labels)
# Stack all records: one row per retained segment, labels in the same order.
extracted_features_train=np.concatenate(list_of_features_train)
extracted_labels_train=np.concatenate(list_of_labels_train)

In [None]:
# Drop all-NaN feature columns from the training set, then drop the segments
# (and their labels) that still contain a missing value.
features_cleaned_train, labels_cleaned_train = clean_features(
    features=extracted_features_train,
    labels=extracted_labels_train,
)

In [None]:
# Write the cleaned training features and labels to disk (np.save appends ".npy").
np.save(file="features_cleaned_120s_train", arr=features_cleaned_train)
np.save(file="labels_cleaned_120s_train", arr=labels_cleaned_train)

In [None]:
# We repeat the whole procedure for the testing data.
list_of_features_test=[]
list_of_labels_test=[]
# Read the record names inside a context manager so the file handle is closed,
# and skip blank lines (e.g. a trailing newline) that would give an invalid record path.
with open(r"Extracted Features and supplemental files\list_test") as list_file:
    list_of_file_names=[name.strip() for name in list_file.read().split("\n") if name.strip()]
for record_name in list_of_file_names:
    # You will need to change the record path to where the Apnea ECG data set is located.
    record_path=fr"C:\Users\piotr\Desktop\PSG data\Apnea ECG\{record_name}"
    features,labels = data_creation(record_path)
    # A record without any usable segment yields an empty (0, 0) array whose shape
    # cannot be concatenated with the other records, so leave it out.
    if features.size == 0:
        print(f"Warning: no usable segments in record {record_name}. Skipping.")
        continue
    list_of_features_test.append(features)
    list_of_labels_test.append(labels)
# Stack all records: one row per retained segment, labels in the same order.
extracted_features_test=np.concatenate(list_of_features_test)
extracted_labels_test=np.concatenate(list_of_labels_test)

In [None]:
# Drop all-NaN feature columns from the test set, then drop the segments
# (and their labels) that still contain a missing value.
features_cleaned_test, labels_cleaned_test = clean_features(
    features=extracted_features_test,
    labels=extracted_labels_test,
)

In [None]:
# Write the cleaned test features and labels to disk (np.save appends ".npy").
np.save(file="features_cleaned_120s_test", arr=features_cleaned_test)
np.save(file="labels_cleaned_120s_test", arr=labels_cleaned_test)

In [None]:
# For demonstration: simulate an ECG signal to obtain HRV feature names.
# A fixed random_state makes the simulated signal, and therefore the set of
# features that come out non-missing, reproducible from one run to the next.
signal = nk.ecg_simulate(duration=120, sampling_rate=100, random_state=42)
peaks = nk.ecg_peaks(signal, sampling_rate=100)
features = nk.hrv(peaks[0], sampling_rate=100)
# Treat infinite values as missing, as clean_features does for the real data.
features = features.replace([np.inf, -np.inf], np.nan)
# Only keep feature names that do not have any missing values.
# NOTE(review): this assumes the features missing for the simulated signal are the
# same ones that are all-NaN in the real data (the columns clean_features removes);
# verify that len(features_names) matches the number of saved feature columns.
features_names = features.columns[~features.isna().any()].tolist()

In [62]:
# Serialise the list of HRV feature names with pickle so it can be reloaded later.
with open("Feature Names", mode="wb") as names_file:
    names_file.write(pickle.dumps(features_names))