In [1]:
import wfdb
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt
from scipy.stats import skew
import os

def read_mit_bih_record(record_name, data_dir='mit-bih-arrhythmia-database-1.0.0'):
    """
    Reads a single record's signal and annotation data from the database.
    record_name (str): The record identifier (e.g. '100'); joined onto data_dir
        to build the path handed to wfdb.
    data_dir (str): The directory that contains the record files.

    Returns a tuple containing:
    wfdb.Record: The record object with signal data and metadata.
    wfdb.Annotation: The annotation object with heartbeat labels.

    Both elements are None when the record could not be read; the failure is
    reported on stdout instead of being raised, so a caller looping over many
    records can simply skip the bad ones.
    """
    try:
        record_path = os.path.join(data_dir, record_name)
        print(f"Processing record: {record_name}...")
        record = wfdb.rdrecord(record_path)
        # 'atr' is the annotation file extension requested from wfdb.
        annotation = wfdb.rdann(record_path, 'atr')
        return record, annotation
    except FileNotFoundError:
        print(f"Error: Record files not found for '{record_name}'. Skipping.")
        return None, None
    except Exception as e:
        # Deliberate best-effort: any other read/parse failure skips the record.
        print(f"An unexpected error occurred for record '{record_name}': {e}. Skipping.")
        return None, None

def filter_ecg_signal(signal, fs, lowcut=0.5, highcut=45.0, order=4):
    """
    Band-pass filters an ECG signal with a Butterworth design.
    signal (np.array): The samples to filter.
    fs (int): The sampling frequency.
    lowcut (float): Lower cutoff frequency, in the same units as fs.
    highcut (float): Upper cutoff frequency, in the same units as fs.
    order (int): The order passed to scipy.signal.butter.

    Returns the filtered signal as an np.array.
    """
    # butter() takes cutoffs as fractions of the Nyquist frequency (fs / 2).
    half_rate = fs / 2
    passband = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(order, passband, btype='bandpass')
    # filtfilt applies the filter forward and then backward over the signal.
    return filtfilt(numerator, denominator, signal)

def segment_heartbeats(signal, annotations, fs, before_samples=100, after_samples=180):
    """
    This function segments the continuous ECG signal into individual heartbeats.
    signal (np.array): The filtered ECG signal.
    annotations (wfdb.Annotation): The annotation object; only its `sample`
        and `symbol` sequences are read.
    fs (int): The sampling frequency. Currently unused; kept so existing
        callers keep working.
    before_samples (int): Number of samples taken before each annotated location.
    after_samples (int): Number of samples taken from the annotated location onward.

    It returns a tuple containing:
    A list of numpy arrays, where each array is a heartbeat segment of
        before_samples + after_samples samples, with the annotated sample at
        index before_samples.
    A list of the corresponding annotation symbols (labels) for each beat.
    A list of the sample indices of the R-peaks for the segmented beats.

    Annotations whose symbol is not a beat symbol, or whose window would run
    past either end of the signal, are skipped.
    """
    # Symbols treated as beats; every other annotation symbol is ignored.
    beat_annotation_symbols = frozenset([
        'N', 'L', 'R', 'B', 'A', 'a', 'J', 'S', 'V', 'r',
        'F', 'e', 'j', 'n', 'E', '/', 'f', 'Q', '?',
    ])
    signal_length = len(signal)

    segmented_beats = []
    beat_labels = []
    valid_r_peak_locations = []

    for r_peak_loc, symbol in zip(annotations.sample, annotations.symbol):
        if symbol not in beat_annotation_symbols:
            continue

        start = r_peak_loc - before_samples
        end = r_peak_loc + after_samples

        # The slice end is exclusive, so a window with end == len(signal) is
        # still complete and must be kept.
        if start < 0 or end > signal_length:
            continue

        segmented_beats.append(signal[start:end])
        beat_labels.append(symbol)
        valid_r_peak_locations.append(r_peak_loc)

    return segmented_beats, beat_labels, valid_r_peak_locations

def extract_features(segmented_beats, valid_r_peaks, fs, r_peak_index=100):
    """
    Extracts features from each segmented heartbeat.
    segmented_beats (list of np.array): One array of samples per beat.
    valid_r_peaks (list of int): Sample index of the R-peak of each beat, in
        the same order as segmented_beats.
    fs (int): The sampling frequency, used to convert sample differences into
        time intervals.
    r_peak_index (int): Position of the R-peak inside each beat array. It must
        be at least 1 and smaller than the beat length, and it should equal the
        number of samples kept before the R-peak when the beats were segmented.

    It returns a list with one dict per beat, with the keys:
    'rr_prev': Interval to the previous beat (NaN for the first beat).
    'rr_next': Interval to the next beat (NaN for the last beat).
    'r_peak_amp': The sample value at r_peak_index.
    'q_peak_amp': The minimum of the samples before r_peak_index.
    's_peak_amp': The minimum of the samples from r_peak_index onward.
    'mean', 'std', 'skew': Statistics computed over the whole beat.
    """
    # Differences between consecutive R-peak positions, divided by fs.
    rr_intervals = np.diff(valid_r_peaks) / fs
    interval_count = len(rr_intervals)

    all_features = []
    for i, beat in enumerate(segmented_beats):
        pre_r = beat[:r_peak_index]
        post_r = beat[r_peak_index:]

        all_features.append({
            # The first beat has no predecessor and the last has no successor.
            'rr_prev': rr_intervals[i - 1] if i > 0 else np.nan,
            'rr_next': rr_intervals[i] if i < interval_count else np.nan,
            'r_peak_amp': beat[r_peak_index],
            'q_peak_amp': np.min(pre_r),
            's_peak_amp': np.min(post_r),
            'mean': np.mean(beat),
            'std': np.std(beat),
            'skew': skew(beat),
        })

    return all_features


# Main execution
if __name__ == "__main__":
    # Directory that holds the database files.
    database_directory = 'mit-bih-arrhythmia-database-1.0.0'

    # Identifiers of the 48 records to process.
    mit_bih_records = [
        '100', '101', '102', '103', '104', '105', '106', '107',
        '108', '109', '111', '112', '113', '114', '115', '116',
        '117', '118', '119', '121', '122', '123', '124', '200',
        '201', '202', '203', '205', '207', '208', '209', '210',
        '212', '213', '214', '215', '217', '219', '220', '221',
        '222', '223', '228', '230', '231', '232', '233', '234',
    ]

    # One feature DataFrame per successfully processed record.
    all_records_features = []

    for record_name in mit_bih_records:
        record_data, annotation_data = read_mit_bih_record(record_name, database_directory)

        # The reader hands back (None, None) when a record could not be loaded.
        if not record_data or not annotation_data:
            continue

        sampling_freq = record_data.fs
        # Only the first signal channel of the record is analysed.
        raw_ecg_signal = record_data.p_signal[:, 0]

        filtered_ecg_signal = filter_ecg_signal(raw_ecg_signal, sampling_freq)
        segmented_beats, beat_labels, valid_r_peaks = segment_heartbeats(
            filtered_ecg_signal, annotation_data, sampling_freq
        )

        if not segmented_beats:
            print(f"No valid beats found for record {record_name}. Skipping.")
            continue

        features = extract_features(segmented_beats, valid_r_peaks, sampling_freq)

        # Tag every row with its beat label and the record it came from.
        features_df = pd.DataFrame(features).assign(label=beat_labels, record=record_name)
        all_records_features.append(features_df)

    if not all_records_features:
        print("\n\nProcessing complete. No data was generated.")
    else:
        # Stack the per-record tables into one and write it out as a single CSV.
        final_dataset = pd.concat(all_records_features, ignore_index=True)

        output_filename = 'all_records_features.csv'
        final_dataset.to_csv(output_filename, index=False)

        print("\n\nProcessing complete.")
        print(f"Successfully saved all features to '{output_filename}'")
        print(f"Total heartbeats processed: {len(final_dataset)}")


Processing record: 100...
Processing record: 101...
Processing record: 102...
Processing record: 103...
Processing record: 104...
Processing record: 105...
Processing record: 106...
Processing record: 107...
Processing record: 108...
Processing record: 109...
Processing record: 111...
Processing record: 112...
Processing record: 113...
Processing record: 114...
Processing record: 115...
Processing record: 116...
Processing record: 117...
Processing record: 118...
Processing record: 119...
Processing record: 121...
Processing record: 122...
Processing record: 123...
Processing record: 124...
Processing record: 200...
Processing record: 201...
Processing record: 202...
Processing record: 203...
Processing record: 205...
Processing record: 207...
Processing record: 208...
Processing record: 209...
Processing record: 210...
Processing record: 212...
Processing record: 213...
Processing record: 214...
Processing record: 215...
Processing record: 217...
Processing record: 219...
Processing r