In [11]:
import csv
import os

import biosppy.signals.ecg as ecg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [13]:
# Load the training set from disk, using the `id` column as the row index.
dataRaw = pd.read_csv(
    filepath_or_buffer='data/train.csv',
    index_col='id',
)

In [94]:
# Show the data
# `data` is bound to the same DataFrame object as `dataRaw` (no copy is made),
# so an in-place change made through one name is visible through the other.
data = dataRaw
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0_level_0,y,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x17797,x17798,x17799,x17800,x17801,x17802,x17803,x17804,x17805,x17806
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,-13,-9,-6,-4,0,2,6,12,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,-34,110,249,390,527,639,721,777,823,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,-34,-36,-37,-39,-41,-42,-44,-46,-48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,292,298,303,310,320,336,354,377,405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,157,179,195,210,217,222,226,228,231,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,3,-247,-271,-285,-303,-334,-376,-413,-432,-443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5113,0,62,62,61,61,61,61,61,61,61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5114,0,-95,-110,-124,-131,-126,-114,-95,-67,-42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5115,0,-50,-48,-45,-42,-38,-35,-32,-30,-28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Helper functions

In [91]:
def extract_r_peaks(signal, sampling_rate=300):
    """Extract the R-peaks of a single ECG signal.

    NaN values are discarded before detection, so the returned positions refer
    to the NaN-free signal.

    Parameters
    ----------
    signal : pandas.Series (or any object offering `.dropna().to_numpy(...)`)
        One recording. It is converted to a float32 NumPy array.
    sampling_rate : int, optional
        Forwarded to `ecg.engzee_segmenter` (default 300; presumably Hz).

    Returns
    -------
    list
        The 'rpeaks' entry of the segmenter result converted with `.tolist()`,
        or an empty list when a ValueError was raised (the error is printed).
    """
    samples = signal.dropna().to_numpy(dtype='float32')

    peak_positions = []
    try:
        segmentation = ecg.engzee_segmenter(samples, sampling_rate=sampling_rate)
        peak_positions = segmentation['rpeaks'].tolist()
    except ValueError as err:
        # Best effort: report the problem and fall through with the empty list.
        print(f"Error processing row: {err}")
    return peak_positions

In [92]:
def extract_heartbeats(signal, r_peaks, sampling_rate=300):
    """Extract the heartbeats of a single signal from its R-peaks.

    Parameters
    ----------
    signal
        The recording, passed through to `ecg.extract_heartbeats` as given.
    r_peaks
        R-peak positions for `signal`, passed through as given.
    sampling_rate : int, optional
        Forwarded to `ecg.extract_heartbeats` (default 300; presumably Hz).

    Returns
    -------
    The value returned by `ecg.extract_heartbeats`, unchanged. When that call
    raises ValueError the message is printed and a plain empty list is
    returned instead, so callers that index the result by key (for example
    `['templates']`) need to check for the empty list first.
    """
    try:
        return ecg.extract_heartbeats(signal, r_peaks, sampling_rate=sampling_rate)
    except ValueError as err:
        print(f"Error in heartbeat extraction: {err}")
    # Only reached after a ValueError was reported above.
    return []

In [93]:
# Full heartbeat extraction pipeline
# The label column is left out and NaN values are dropped once per row, so both
# helpers work on the same cleaned signal.
def process_ecg_dataset(data, sampling_rate=300, label_col='y'):
    """Extract R-peaks and heartbeat templates for every row of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        One recording per row. Rows are visited by position, so the index does
        not have to be 0..n-1 or unique.
    sampling_rate : int, optional
        Forwarded unchanged to `extract_r_peaks` and `extract_heartbeats`.
    label_col : str or None, optional
        Name of a non-signal column (the class label) that is left out of each
        row's signal when it is present. Pass None to keep every column.

    Returns
    -------
    tuple of (list, list)
        `r_peaks_list` and `heartbeats_list`, each holding one entry per row
        in row order. A heartbeat entry is the 'templates' item of the
        `extract_heartbeats` result, or an empty list when that helper
        reported a failure.
    """
    r_peaks_list = []
    heartbeats_list = []

    drop_label = label_col is not None and label_col in data.columns

    for pos in range(len(data)):
        # Positional access: `.loc[pos]` only works when the labels are 0..n-1.
        row = data.iloc[pos]
        if drop_label:
            # The label is not a sample; keeping it would put it in front of
            # the signal and shift every sample position by one.
            row = row.drop(label_col)
        # Clean once so R-peak positions and heartbeat windows address the
        # same samples.
        signal = row.dropna()

        r_peaks = extract_r_peaks(signal, sampling_rate)
        r_peaks_list.append(r_peaks)

        extracted = extract_heartbeats(signal, r_peaks, sampling_rate)
        try:
            heartbeats = extracted['templates']
        except (TypeError, KeyError, IndexError):
            # extract_heartbeats returns a plain [] on failure, which has no
            # 'templates' entry; record an empty result for this row instead.
            heartbeats = []
        heartbeats_list.append(heartbeats)

    return r_peaks_list, heartbeats_list

## Data processing

In [95]:
# Run the extraction pipeline on `data`; yields one R-peak list and one
# heartbeat entry per row. This is slow, which is why the next cell saves the result.
r_peaks_list, heartbeats_list = process_ecg_dataset(data)

In [96]:
# Save the processed data, since the extraction step takes a long time
result_data = pd.DataFrame({
    'y': data['y'],
    'heartbeat': heartbeats_list
})
# Each `heartbeat` cell holds the templates array produced by BioSPPy, and
# to_csv stores only its str() form. NumPy abbreviates arrays above its print
# threshold (1000 elements by default) with "...", so this CSV cannot be used
# to restore the heartbeats. It is still written so that anything already
# reading this path keeps working.
result_data.to_csv('data/processed_train_dataset.csv', index=False)
# Lossless copy of the same frame; reload it with pd.read_pickle. Only
# unpickle files that you produced yourself.
result_data.to_pickle('data/processed_train_dataset.pkl')