In [59]:
# General
! pip install biosppy
! pip install neurokit2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import biosppy
from biosppy.signals import ecg
import neurokit2 as nk
from pprint import pprint

# ML
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Custom
import sys,os
sys.path.append( '.' )
sys.path.append( '..' )
import Components.Outlier_Detection as Outlier_Detection
import Components.Feature_Selection as Feature_Selection
import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching


# CAREFUL:
# If you make changes to a custom module, you have to reload it, i.e. rerun this cell
import importlib
importlib.reload(Outlier_Detection)
importlib.reload(Feature_Selection)
importlib.reload(Normalisation)
importlib.reload(data_fetching)

Collecting neurokit2
  Downloading neurokit2-0.0.41-py2.py3-none-any.whl (983 kB)
[K     |████████████████████████████████| 983 kB 10.1 MB/s eta 0:00:01
Installing collected packages: neurokit2
Successfully installed neurokit2-0.0.41


<module 'Components.data_fetching' from '../Components/data_fetching.py'>

# Data

In [4]:
# Load the training set through the project-local data_fetching module.
# NOTE(review): further down X is accessed with .iloc[0].dropna(), so it is presumably a
# pandas DataFrame holding one NaN-padded recording per row — confirm in data_fetching.
X, y = data_fetching.get_train_data()
#x_test = data_fetching.get_test_data()

# Feature extraction

In [60]:
def peaks_cleaning(peaks):
    """Align per-wave peak lists so that every index describes one complete heartbeat.

    `peaks` is a sequence of peak-position sequences, one per wave type, each sorted
    ascending and given in the order the waves occur within a beat (the notebook
    passes [p, q, r, s, t]). Peaks that cannot be part of a complete, correctly
    ordered tuple -- e.g. <_, _, r, s, t> where p and q were not detected -- are
    dropped, including when the gap is in the middle of the recording.

    Args:
        peaks: sequence of ascending peak-position sequences. It is NOT modified;
            the cleaning is done on copies.

    Returns:
        A list with one list per wave type, all of equal length. Taking element i
        of every inner list gives one valid heartbeat tuple.
    """
    # Work on copies so the caller's lists are left untouched.
    cleaned = [list(series) for series in peaks]
    if not cleaned:
        return cleaned

    # Median spacing between consecutive peaks of the FIRST wave type. A beat whose
    # first-to-last span exceeds this is considered infeasible (its first peak most
    # likely belongs to a beat whose remaining peaks were not detected).
    # TODO: MAKE SURE THIS IS VALID (the threshold is taken from peaks[0], not from
    # the R peaks).
    if len(cleaned[0]) > 1:
        max_span = np.median(np.diff(cleaned[0]))
    else:
        # No spacing can be estimated, so the span check is effectively disabled.
        max_span = np.inf

    i = 0
    while all(i < len(series) for series in cleaned):
        ran_out = False
        # Walk neighbouring wave types: a later-wave peak that lies before the
        # earlier-wave peak at the same index can never complete this beat, so
        # discard it and look at the next candidate.
        for earlier, later in zip(cleaned, cleaned[1:]):
            while i < len(later) and earlier[i] > later[i]:
                later.pop(i)
            if i >= len(later):
                # No candidate left to complete beat i (previously an IndexError).
                ran_out = True
                break
        if ran_out:
            break

        if cleaned[-1][i] - cleaned[0][i] > max_span:
            # Correctly ordered but too long: drop the first-wave peak and retry
            # the same index with the next one.
            cleaned[0].pop(i)
        else:
            i += 1

    # Everything before index i is a validated beat; anything after is unmatched.
    return [series[:i] for series in cleaned]

In [97]:
def feature_extraction(peaks, sampling_rate=300):
    """Build a flat feature vector from the detected ECG peak positions.

    Args:
        peaks: list of peak-position lists, one per wave type. peaks[2] is used for
            the heart-rate estimate, so it is expected to hold the R peaks (the
            notebook passes [p, q, r, s, t]). The lists are not modified.
        sampling_rate: sampling frequency of the recording in Hz, used to convert
            the mean peak-to-peak distance (in samples) into beats per minute.
            Defaults to 300, which reproduces the previously hard-coded 18000.

    Returns:
        A list of floats laid out as:
          * 1 value: heart rate in bpm;
          * 4 values per wave type: mean/variance of the positions and mean/variance
            of the consecutive differences;
          * 2 values per ordered pair (i, j) of wave types: mean/variance of the
            element-wise position difference after aligning beats with
            peaks_cleaning.
    """
    features = []

    # 60 s/min * samples/s / (mean samples per beat) = beats per minute.
    bpm = 60 * sampling_rate / np.mean(np.diff(peaks[2]))
    features.append(bpm)

    ######################
    # Single-peak measures
        # For the single-peak measures we want all the peaks that were detected,
        # so these are computed before any cleaning.
    # Variability measures: single peak type
    for peak in peaks:
        diff = np.diff(peak)
        features.append(np.mean(peak))
        features.append(np.var(peak))
        features.append(np.mean(diff))
        features.append(np.var(diff))

    #####################
    # Multi-peak measures
        # For the multi-peak measures index i of every wave type must refer to the
        # same heartbeat, so the lists are aligned first. Copies are passed so the
        # caller's lists are not modified by the cleaning step.
    aligned = peaks_cleaning([list(peak) for peak in peaks])

    # Variability measures: multi peak types
    for i in range(len(aligned)):
        for j in range(len(aligned)):
            diff = np.array(aligned[i]) - np.array(aligned[j])
            features.append(np.mean(diff))
            features.append(np.var(diff))

    return features

In [101]:
# First training recording with its trailing NaN padding removed.
ecg_signal = X.iloc[0].dropna()

# Fetch peak arrays: [p,q,r,s,t]
# (previously this passed an undefined/stale `signal` instead of `ecg_signal`)
signals, _ = nk.ecg_process(ecg_signal, sampling_rate=300)
peaks = [list(signals.index[signals[peak_type]==1]) for peak_type in ["ECG_P_Peaks","ECG_Q_Peaks","ECG_R_Peaks","ECG_S_Peaks","ECG_T_Peaks"]]

pprint(peaks)
f = feature_extraction(peaks)

# Number of features produced for one recording.
print(len(f))

[[141, 405, 658, 1476, 1746, 2009, 2267, 2522, 2780, 3026],
 [170, 431, 678, 954, 1498, 1761, 2031, 2291, 2554, 2803, 3046],
 [183, 446, 706, 969, 1237, 1517, 1783, 2050, 2306, 2566, 2817, 3068],
 [209, 465, 746, 998, 1257, 1536, 1809, 2069, 2326, 2583, 2839, 3087],
 [288, 502, 803, 1084, 1289, 1572, 1852, 2109, 2685, 2872, 3124]]
71


# Testing

In [102]:
import copy as cp

# Baseline peak positions, one row per wave type in the order [p, q, r, s, t].
test = [
    [141, 405, 658, 1476, 1746, 2009, 2267, 2522, 2780, 3026],
    [170, 431, 678, 954, 1498, 1761, 2031, 2291, 2554, 2803, 3046],
    [183, 446, 706, 969, 1237, 1517, 1783, 2050, 2306, 2566, 2817, 3068],
    [209, 465, 746, 998, 1257, 1536, 1809, 2069, 2326, 2583, 2839, 3087],
    [288, 502, 803, 1084, 1289, 1572, 1852, 2109, 2685, 2872, 3124],
]

# Scenario 1: the first peak of the first row is missing.
print("Test 1:")
t1 = cp.deepcopy(test)
t1[0][0] = None  # placeholder so the printout shows which entry is removed
pprint(t1)
del t1[0][0]
pprint(peaks_cleaning(t1))

# Scenario 2: a peak in the middle of the third row is missing.
print("\nTest 2:")
t2 = cp.deepcopy(test)
t2[2][5] = None  # placeholder so the printout shows which entry is removed
pprint(t2)
del t2[2][5]
pk2 = peaks_cleaning(t2)
pprint(pk2)

# To inspect the first-to-last span of every cleaned beat:
# print(np.array(pk2[-1])-np.array(pk2[0]))

Test 1:
[[None, 405, 658, 1476, 1746, 2009, 2267, 2522, 2780, 3026],
 [170, 431, 678, 954, 1498, 1761, 2031, 2291, 2554, 2803, 3046],
 [183, 446, 706, 969, 1237, 1517, 1783, 2050, 2306, 2566, 2817, 3068],
 [209, 465, 746, 998, 1257, 1536, 1809, 2069, 2326, 2583, 2839, 3087],
 [288, 502, 803, 1084, 1289, 1572, 1852, 2109, 2685, 2872, 3124]]
[[405, 658, 1476, 1746, 2009, 2522, 2780, 3026],
 [431, 678, 1498, 1761, 2031, 2554, 2803, 3046],
 [446, 706, 1517, 1783, 2050, 2566, 2817, 3068],
 [465, 746, 1536, 1809, 2069, 2583, 2839, 3087],
 [502, 803, 1572, 1852, 2109, 2685, 2872, 3124]]

Test 2:
[[141, 405, 658, 1476, 1746, 2009, 2267, 2522, 2780, 3026],
 [170, 431, 678, 954, 1498, 1761, 2031, 2291, 2554, 2803, 3046],
 [183, 446, 706, 969, 1237, None, 1783, 2050, 2306, 2566, 2817, 3068],
 [209, 465, 746, 998, 1257, 1536, 1809, 2069, 2326, 2583, 2839, 3087],
 [288, 502, 803, 1084, 1289, 1572, 1852, 2109, 2685, 2872, 3124]]
[[141, 405, 658, 1746, 2009, 2522, 2780, 3026],
 [170, 431, 678, 1761, 