In [3]:
# General
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


# ML

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectFromModel
from sklearn.svm import LinearSVC

# feature extraction
import biosppy.signals.ecg as ecg
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import feature_extraction as tsfe
from tsfresh import extract_relevant_features


# Custom
import sys,os
sys.path.append( '.' )
sys.path.append( '..' )
import Components.Outlier_Detection as Outlier_Detection
import Components.Feature_Selection as Feature_Selection
import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching
import Components.feature_extraction as feature_extraction

# CAREFUL:
# Edits made to one of the custom modules are only picked up after the module
# has been reloaded, i.e. re-run this cell whenever a custom module changes.
import importlib
for custom_module in (
    Outlier_Detection,
    Feature_Selection,
    Normalisation,
    data_fetching,
    feature_extraction,
):
    importlib.reload(custom_module)

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


## Data Preprocessing

### Data Input

In [4]:
# Fetch the training data as an (X, y) pair and the test data via the project's
# data_fetching helpers.
# NOTE(review): later cells iterate X and x_test with .iterrows() and call
# .dropna() on each row, so both are presumably DataFrames holding one
# NaN-padded recording per row; y is presumably the label vector — confirm
# against Components.data_fetching.
X, y = data_fetching.get_train_data()
x_test = data_fetching.get_test_data()


In [5]:
def extract_heartbeat_features(signals, sampling_rate=300.0, n_templates=20):
    """Build one heartbeat feature vector per recording.

    Every row of ``signals`` is treated as one ECG recording. NaN entries are
    dropped from the row before it is handed to ``biosppy.signals.ecg.ecg``,
    from which the R-peak positions and the extracted heartbeat templates are
    used.

    The feature vector of a recording is the concatenation of

    1. the mean heartbeat template (mean over all templates, per sample),
    2. the per-sample variance across ``n_templates`` templates picked at
       evenly spaced positions among all templates of the recording,
    3. the mean and the variance of the distances between consecutive R-peaks.

    Assuming the templates form a 2-D array (beats x samples) with ``T``
    samples per beat, each vector therefore has ``2 * T + 2`` entries.

    Parameters
    ----------
    signals : pandas.DataFrame
        One recording per row.
    sampling_rate : float, optional
        Sampling frequency passed on to biosppy (default 300.0).
    n_templates : int, optional
        Number of evenly spaced templates used for the variance features
        (default 20). If a recording has fewer templates than this, some of
        them are picked more than once.

    Returns
    -------
    pandas.DataFrame
        One row per recording, in the iteration order of ``signals``, with a
        default integer index and integer column labels.
    """
    feature_rows = []
    for _, row in signals.iterrows():
        _, _, peaks, _, templates, _, _ = ecg.ecg(
            signal=row.dropna(), sampling_rate=sampling_rate, show=False
        )

        # One averaged heartbeat template for the whole recording.
        average = np.mean(templates, axis=0)

        # Evenly spaced row indices into `templates`; rounding can yield
        # repeated indices when there are fewer than `n_templates` beats.
        picked = np.round(np.linspace(0, len(templates) - 1, n_templates)).astype(int)
        variances = np.var(templates[picked], axis=0)

        # Spacing between consecutive R-peaks, in whatever unit biosppy uses
        # for the peak positions (presumably sample indices — TODO confirm).
        peaks_distances = np.diff(peaks)
        rhythm_stats = [np.mean(peaks_distances), np.var(peaks_distances)]

        feature_rows.append(np.concatenate([average, variances, rhythm_stats]))
    return pd.DataFrame(feature_rows)


X_train_new = extract_heartbeat_features(X)

Store the extracted training features.

In [6]:
# Write the training feature table to CSV; the DataFrame index is written as
# the first column under the header 'id'.
X_train_new.to_csv('../../Data/heartbeat_feat_train.csv', index_label='id')

Calculate the same features for the test data and store them.

In [7]:
# One feature vector per test recording, laid out as
# [mean template | per-sample variance over 20 evenly spaced templates |
#  mean R-peak distance, variance of R-peak distances].
test_feature_rows = []
for _, recording in x_test.iterrows():
    # NaN entries of the row are dropped before the signal is processed.
    _, _, rpeaks, _, beats, _, _ = ecg.ecg(
        signal=recording.dropna(),
        sampling_rate=300.0,
        show=False,
    )

    # Distances between consecutive R-peaks and their summary statistics.
    rr_distances = np.diff(rpeaks)
    rr_summary = [np.mean(rr_distances), np.var(rr_distances)]

    # Variance at every sample position across 20 templates taken at evenly
    # spaced positions among all templates of this recording.
    picked_rows = np.round(np.linspace(0, len(beats) - 1, 20)).astype(int)
    beat_variance = np.var(beats[picked_rows], axis=0)

    # A single averaged heartbeat template for the recording.
    mean_beat = np.mean(beats, axis=0)

    test_feature_rows.append(np.concatenate([mean_beat, beat_variance, rr_summary]))
X_test_new = pd.DataFrame(test_feature_rows)

In [8]:
# Write the test feature table to CSV; the DataFrame index is written as the
# first column under the header 'id'.
X_test_new.to_csv('../../Data/heartbeat_feat_test.csv', index_label='id')