In [42]:
import sys
sys.path.insert(0, '/Users/matthewashman/github/MasterProject2018')

# Import necessary modules. Set settings. Import data.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import pywt
import math
from IPython.display import HTML

# For model building
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, recall_score, make_scorer
from sklearn import svm, naive_bayes, neighbors, gaussian_process
from sklearn.linear_model import LogisticRegression
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from scipy.spatial.distance import euclidean
from sklearn.decomposition import PCA

# For feature extraction
from scipy.interpolate import CubicSpline      # for warping
from statsmodels.robust import mad
from tsfresh.feature_extraction import feature_calculators
from FeatureExtraction.feature_tools import detect_peaks
from sklearn.utils import resample
import fastdtw

# Miscellaneous
from IPython.display import display, clear_output
import pdb

plt.style.use('default')

# Directory holding the pickled data splits read by this notebook.
# NOTE(review): absolute, machine-specific path -- change it here when running elsewhere.
DATA_DIR = '/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report'


def _load_split(name):
    """Return the object stored in ``<DATA_DIR>/<name>.pkl``."""
    # pd.read_pickle unpickles arbitrary objects, so only point this at trusted files.
    return pd.read_pickle(DATA_DIR + '/' + name + '.pkl')


# Original splits.
training_data = _load_split('training_data')
validation_data = _load_split('validation_data')
test_data = _load_split('test_data')

# Augmented training sets 1-4.
augmented_training_data_01 = _load_split('augmented_training_data_1')
augmented_training_data_02 = _load_split('augmented_training_data_2')
augmented_training_data_03 = _load_split('augmented_training_data_3')
augmented_training_data_04 = _load_split('augmented_training_data_4')

# Augmented validation sets 1-4.
augmented_validation_data_01 = _load_split('augmented_validation_data_1')
augmented_validation_data_02 = _load_split('augmented_validation_data_2')
augmented_validation_data_03 = _load_split('augmented_validation_data_3')
augmented_validation_data_04 = _load_split('augmented_validation_data_4')

# Only augmentation set 3 is loaded for the test split.
augmented_test_data_03 = _load_split('augmented_test_data_3')

In [44]:
# Show which patients have rows of Type 'af' in the augmented test set.
is_af_row = augmented_test_data_03['Type'] == 'af'
print(augmented_test_data_03.loc[is_af_row, 'Patient'].unique())

['12' '13' '14' '15' '17' '18' '19']


In [45]:
def _rows_for_stimulus(frame, stimulus):
    """Return the rows of ``frame`` whose 'S1/S2' column equals ``stimulus``."""
    return frame[frame['S1/S2'] == stimulus]


# Original splits.
S2_training_data = _rows_for_stimulus(training_data, 'S2')
S1_training_data = _rows_for_stimulus(training_data, 'S1')
S2_validation_data = _rows_for_stimulus(validation_data, 'S2')
S1_validation_data = _rows_for_stimulus(validation_data, 'S1')
S2_test_data = _rows_for_stimulus(test_data, 'S2')
S1_test_data = _rows_for_stimulus(test_data, 'S1')

# Augmented training sets.
S2_augmented_training_data_01 = _rows_for_stimulus(augmented_training_data_01, 'S2')
S1_augmented_training_data_01 = _rows_for_stimulus(augmented_training_data_01, 'S1')
S2_augmented_training_data_02 = _rows_for_stimulus(augmented_training_data_02, 'S2')
S1_augmented_training_data_02 = _rows_for_stimulus(augmented_training_data_02, 'S1')
S2_augmented_training_data_03 = _rows_for_stimulus(augmented_training_data_03, 'S2')
S1_augmented_training_data_03 = _rows_for_stimulus(augmented_training_data_03, 'S1')
S2_augmented_training_data_04 = _rows_for_stimulus(augmented_training_data_04, 'S2')
S1_augmented_training_data_04 = _rows_for_stimulus(augmented_training_data_04, 'S1')

# Augmented validation sets.
S2_augmented_validation_data_01 = _rows_for_stimulus(augmented_validation_data_01, 'S2')
S1_augmented_validation_data_01 = _rows_for_stimulus(augmented_validation_data_01, 'S1')
S2_augmented_validation_data_02 = _rows_for_stimulus(augmented_validation_data_02, 'S2')
S1_augmented_validation_data_02 = _rows_for_stimulus(augmented_validation_data_02, 'S1')
S2_augmented_validation_data_03 = _rows_for_stimulus(augmented_validation_data_03, 'S2')
S1_augmented_validation_data_03 = _rows_for_stimulus(augmented_validation_data_03, 'S1')
S2_augmented_validation_data_04 = _rows_for_stimulus(augmented_validation_data_04, 'S2')
S1_augmented_validation_data_04 = _rows_for_stimulus(augmented_validation_data_04, 'S1')

# Augmented test set.
S2_augmented_test_data_03 = _rows_for_stimulus(augmented_test_data_03, 'S2')
S1_augmented_test_data_03 = _rows_for_stimulus(augmented_test_data_03, 'S1')

# Pooled (train + validation + test) frames that get_typical_response searches.
S2_data = pd.concat([S2_training_data, S2_validation_data, S2_test_data], axis=0, ignore_index=True)
# Fixed: S1_data used to be concatenated from the S2 frames, which made it a copy of
# S2_data. NOTE(review): features extracted before this fix used S2 rows as the
# reference; re-run the extraction cells and re-check the hand-picked .iloc positions.
S1_data = pd.concat([S1_training_data, S1_validation_data, S1_test_data], axis=0, ignore_index=True)

## Extracting Features

In [96]:
X_train_feature_list = []

for i, row in S2_training_data.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Training Features: ' + str(round(100*i/S2_training_data.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_train_feature_list.append(feature_dict)

'Extracting Training Features: 100.0%'

In [103]:
# One row per extracted feature dictionary; cached on disk for later use.
X_train = pd.DataFrame(data=X_train_feature_list)
X_train.to_pickle(path='/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_train.pkl')

In [97]:
X_validation_feature_list = []

for i, row in S2_validation_data.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Validation Features: ' + str(round(100*i/S2_validation_data.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    # A failure here now propagates. The previous bare `except:` dropped into pdb and
    # then carried on with `typical_response` undefined or left over from the prior row.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_validation_feature_list.append(feature_dict)

'Extracting Validation Features: 100.0%'

In [104]:
# One row per extracted feature dictionary; cached on disk for later use.
X_validation = pd.DataFrame(data=X_validation_feature_list)
X_validation.to_pickle(path='/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_validation.pkl')

In [98]:
X_train_aug_01_feature_list = []

for i, row in S2_augmented_training_data_01.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Augmented Training Features: ' + str(round(100*i/S2_augmented_training_data_01.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_train_aug_01_feature_list.append(feature_dict)

'Extracting Augmented Training Features: 100.0%'

In [105]:
# Copy of the original training features with an 'Augmented' column set to 0.
X_train_a = X_train.assign(Augmented=0)

In [106]:
# Augmented-set-1 features followed by the original rows (X_train_a), re-indexed from 0.
X_augmented_01 = pd.concat(
    [pd.DataFrame(X_train_aug_01_feature_list), X_train_a],
    axis=0,
    ignore_index=True,
)
X_augmented_01.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_01.pkl')

In [99]:
X_train_aug_02_feature_list = []

for i, row in S2_augmented_training_data_02.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Augmented Training Features: ' + str(round(100*i/S2_augmented_training_data_02.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_train_aug_02_feature_list.append(feature_dict)

'Extracting Augmented Training Features: 100.0%'

In [107]:
# Augmented-set-2 features followed by the original rows (X_train_a), re-indexed from 0.
X_augmented_02 = pd.concat(
    [pd.DataFrame(X_train_aug_02_feature_list), X_train_a],
    axis=0,
    ignore_index=True,
)
X_augmented_02.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_02.pkl')

In [100]:
X_train_aug_03_feature_list = []

for i, row in S2_augmented_training_data_03.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Augmented Training Features: ' + str(round(100*i/S2_augmented_training_data_03.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_train_aug_03_feature_list.append(feature_dict)

'Extracting Augmented Training Features: 100.0%'

In [108]:
# Augmented-set-3 features followed by the original rows (X_train_a), re-indexed from 0.
X_augmented_03 = pd.concat(
    [pd.DataFrame(X_train_aug_03_feature_list), X_train_a],
    axis=0,
    ignore_index=True,
)
X_augmented_03.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_03.pkl')

In [101]:
X_train_aug_04_feature_list = []

for i, row in S2_augmented_training_data_04.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Augmented Training Features: ' + str(round(100*i/S2_augmented_training_data_04.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_train_aug_04_feature_list.append(feature_dict)

'Extracting Augmented Training Features: 100.0%'

In [109]:
# Augmented-set-4 features followed by the original rows (X_train_a), re-indexed from 0.
X_augmented_04 = pd.concat(
    [pd.DataFrame(X_train_aug_04_feature_list), X_train_a],
    axis=0,
    ignore_index=True,
)
X_augmented_04.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_04.pkl')

In [113]:
X_val_aug_03_feature_list = []

for i, row in S2_augmented_validation_data_03.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Augmented Validation Features: ' + str(round(100*i/S2_augmented_validation_data_03.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_val_aug_03_feature_list.append(feature_dict)

'Extracting Augmented Validation Features: 100.0%'

In [114]:
# Copy of the original validation features with an 'Augmented' column set to 0.
X_validation_a = X_validation.assign(Augmented=0)

In [115]:
# Augmented-set-3 validation features followed by the original rows (X_validation_a), re-indexed from 0.
X_validation_augmented_03 = pd.concat(
    [pd.DataFrame(X_val_aug_03_feature_list), X_validation_a],
    axis=0,
    ignore_index=True,
)
X_validation_augmented_03.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_validation_augmented_03.pkl')

In [30]:
X_test_feature_list = []

for i, row in S2_test_data.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Test Features: ' + str(round(100*i/S2_test_data.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_test_feature_list.append(feature_dict)

'Extracting Test Features: 100.0%'

In [31]:
# One row per extracted feature dictionary; cached on disk for later use.
X_test = pd.DataFrame(data=X_test_feature_list)
X_test.to_pickle(path='/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_test.pkl')

In [46]:
X_test_aug_03_feature_list = []

for i, row in S2_augmented_test_data_03.iterrows():
    # Progress readout: i is the row's index label, compared against the last label.
    clear_output(wait=True)
    display('Extracting Augmented Test Features: ' + str(round(100*i/S2_augmented_test_data_03.index[-1],3)) + '%')

    # Reference response for this row; get_typical_response also holds the
    # hand-picked per-patient/channel exceptions.
    typical_response = get_typical_response(row)

    # Normalise both signals by the peak absolute amplitude of the reference (computed once).
    reference_peak = max(abs(typical_response['Data']))
    s1_response = typical_response['Data']/reference_peak
    s2_response = row['Data']/reference_peak

    ref_feature_dict = get_feature_dict(s1_response)
    feature_dict = get_feature_dict(s2_response)

    # For every feature also store its difference from the reference under '<name> 2'.
    for k, v in list(feature_dict.items()):
        feature_dict[k + ' 2'] = v - ref_feature_dict[k]

    # The first element of the fastdtw.dtw result is stored as the DTW distance.
    feature_dict['DTW Distance'] = fastdtw.dtw(s2_response, s1_response)[0]

    # Carry over the row's own columns. Series.items() replaces Series.iteritems(),
    # which was removed in pandas 2.0.
    for col, value in row.items():
        feature_dict[col] = value

    X_test_aug_03_feature_list.append(feature_dict)

'Extracting Augmented Test Features: 100.0%'

In [47]:
# Copy of the original test features with an 'Augmented' column set to 0.
X_test_a = X_test.assign(Augmented=0)

In [48]:
# Augmented-set-3 test features followed by the original rows (X_test_a), re-indexed from 0.
X_test_augmented_03 = pd.concat(
    [pd.DataFrame(X_test_aug_03_feature_list), X_test_a],
    axis=0,
    ignore_index=True,
)
X_test_augmented_03.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_test_augmented_03.pkl')

In [18]:
# Hand-picked reference choices keyed by (Type + Patient, Channel).
# Each value is (pool, rank): pool 'S1'/'S2' selects S1_data/S2_data, and rank is the
# position taken after sorting the matching rows by 'Coupling Interval', largest first.
_CHANNEL_REFERENCE_OVERRIDES = {
    ('af8', 'CS5-6'): ('S1', 2),
    ('af18', 'CS5-6'): ('S2', 0),
    ('avrt14', 'CS5-6'): ('S2', 0),
    ('avrt14', 'CS3-4'): ('S2', 0),
    ('avnrt26', 'CS1-2'): ('S1', 1),
    ('avnrt31', 'CS1-2'): ('S2', 0),
    ('avnrt33', 'CS3-4'): ('S1', 2),
    ('at1', 'CS1-2'): ('S1', 4),
    ('at3', 'CS3-4'): ('S2', 0),
    ('avnrt10', 'CS1-2'): ('S1', 1),
    ('avnrt15', 'CS3-4'): ('S2', 0),
    ('avnrt16', 'CS5-6'): ('S2', 0),
    ('avnrt1', 'CS1-2'): ('S2', 1),
    ('avrt13', 'CS1-2'): ('S2', 0),
    ('ep6', 'CS5-6'): ('S1', 1),
    ('ep3', 'CS3-4'): ('S2', 0),
    ('af9', 'CS5-6'): ('S1', 1),
}

# Choices that apply to every channel of a patient not listed in the table above.
_PATIENT_REFERENCE_OVERRIDES = {
    'avrt18': ('S1', 2),
    'avnrt26': ('S1', 3),
    'avnrt27': ('S2', 0),
    'avrt13': ('S2', 0),
    'af14': ('S2', 0),
    'af15': ('S2', 0),
}


def get_typical_response(row):
    """Return the reference ("typical") response row for ``row``.

    The rows of S1_data or S2_data with the same 'Type', 'Patient' and 'Channel'
    as ``row`` are sorted by 'Coupling Interval' in descending order and one of
    them is returned by position. The default is position 0 of S1_data; the
    override tables above select a different pool and/or position for specific
    patients and channels.

    Raises IndexError (from ``.iloc``) when the requested position does not exist.
    """
    patient_key = row['Type'] + row['Patient']

    # Channel-specific choice first, then patient-wide choice, then the default.
    pool_name, rank = _CHANNEL_REFERENCE_OVERRIDES.get(
        (patient_key, row['Channel']),
        _PATIENT_REFERENCE_OVERRIDES.get(patient_key, ('S1', 0)),
    )

    # The pooled frames are looked up at call time, so they only need to exist by then.
    pool = S1_data if pool_name == 'S1' else S2_data

    same_recording = ((pool['Type'] == row['Type']) &
                      (pool['Patient'] == row['Patient']) &
                      (pool['Channel'] == row['Channel']))
    ranked = pool[same_recording].sort_values(by=['Coupling Interval'], ascending=False)
    return ranked.iloc[rank]

In [7]:
# A crude conduction delay detector
def get_delay(x, amp_thresh=None, set_thresh=False):
    """Return the index of the first sample whose magnitude exceeds a threshold.

    By default the threshold is half of the largest magnitude in ``x``. When
    ``set_thresh`` equals True, ``amp_thresh`` is used instead and ``len(x)`` is
    returned if no sample exceeds it.
    """
    magnitude = abs(x)

    # The `== True` comparison is kept as-is, so only True/1 select the explicit threshold.
    if not (set_thresh == True):
        return np.argmax(magnitude > (max(magnitude)/2))

    exceeds = magnitude > amp_thresh
    if any(exceeds):
        return np.argmax(exceeds)
    return len(x)
    
def denoise(x):
    """Denoise a 1-D signal by hard-thresholding its wavelet detail coefficients.

    The signal is decomposed with the 'db7' wavelet (mode 'per'). Every
    coefficient array except the first has values with magnitude below the
    threshold ``sigma*sqrt(2*ln(len(x)))`` set to zero, and the signal is
    rebuilt from the result. ``sigma`` is ``mad`` of the last coefficient array.

    Returns the reconstructed signal from ``pywt.waverec``.
    """
    # Wavelet decomposition ('db7'; the old comment said N=6, the code has always used db7).
    waveletCoefs = pywt.wavedec(x, 'db7', mode='per')

    # Noise scale estimated from the last coefficient array.
    sigma = mad(waveletCoefs[-1])
    uThresh = 1*sigma*np.sqrt(2*np.log(len(x)))

    # Keep the first array untouched and hard-threshold the rest. pywt.threshold is the
    # public entry point for what was previously reached through pywt._thresholding.hard.
    denoised = [waveletCoefs[0]]
    denoised.extend(pywt.threshold(coefs, value=uThresh, mode='hard') for coefs in waveletCoefs[1:])

    # Reconstruct the signal from the thresholded coefficients.
    return pywt.waverec(denoised, 'db7', mode='per')

def get_peaks(x, height_thresh, scale_amp=None, set_scale=False, plot = False):
    """Find positive and negative peaks of ``x`` using a denoised copy of the signal.

    Parameters
    ----------
    x : array-like
        Signal segment; converted with ``np.array``.
    height_thresh : float
        Relative height threshold. It is multiplied by ``scale_amp`` when
        ``set_scale`` is truthy, otherwise by ``max(abs(x))``.
    scale_amp : float, optional
        Amplitude used to scale ``height_thresh`` when ``set_scale`` is truthy.
    set_scale : bool
        Selects which amplitude scales ``height_thresh`` (see above).
    plot : bool
        When truthy, plot the signal, its denoised version and the peaks, then
        block until a key or button press before closing the figure.

    Returns
    -------
    (peak_idx, peak_amp)
        Indices of the retained peaks and the values of ``x`` at those indices.
        When nothing is detected ``peak_idx`` is an empty array and ``peak_amp``
        an empty list.
    """
    x = np.array(x)

    # Turn the relative threshold into an absolute one.
    if set_scale:
        height_thresh = height_thresh*scale_amp
    else:
        height_thresh = height_thresh*max(abs(x))

    # Peaks are searched on the denoised signal.
    xdn = denoise(x)

    # detect_peaks is run on the signal and on its negation to get both polarities
    # (mph is presumably the minimum peak height -- confirm against feature_tools).
    pos_peak_idx = detect_peaks(xdn, mph=height_thresh, threshold = 0)
    neg_peak_idx = detect_peaks((-xdn), mph=height_thresh, threshold = 0)
    peak_idx = np.concatenate([pos_peak_idx, neg_peak_idx])
    peak_idx = np.sort(peak_idx)
    # Drop detections that fall on the first or last sample.
    peak_idx = peak_idx[(peak_idx != 0) & (peak_idx != (len(xdn)-1))]

    new_peak_idx = []
    peak_amp = []
    if (len(peak_idx) > 0):
        # The first peak is always kept. A following peak is kept only when the raw
        # signal differs from the sample midway between it and its predecessor by
        # more than 20% of max|x| (measured from either of the two peaks).
        new_peak_idx.append(peak_idx[0])
        mp_thresh = 0.2*max(abs(x))
        for i in range(len(peak_idx)-1):
            idx = peak_idx[i]
            idx_next = peak_idx[i+1]
            mid_point = int((idx_next+idx)/2)
            if (max([abs(x[idx_next]-x[mid_point]), abs(x[idx]-x[mid_point])]) > mp_thresh):
                new_peak_idx.append(idx_next)

        peak_idx = np.array(new_peak_idx)
        peak_amp = x[peak_idx]

    if plot:
        # Fixed: with nrows=ncols=1 plt.subplots returns a single Axes, so the old
        # `fig, [ax1] = ...` unpacking raised TypeError whenever plot=True.
        fig, ax1 = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(8,8))
        ax1.plot(x, 'b' , xdn, 'r--', peak_idx, peak_amp, 'kx')
        ax1.set_xlabel('Sample')
        ax1.set_ylabel('Normalised amplitude')
        ax1.legend(['Original segment', 'Denoised segment', 'Detected peaks'])

        plt.draw()
        plt.waitforbuttonpress(0) # timeout 0: wait indefinitely
        plt.close(fig)

    return peak_idx, peak_amp

def sample_entropy(U, m, r):
    """Return ``-log(C(m+1)/C(m))`` for the signal ``U`` scaled to unit peak magnitude.

    ``C(k)`` is the number of ordered pairs of distinct template rows whose largest
    element-wise absolute difference is at most ``r``. Templates for ``C(k)`` are
    the length ``k-1`` windows of the scaled signal starting at samples
    ``0 .. N-k``, stored in an ``N``-row table.

    NOTE(review): the last ``k-1`` rows of that table are never filled, stay zero
    and still take part in the count; and the templates have length ``k-1`` rather
    than ``k``. Both are kept unchanged here -- confirm they are intended.

    Raises ZeroDivisionError when ``C(m)`` is 0.
    """
    U = U/max(abs(U))
    N = len(U)

    def _count_close_pairs(k):
        templates = np.zeros([N, k-1])
        for start in range(N-k+1):
            templates[start, :] = U[start:start+k-1]

        pair_count = 0
        for i in range(N):
            # Largest absolute difference between row i and every row.
            within_r = np.abs(templates - templates[i]).max(axis=1) <= r
            within_r[i] = False  # a row is never paired with itself
            pair_count += int(within_r.sum())
        return pair_count

    return -np.log(_count_close_pairs(m+1)/_count_close_pairs(m))

def percentage_fractionation(x, peak_idxs, thresh=0.01, sr=1000):
    """Return the share of the signal length covered by closely spaced peaks, in percent.

    Gaps between consecutive entries of ``peak_idxs`` that are shorter than
    ``thresh*sr`` samples are summed and divided by ``len(x)``.
    (``thresh`` is presumably in seconds and ``sr`` in Hz -- confirm.)
    """
    gaps = np.diff(peak_idxs)
    max_gap = thresh*sr
    short_gap_total = np.sum(gaps[gaps < max_gap])
    return (short_gap_total/len(x))*100

def get_local_sample_entropy(x, centre_idx, width, m=2, r=0.05):
    """Return ``sample_entropy`` of a window of ``x`` around ``centre_idx``.

    An even ``width`` is increased by one. Near the start of ``x`` the window is
    the first ``width+1`` samples, near the end it is the last ``width+1``
    samples, and otherwise it is ``width`` samples centred on ``centre_idx``.
    """
    if width % 2 == 0:
        width += 1
    half_span = (width-1)/2

    if centre_idx < half_span:
        window = x[:width+1]
    elif centre_idx > (len(x)-1-half_span):
        window = x[len(x)-1-width:]
    else:
        window = x[int(centre_idx-half_span):int(centre_idx+(width+1)/2)]

    return sample_entropy(window, m, r)
    
def get_location_of_max_energy(x, M=14):
    """Return the argmax of ``abs(x)`` convolved with a length-``M`` box, plus ``M // 2``.

    NOTE(review): ``np.convolve`` is used in its default 'full' mode, where output
    index n sums samples n-M+1..n, so adding ``M // 2`` moves the index further
    from the window centre -- confirm the offset direction is intended.
    """
    box = np.ones(M)
    windowed_sum = np.convolve(abs(x), box)
    return np.argmax(windowed_sum) + M//2
        
def get_local_peaks(x, centre_idx, width=25, height_thresh=0.1):
    """Return ``get_peaks`` for a window of ``x`` around ``centre_idx``.

    An even ``width`` is increased by one. Near the start of ``x`` the window is
    the first ``width+1`` samples, near the end it is the last ``width+1``
    samples, and otherwise it is ``width`` samples centred on ``centre_idx``.
    """
    if width % 2 == 0:
        width += 1
    half_span = (width-1)/2

    if centre_idx < half_span:
        window = x[:width+1]
    elif centre_idx > (len(x)-1-half_span):
        window = x[len(x)-1-width:]
    else:
        window = x[int(centre_idx-half_span):int(centre_idx+(width+1)/2)]

    return get_peaks(window, height_thresh)
    
def get_pse(x):
    """Return the power spectral entropy of ``x`` in bits.

    The squared magnitudes of ``np.fft.rfft(x)`` are normalised to sum to one and
    treated as a probability distribution ``p``; the result is
    ``-sum(p * log2(p))``. Bins with zero power contribute nothing (the usual
    0*log(0) = 0 convention); previously they turned the whole result into NaN.

    Returns NaN when the signal has no spectral power at all (e.g. all zeros).
    """
    spectrum = np.fft.rfft(x)
    power = (1/len(spectrum))*np.absolute(spectrum)**2

    total_power = np.sum(power)
    if total_power == 0:
        # The distribution is undefined; keep the NaN the original produced here.
        return np.nan

    probabilities = power/total_power
    occupied = probabilities[probabilities > 0]
    return -np.sum(occupied*np.log2(occupied))

def get_local_pse(x, centre_idx, width=50):
    """Return ``get_pse`` for a window of ``x`` around ``centre_idx``.

    An even ``width`` is increased by one. Near the start of ``x`` the window is
    the first ``width+1`` samples, near the end it is the last ``width+1``
    samples, and otherwise it is ``width`` samples centred on ``centre_idx``.
    """
    if width % 2 == 0:
        width += 1
    half_span = (width-1)/2

    if centre_idx < half_span:
        window = x[:width+1]
    elif centre_idx > (len(x)-1-half_span):
        window = x[len(x)-1-width:]
    else:
        window = x[int(centre_idx-half_span):int(centre_idx+(width+1)/2)]

    return get_pse(window)
    
def get_spectral_centroid(x):
    """Return the magnitude-weighted mean rfft bin index of ``x``.

    The result is expressed in bin numbers (0, 1, 2, ...), not in Hz.
    """
    magnitude = np.absolute(np.fft.rfft(x))
    weights = magnitude/sum(magnitude)
    bin_numbers = np.arange(0, len(magnitude), 1)
    return sum(bin_numbers*weights)

def get_local_spectral_centroid(x, centre_idx, width=50):
    """Return ``get_spectral_centroid`` for a window of ``x`` around ``centre_idx``.

    An even ``width`` is increased by one. Near the start of ``x`` the window is
    the first ``width+1`` samples, near the end it is the last ``width+1``
    samples, and otherwise it is ``width`` samples centred on ``centre_idx``.
    """
    if width % 2 == 0:
        width += 1
    half_span = (width-1)/2

    if centre_idx < half_span:
        window = x[:width+1]
    elif centre_idx > (len(x)-1-half_span):
        window = x[len(x)-1-width:]
    else:
        window = x[int(centre_idx-half_span):int(centre_idx+(width+1)/2)]

    return get_spectral_centroid(window)
    
def get_local_energy(x, centre_idx, width=60):
    """Return the sum of squared samples in a window of ``x`` around ``centre_idx``.

    An even ``width`` is increased by one. Near the start of ``x`` the window is
    the first ``width+1`` samples, near the end it is the last ``width+1``
    samples, and otherwise it is ``width`` samples centred on ``centre_idx``.
    """
    if width % 2 == 0:
        width += 1
    half_span = (width-1)/2

    if centre_idx < half_span:
        window = x[:width+1]
    elif centre_idx > (len(x)-1-half_span):
        window = x[len(x)-1-width:]
    else:
        window = x[int(centre_idx-half_span):int(centre_idx+(width+1)/2)]

    return np.sum(window**2)
    
def get_width_max_energy(x, M=14, width_thresh=0.2):
    """Return the width, in samples, of the main lobe of the box-filtered ``abs(x)``.

    ``abs(x)`` is convolved with a length-``M`` box. Starting at the maximum, the
    first sample on each side that falls below ``width_thresh`` times the maximum
    marks the edge; a side without such a sample extends to the end (or start) of
    the convolved signal. The result is ``end - start``.
    """
    windowed_sum = np.convolve(abs(x), np.ones(M))
    peak = np.argmax(windowed_sum)
    below_level = windowed_sum < width_thresh*np.max(windowed_sum)

    # Walk right from the peak.
    to_the_right = below_level[peak:]
    if any(to_the_right):
        end_idx = peak + np.argmax(to_the_right)
    else:
        end_idx = len(windowed_sum)-1

    # Walk left from the peak (reversed view, peak first).
    to_the_left = below_level[peak::-1]
    if any(to_the_left):
        start_idx = peak - np.argmax(to_the_left)
    else:
        start_idx = 0

    return (end_idx - start_idx)

In [8]:
def get_feature_dict(x, col_prefix=''):
    """Return a dict of features for the signal ``x``, each key prefixed with ``col_prefix``.

    Keys, in insertion order: 'Number of Peaks', 'Percentage Fractionation',
    'Location of Maximum Energy', 'Sample Entropy Around Max Energy',
    'Width of Maximum Energy', 'Ratio Above 1xSTD', 'Mean Absolute Value'.
    """
    height_thresh = 0.1

    # Hand-engineered features built on the detected peaks.
    peak_idx, _ = get_peaks(x, height_thresh)
    fractionation = percentage_fractionation(x, peak_idx, thresh=0.01)

    # Features taken around the location of maximum energy.
    max_energy_idx = get_location_of_max_energy(x)
    local_entropy = get_local_sample_entropy(x, max_energy_idx, 30, m=3, r=0.15)
    energy_width = get_width_max_energy(x, M=14, width_thresh=0.2)

    # Temporal features.
    ratio_beyond_1_sigma = feature_calculators.ratio_beyond_r_sigma(x, 1)
    mean_abs = np.mean(abs(x)/max(abs(x)))

    return {
        col_prefix + 'Number of Peaks': len(peak_idx),
        col_prefix + 'Percentage Fractionation': fractionation,
        col_prefix + 'Location of Maximum Energy': max_energy_idx,
        col_prefix + 'Sample Entropy Around Max Energy': local_entropy,
        col_prefix + 'Width of Maximum Energy': energy_width,
        col_prefix + 'Ratio Above 1xSTD': ratio_beyond_1_sigma,
        col_prefix + 'Mean Absolute Value': mean_abs,
    }